def _plot_selected_genes_heatmap(data2plot, fig_width, row_labels, row_title,
                                 title, vmin, vmax, cmap, function_dict,
                                 n_colors, out_path, img_ext):
    """Draw one heatmap of the selected genes, then save it or show it.

    Parameters
    ----------
    data2plot : pandas.DataFrame
        Values to plot; rows are labelled with ``row_labels``.
    fig_width : int
        Figure width; the height is derived from the number of rows.
    row_labels, row_title
        Passed to seaborn as ``yticklabels`` and used as the y-axis label.
    title : str
        Figure title.
    vmin, vmax, cmap
        Forwarded unchanged to ``sns.heatmap``.
    function_dict, n_colors
        Forwarded to ``set_cbar_ticks``; the call is skipped when
        ``n_colors`` is None (i.e. ``vmin``/``vmax`` were not both given).
    out_path : str or None
        File to write the figure to; when None the figure is shown instead.
    img_ext : str
        Image extension, only used in the log message.
    """
    n_rows = data2plot.shape[0]
    fig_height = 20 if n_rows > 40 else 15 if n_rows > 30 else 10
    plt.figure(figsize=(fig_width, fig_height))
    ax = sns.heatmap(data2plot, vmin=vmin, vmax=vmax, xticklabels=True,
                     yticklabels=row_labels, cmap=cmap, cbar=False)
    ax.set_ylabel(row_title)
    plt.xticks(rotation=90)
    cbar = ax.figure.colorbar(ax.collections[0])
    if n_colors is not None:
        set_cbar_ticks(cbar, function_dict, n_colors)
    plt.title(title)
    if out_path is not None:
        logger.info('Save Heatmap of selected features as ' +
                    img_ext + ' in:\n' + out_path)
        # `frameon` was dropped from savefig in matplotlib 3.3;
        # transparent=True already makes the figure patch transparent.
        plt.savefig(out_path, transparent=True, bbox_inches='tight',
                    pad_inches=0.1)
        plt.close("all")
    else:
        plt.show()


def nexus_express(**set_up_kwargs):
    """Run the Nexus Express differential CNV analysis between two classes.

    The function loads a sample-info table and a processed data table
    (samples x genes), selects samples with ``choose_samples``, splits them
    in two groups on the first ``select_samples_sort_by`` column, calls
    ``get_NexusExpress_diff_analysis`` on the amplification/deletion
    frequencies of the two groups, and writes tables and figures to the
    output directory.

    Required keys in ``set_up_kwargs`` (a ``TypeError`` is raised when any
    is missing): ``select_samples_sort_by``, ``class_labels``,
    ``class_values``, ``sample_info_fname``, ``input_directory``,
    ``sample_info_directory``; ``dupl_genes_directory`` is required only
    when ``toRemoveDupl`` is true (the default).

    Optional keys: ``select_samples_from``, ``select_samples_which``,
    ``select_samples_title``, ``saveReport``, ``toPrint``, ``toPlotFreq``,
    ``reportName``, ``input_fname``, ``gene_info_fname``, ``chr_col``,
    ``sample_info_read_csv_kwargs``, ``toRemoveDupl``, ``min_diff_thres``,
    ``multtest_alpha``, ``with_perc``, ``multtest_method``,
    ``gene_info_directory``, ``output_directory`` and ``plot_kwargs``
    (``function_dict``, ``cmap_custom``, ``vmin``, ``vmax``, ``mincol``,
    ``midcol``, ``maxcol``, ``highRes``).

    Directory and file values may be given as comma-separated parts, which
    are joined with ``os.path.join``.

    Returns
    -------
    None
        All results are written to disk and/or plotted.

    Raises
    ------
    TypeError
        When a required entry of ``set_up_kwargs`` is missing.
    """
    # fail early, and with a readable message, on missing mandatory
    # settings (these used to surface later as obscure TypeErrors)
    required_keys = ('select_samples_sort_by', 'class_labels',
                     'class_values', 'sample_info_fname',
                     'input_directory', 'sample_info_directory')
    missing_keys = [key for key in required_keys
                    if set_up_kwargs.get(key) is None]
    if missing_keys:
        raise TypeError('nexus_express() missing required set_up_kwargs: ' +
                        ', '.join(missing_keys))

    # chose sample set from data
    # function: choose_samples()
    select_samples_from = set_up_kwargs.get('select_samples_from', None)
    select_samples_which = parse_arg_type(
        set_up_kwargs.get('select_samples_which', None), int)
    select_samples_sort_by = \
        set_up_kwargs['select_samples_sort_by'].rsplit(',')
    select_samples_title = set_up_kwargs.get('select_samples_title',
                                             'select_all')
    # the first sort-by column defines the two classes to compare
    clinical_label = select_samples_sort_by[0]

    class_labels = set_up_kwargs['class_labels']
    if isinstance(class_labels, str) and ',' in class_labels:
        class_labels = class_labels.rsplit(',')
    class_values = set_up_kwargs['class_values']
    if isinstance(class_values, str) and ',' in class_values:
        class_values = class_values.rsplit(',')
    class_values = np.array(class_values).astype(int)

    # initialize script params
    saveReport = parse_arg_type(set_up_kwargs.get('saveReport', False), bool)
    toPrint = parse_arg_type(set_up_kwargs.get('toPrint', False), bool)
    toPlotFreq = parse_arg_type(set_up_kwargs.get('toPlotFreq', True), bool)
    reportName = set_up_kwargs.get('reportName', script_fname)
    input_fname = set_up_kwargs.get('input_fname', 'data_processed.csv')
    gene_info_fname = set_up_kwargs.get('gene_info_fname', None)
    chr_col = set_up_kwargs.get('chr_col', 'chr_int')
    sample_info_fname = set_up_kwargs['sample_info_fname']
    if ',' in sample_info_fname:
        sample_info_fname = os.path.join(*sample_info_fname.rsplit(','))
    sample_info_read_csv_kwargs = set_up_kwargs.get(
        'sample_info_read_csv_kwargs', {})
    data_uniq_fname = input_fname.rsplit('.')[0] + '__' + \
        select_samples_title + '__uniq'
    toRemoveDupl = parse_arg_type(set_up_kwargs.get('toRemoveDupl', True),
                                  bool)

    # params for diff analysis
    min_diff_thres = parse_arg_type(set_up_kwargs.get('min_diff_thres',
                                                      0.25), float)
    multtest_alpha = parse_arg_type(set_up_kwargs.get('multtest_alpha',
                                                      0.05), float)
    with_perc = parse_arg_type(set_up_kwargs.get('with_perc', 100), int)
    multtest_method = set_up_kwargs.get('multtest_method', 'fdr_bh')

    # plotting params
    plot_kwargs = set_up_kwargs.get('plot_kwargs', {})
    function_dict = plot_kwargs.get('function_dict', None)
    cmap_custom = plot_kwargs.get('cmap_custom', None)
    vmin = parse_arg_type(plot_kwargs.get('vmin', None), int)
    vmax = parse_arg_type(plot_kwargs.get('vmax', None), int)
    # number of discrete colors; stays None when vmin/vmax are not both
    # known, in which case the colorbar ticks are left untouched
    custom_div_cmap_arg = None
    if (vmin is not None) and (vmax is not None):
        custom_div_cmap_arg = abs(vmin) + abs(vmax)
        if (vmin <= 0) and (vmax >= 0):
            custom_div_cmap_arg = custom_div_cmap_arg + 1
        if cmap_custom is None:
            mincol = plot_kwargs.get('mincol', None)
            midcol = plot_kwargs.get('midcol', None)
            maxcol = plot_kwargs.get('maxcol', None)
            if ((mincol is not None) and
                    (midcol is not None) and
                    (maxcol is not None)):
                cmap_custom = custom_div_cmap(
                    numcolors=custom_div_cmap_arg,
                    mincol=mincol, midcol=midcol, maxcol=maxcol)
            else:
                cmap_custom = custom_div_cmap(
                    numcolors=custom_div_cmap_arg)
    highRes = parse_arg_type(plot_kwargs.get('highRes', False), bool)
    img_ext = '.pdf' if highRes else '.png'

    # initialize directories
    MainDataDir = set_path(os.path.join(script_path, '..', 'data'))

    # data input
    input_directory = set_up_kwargs['input_directory']
    if ',' in input_directory:
        input_directory = os.path.join(*input_directory.rsplit(','))
    input_directory = os.path.join(MainDataDir, input_directory)

    # sample info input
    sample_info_directory = set_up_kwargs['sample_info_directory']
    if ',' in sample_info_directory:
        sample_info_directory = os.path.join(
            *sample_info_directory.rsplit(','))
    sample_info_directory = os.path.join(MainDataDir, sample_info_directory)

    # gene info input
    gene_info_directory = set_up_kwargs.get('gene_info_directory')
    if gene_info_directory is None:
        gene_info_directory = input_directory
    else:
        if ',' in gene_info_directory:
            gene_info_directory = os.path.join(
                *gene_info_directory.rsplit(','))
        gene_info_directory = os.path.join(MainDataDir, gene_info_directory)

    # dupl_genes input (only needed when duplicates are to be removed)
    dupl_genes_directory = set_up_kwargs.get('dupl_genes_directory')
    if dupl_genes_directory is not None:
        dupl_genes_directory = os.path.join(input_directory,
                                            dupl_genes_directory)
    elif toRemoveDupl:
        raise TypeError('nexus_express() missing required set_up_kwargs: ' +
                        'dupl_genes_directory (needed when toRemoveDupl ' +
                        'is true)')

    # data output
    output_directory = set_up_kwargs.get('output_directory')
    if output_directory is None:
        output_directory = set_directory(
            os.path.join(input_directory, reportName))
    else:
        if ',' in output_directory:
            output_directory = os.path.join(*output_directory.rsplit(','))
        output_directory = set_directory(
            os.path.join(MainDataDir, output_directory, reportName))

    # save the set_up_kwargs in the output dir for reproducibility
    fname = 'set_up_kwargs.json'
    f = os.path.join(output_directory, fname)
    if toPrint:
        logger.info('-save set_up_kwargs dictionary for reproducibility in: '
                    + f)
    with open(f, 'w') as fp:
        json.dump(set_up_kwargs, fp, indent=4)

    # load info table of samples
    if toPrint:
        logger.info('Load info table of samples')
    fpath = os.path.join(sample_info_directory, sample_info_fname)
    info_table = load_clinical(fpath, **sample_info_read_csv_kwargs)

    # load processed data
    fpath = os.path.join(input_directory, input_fname)
    data = pd.read_csv(fpath, sep='\t', header=0, index_col=0)
    # DataFrame.sum skips NaN, so an all-NaN row sums to 0 and would never
    # be reported; test the missing-value mask directly instead
    empty_pat = data.isnull().all(axis=1)
    if empty_pat.any():
        logger.info('Patients with missing values in all genes: ' +
                    str(data.index[empty_pat]))
    data = data.fillna(0)

    # load gene info
    if gene_info_fname is not None:
        fpath = os.path.join(gene_info_directory, gene_info_fname)
        genes_positions_table = pd.read_csv(fpath, sep='\t', header=0,
                                            index_col=0)
        # get gene chrom position
        xlabels, xpos = get_chr_ticks(genes_positions_table, data,
                                      id_col='gene', chr_col=chr_col)
    else:
        xlabels, xpos = None, None

    # select the samples for the chosen comparison (e.g. all, only TP53wt,
    # etc)
    logger.info('select_samples_from: ' + str(select_samples_from) +
                ', select_samples_which: ' + str(select_samples_which) +
                ', select_samples_sort_by: ' + str(select_samples_sort_by) +
                ', select_samples_title: ' + str(select_samples_title))

    # keep only info_table with data
    temp = info_table.index.name
    info_table = info_table.loc[data.index].copy()
    info_table.index.name = temp
    ids_tmp = choose_samples(info_table.reset_index(),
                             info_table.index.name,
                             choose_from=select_samples_from,
                             choose_what=select_samples_which,
                             sortby=select_samples_sort_by,
                             ascending=False)

    # keep a subpart of the info_table (rows and columns)
    info_table = info_table.loc[ids_tmp, select_samples_sort_by].copy()

    # keep only these samples from the data
    data = data.loc[ids_tmp, :].copy()

    # heatmap row labels: sample id plus the sort-by columns,
    # shown as integers when they can be cast
    try:
        pat_labels_txt = info_table.astype(int).reset_index().values
    except (TypeError, ValueError, OverflowError):
        pat_labels_txt = info_table.reset_index().values
    pat_labels_title = str(info_table.reset_index().columns.values)

    # plot CNV frequencies of all samples
    data_ampl, data_del = _get_ampl_del_from_data(data)
    if toPlotFreq:
        _plot_oncoscan_frequency_plot(data_ampl, data_del,
                                      select_samples_title, '',
                                      gene_info_fname, xlabels, xpos,
                                      saveReport, img_ext, output_directory)

    extra_label = ''
    if toRemoveDupl:
        # keep a copy of the data with duplicate genes
        data_wDupl = data.copy()
        # xlabels/xpos are None when no gene info file was given
        xlabels_wDupl = None if xlabels is None else xlabels.copy()
        xpos_wDupl = None if xpos is None else xpos.copy()

        # load data with uniq genes (this will be the default data from now
        # on)
        fpath = os.path.join(dupl_genes_directory, data_uniq_fname + '.txt')
        if not os.path.exists(fpath):
            logger.warning('The data_uniq file does not exist, ' +
                           'the analysis will run on the processed data ' +
                           'only!\nfile path:\n' + fpath)
            toRemoveDupl = False
        else:
            extra_label = '_uniq'
            data = pd.read_csv(fpath, sep='\t', header=0, index_col=0)
            data = data.fillna(0)
            # keep the same samples as before
            data = data.loc[data_wDupl.index, :].copy()

            # get gene chrom position
            if gene_info_fname is not None:
                xlabels, xpos = get_chr_ticks(genes_positions_table, data,
                                              id_col='gene', chr_col=chr_col)

            # plot CNV frequencies of all samples with uniq genes
            data_ampl, data_del = _get_ampl_del_from_data(data)
            if toPlotFreq:
                _plot_oncoscan_frequency_plot(data_ampl, data_del,
                                              select_samples_title,
                                              extra_label,
                                              gene_info_fname, xlabels, xpos,
                                              saveReport, img_ext,
                                              output_directory)

            # load duplicate genes dictionary
            # we will need that for the table we will save later
            fpath = os.path.join(dupl_genes_directory,
                                 data_uniq_fname + '.json')
            with open(fpath, 'r') as fp:
                dupl_genes_dict = json.load(fp)

    # separate patient groups and plot their CNV frequencies
    ids_group0 = info_table.index[
        info_table[clinical_label] == class_values[0]]
    ids_group1 = info_table.index[
        info_table[clinical_label] == class_values[1]]
    group0 = data.loc[ids_group0].copy()
    group1 = data.loc[ids_group1].copy()

    group0_ampl, group0_del = _get_ampl_del_from_data(group0)
    if toPlotFreq:
        _plot_oncoscan_frequency_plot(group0_ampl, group0_del,
                                      select_samples_title,
                                      class_labels[0] + extra_label,
                                      gene_info_fname, xlabels, xpos,
                                      saveReport, img_ext, output_directory)

    group1_ampl, group1_del = _get_ampl_del_from_data(group1)
    if toPlotFreq:
        _plot_oncoscan_frequency_plot(group1_ampl, group1_del,
                                      select_samples_title,
                                      class_labels[1] + extra_label,
                                      gene_info_fname, xlabels, xpos,
                                      saveReport, img_ext, output_directory)

    if toRemoveDupl:
        # plot with the duplicate genes too
        group0_wDupl = data_wDupl.loc[ids_group0].copy()
        group1_wDupl = data_wDupl.loc[ids_group1].copy()

        group0_ampl_wDupl, group0_del_wDupl = \
            _get_ampl_del_from_data(group0_wDupl)
        if toPlotFreq:
            _plot_oncoscan_frequency_plot(group0_ampl_wDupl,
                                          group0_del_wDupl,
                                          select_samples_title,
                                          class_labels[0],
                                          gene_info_fname,
                                          xlabels_wDupl, xpos_wDupl,
                                          saveReport, img_ext,
                                          output_directory)

        group1_ampl_wDupl, group1_del_wDupl = \
            _get_ampl_del_from_data(group1_wDupl)
        if toPlotFreq:
            _plot_oncoscan_frequency_plot(group1_ampl_wDupl,
                                          group1_del_wDupl,
                                          select_samples_title,
                                          class_labels[1],
                                          gene_info_fname,
                                          xlabels_wDupl, xpos_wDupl,
                                          saveReport, img_ext,
                                          output_directory)

    # run the Nexus Express diff analysis
    # select genes with significant p-value (multtest_alpha)
    # after mutliple test correction (multtest_method) and
    # absolute change higher than the defined threshold (min_diff_thres)
    mytitle = select_samples_title + ': ' + class_labels[0] + \
        '[' + str(class_values[0]) + '] vs. ' + \
        class_labels[1] + '[' + str(class_values[1]) + ']'
    group0_ampl_new, group1_ampl_new, group0_del_new, group1_del_new, \
        pvals, pvals_corrected, pvals_reject, gained, deleted = \
        get_NexusExpress_diff_analysis(
            group0_ampl, group1_ampl, group0_del, group1_del,
            with_perc=with_perc,
            multtest_method=multtest_method,
            multtest_alpha=multtest_alpha,
            min_diff_thres=min_diff_thres,
            mytitle=mytitle
        )

    # create table with all genes
    if gene_info_fname is not None:
        diff_genes = genes_positions_table.set_index(
            ['gene']).loc[data.columns.values][['chr', 'start', 'end']].copy()
    else:
        diff_genes = pd.DataFrame(index=data.columns.values)
        diff_genes.index.name = 'gene'

    ampl0_col = class_labels[0] + '_' + clinical_label + '_ampl'
    ampl1_col = class_labels[1] + '_' + clinical_label + '_ampl'
    del0_col = class_labels[0] + '_' + clinical_label + '_del'
    del1_col = class_labels[1] + '_' + clinical_label + '_del'
    diff_genes[ampl0_col] = group0_ampl * with_perc
    diff_genes[ampl1_col] = group1_ampl * with_perc
    diff_genes[del0_col] = group0_del * with_perc
    diff_genes[del1_col] = group1_del * with_perc

    diff_genes['pvals'] = pvals
    diff_genes['pvals_corrected'] = pvals_corrected
    diff_genes['pvals_reject'] = pvals_reject
    diff_genes['gained'] = gained
    diff_genes['ampl_diff'] = np.abs(
        diff_genes[ampl0_col] - diff_genes[ampl1_col])
    diff_genes['deleted'] = deleted
    diff_genes['del_diff'] = np.abs(
        diff_genes[del0_col] - diff_genes[del1_col])

    # add the dupl_genes column only if there are duplicate genes
    if toRemoveDupl:
        diff_genes['dupl_genes'] = \
            diff_genes.reset_index()['gene'].map(dupl_genes_dict).values

        # save also the positions of these duplicate genes
        diff_genes['newGeneName'] = diff_genes.index.values
        diff_genes.loc[list(dupl_genes_dict), 'newGeneName'] += '__wDupl'
        if gene_info_fname is not None:
            diff_genes['aggChrGene'] = None
            diff_genes['aggPos'] = None
            diff_genes['aggChrStart'] = None
            diff_genes['aggChrEnd'] = None
            # loop-invariant: index the positions table by gene only once
            gene_pos_by_name = genes_positions_table.set_index('gene')
            # for each duplicated gene, aggregate and save
            # the name, start, end, chr values in the table
            for agene, its_dupl_genes in dupl_genes_dict.items():
                gene_group = [agene]
                gene_group.extend(its_dupl_genes)
                group_rows = gene_pos_by_name.loc[gene_group]
                for agg_col, src_col in (('aggChrEnd', 'end'),
                                         ('aggChrStart', 'start'),
                                         ('aggChrGene', 'gene')):
                    # per chromosome: the unique values of src_col
                    diff_genes.loc[agene, agg_col] = str(natsorted(
                        group_rows.reset_index().groupby(
                            by=['chr'])[src_col].apply(
                                lambda x: list(np.unique(np.append([], x)))
                        ).reset_index().values.tolist()))
                # per chromosome: 'chr:min_start:max_end'
                aggPos = group_rows.groupby(by=['chr']).agg(
                    {'start': 'min', 'end': 'max'}
                ).reset_index().astype(str).apply(
                    lambda x: ':'.join(x), axis=1).values
                diff_genes.loc[agene, 'aggPos'] = \
                    '__'.join(natsorted(aggPos))

    # from the above table: select only the selected genes
    # according to the Nexus Express diff analysis
    diff_genes_selected = diff_genes[(diff_genes['gained'] > 0) |
                                     (diff_genes['deleted'] > 0)].copy()

    # save tables
    if saveReport:
        fname = 'diff_genes_' + select_samples_title + '.csv'
        fpath = os.path.join(output_directory, fname)
        logger.info("-save all diff genes in :\n" + fpath)
        diff_genes.to_csv(fpath, sep='\t', header=True, index=True)

    # NOTE(review): the source indentation was lost; this block is kept
    # independent of saveReport because the heatmap below plots `data`
    # restricted to the selected genes, and because the analogous
    # data_all_genes_class.csv is also written unconditionally - confirm.
    if diff_genes_selected.shape[0] > 0:
        # keep only those genes in the data
        data = data.loc[:, diff_genes_selected.index]
        # change the name of the genes to indicate if they have duplicates
        if 'newGeneName' in diff_genes_selected.columns.values:
            newgeneNames = diff_genes_selected.loc[data.columns,
                                                   'newGeneName'].values
            data.columns = newgeneNames
        # save this data for future classification
        fname = 'data_features_class.csv'
        fpath = os.path.join(output_directory, fname)
        logger.info("-save data with selected diff features for " +
                    mytitle + " and samples class labels in :\n" + fpath)
        data.to_csv(fpath, sep='\t', header=True, index=True)

        # save as tab-delimited csv file
        fname = 'diff_genes_selected_' + select_samples_title + '.csv'
        fpath = os.path.join(output_directory, fname)
        logger.info("-save selected diff genes for " + mytitle +
                    " in :\n" + fpath)
        diff_genes_selected.to_csv(fpath, sep='\t', header=True, index=True)

        # save also as excel file
        fname = 'diff_genes_selected_' + select_samples_title + '.xlsx'
        fpath = os.path.join(output_directory, fname)
        logger.info('-save csv file as excel too')
        # ExcelWriter.save() no longer exists in pandas >= 2.0; the context
        # manager writes and closes the file on every pandas version
        with pd.ExcelWriter(fpath) as writer:
            diff_genes_selected.to_excel(writer,
                                         sheet_name=select_samples_title)

    # plot CNV frequencies OF SELECTED GENES for each group in comparison
    if toPlotFreq:
        if ((group0_ampl_new != 0).any() or (group0_del_new != 0).any()):
            _plot_oncoscan_frequency_plot(group0_ampl_new, group0_del_new,
                                          select_samples_title + '_DIFF',
                                          class_labels[0] + extra_label,
                                          gene_info_fname, xlabels, xpos,
                                          saveReport, img_ext,
                                          output_directory)

        if ((group1_ampl_new != 0).any() or (group1_del_new != 0).any()):
            _plot_oncoscan_frequency_plot(group1_ampl_new, group1_del_new,
                                          select_samples_title + '_DIFF',
                                          class_labels[1] + extra_label,
                                          gene_info_fname, xlabels, xpos,
                                          saveReport, img_ext,
                                          output_directory)

    if toRemoveDupl:
        # start from all-zero frequencies on the full (with duplicates)
        # gene set and copy in the values of the selected genes
        group0_ampl_new_wDupl = group0_ampl_wDupl.copy()
        group0_ampl_new_wDupl[:] = 0
        group1_ampl_new_wDupl = group1_ampl_wDupl.copy()
        group1_ampl_new_wDupl[:] = 0
        group0_del_new_wDupl = group0_del_wDupl.copy()
        group0_del_new_wDupl[:] = 0
        group1_del_new_wDupl = group1_del_wDupl.copy()
        group1_del_new_wDupl[:] = 0
        list__diff_genes_selected_wDupl = []
        for theGene in diff_genes_selected.index:
            genes2edit = [theGene]
            list__diff_genes_selected_wDupl.extend(genes2edit)
            duplgenes_ = diff_genes_selected.loc[theGene, 'dupl_genes']
            # genes without duplicates hold a missing value here; test the
            # value rather than its identity with the np.nan object
            no_duplicates = duplgenes_ is None or (
                isinstance(duplgenes_, float) and np.isnan(duplgenes_))
            if not no_duplicates:
                list__diff_genes_selected_wDupl.extend(duplgenes_)
                genes2edit.extend(duplgenes_)
            group0_ampl_new_wDupl.loc[genes2edit] = \
                group0_ampl_new.loc[theGene]
            group1_ampl_new_wDupl.loc[genes2edit] = \
                group1_ampl_new.loc[theGene]
            group0_del_new_wDupl.loc[genes2edit] = \
                group0_del_new.loc[theGene]
            group1_del_new_wDupl.loc[genes2edit] = \
                group1_del_new.loc[theGene]

        if len(list__diff_genes_selected_wDupl) > 0:
            # save this data for future classification
            fname = 'data_all_genes_class.csv'
            fpath = os.path.join(output_directory, fname)
            logger.info("-save data with selected diff genes for " +
                        mytitle + " and samples class labels in :\n" +
                        fpath)
            data_wDupl[list__diff_genes_selected_wDupl].to_csv(
                fpath, sep='\t', header=True, index=True)

        if toPlotFreq:
            # plot with the duplicate genes too
            if ((group0_ampl_new_wDupl != 0).any() or
                    (group0_del_new_wDupl != 0).any()):
                _plot_oncoscan_frequency_plot(
                    group0_ampl_new_wDupl, group0_del_new_wDupl,
                    select_samples_title + '_DIFF', class_labels[0],
                    gene_info_fname, xlabels_wDupl, xpos_wDupl,
                    saveReport, img_ext, output_directory)

            if ((group1_ampl_new_wDupl != 0).any() or
                    (group1_del_new_wDupl != 0).any()):
                _plot_oncoscan_frequency_plot(
                    group1_ampl_new_wDupl, group1_del_new_wDupl,
                    select_samples_title + '_DIFF', class_labels[1],
                    gene_info_fname, xlabels_wDupl, xpos_wDupl,
                    saveReport, img_ext, output_directory)

    # PLOT heatmaps of selected features
    if diff_genes_selected.shape[0] > 0:
        # get only the CNVs from the selected genes
        ds_x = data.shape[1]
        fs_x = 25 if ds_x > 45 else 15 if ds_x > 30 else 10 if ds_x > 3 else 5
        fpath = None
        if saveReport:
            fpath = os.path.join(
                output_directory, 'Fig_Heatmap_' + select_samples_title +
                extra_label + img_ext)
        _plot_selected_genes_heatmap(
            data, fs_x, pat_labels_txt, pat_labels_title, mytitle,
            vmin, vmax, cmap_custom, function_dict, custom_div_cmap_arg,
            fpath, img_ext)

        # NOTE(review): nested here (not at function level) so that it is
        # never attempted on an empty gene list - confirm against the
        # original layout.
        if toRemoveDupl:
            data2plot = data_wDupl[list__diff_genes_selected_wDupl]
            ds_x = data2plot.shape[1]
            fs_x = 25 if ds_x > 45 else 15 if ds_x > 30 else 10
            fpath = None
            if saveReport:
                fpath = os.path.join(
                    output_directory, 'Fig_Heatmap_' + select_samples_title +
                    '_wDupl' + img_ext)
            _plot_selected_genes_heatmap(
                data2plot, fs_x, pat_labels_txt, pat_labels_title, mytitle,
                vmin, vmax, cmap_custom, function_dict, custom_div_cmap_arg,
                fpath, img_ext)
def combine_cohorts(**set_up_kwargs):
    """Combine the data tables of several cohorts into one table.

    Every file listed under ``files_to_combine_samples`` is read as a
    tab-separated table and the tables are concatenated row-wise on their
    common columns (inner join). Optionally the matching sample-info tables
    are loaded with ``load_clinical``, tagged with a ``dataset`` column and
    concatenated on the union of their columns (outer join). Heatmaps of
    the combined data are plotted, and the combined tables are written as
    ``integrated_data.csv`` and ``integrated_sample_info.csv``.

    Keys read from ``set_up_kwargs``: ``output_directory`` (required),
    ``files_to_combine_samples``, ``saveReport``, ``toPrint``,
    ``reportName``, ``txt_label``, ``gene_info_fpath``, ``chr_col``,
    ``gene_id_col``, ``plot_kwargs`` (``function_dict``, ``highRes``,
    ``cmap_custom``, ``vmin``, ``vmax``, ``mincol``, ``midcol``,
    ``maxcol``) and ``sample_info_kwargs`` (``sample_info_fpaths``,
    ``sample_info_read_csv_kwargs``, ``sample_final_id``,
    ``sample_info_new_label``, ``sample_info_combine_labels``,
    ``sample_info_swap_class_label``, ``new_sample_info_fpath``).

    Returns
    -------
    None
        All results are written to disk and/or plotted.

    Raises
    ------
    TypeError
        When ``output_directory`` is missing from ``set_up_kwargs``.
    """
    # initialize script params
    saveReport = parse_arg_type(set_up_kwargs.get('saveReport', False), bool)
    toPrint = parse_arg_type(set_up_kwargs.get('toPrint', False), bool)
    reportName = set_up_kwargs.get('reportName', script_fname)
    txt_label = set_up_kwargs.get('txt_label', 'test_txt_label')

    # plotting params
    plot_kwargs = set_up_kwargs.get('plot_kwargs', {})
    function_dict = plot_kwargs.get('function_dict', None)
    highRes = parse_arg_type(plot_kwargs.get('highRes', False), bool)
    img_ext = '.pdf' if highRes else '.png'
    cmap_custom = plot_kwargs.get('cmap_custom', None)
    vmin = parse_arg_type(plot_kwargs.get('vmin', None), int)
    vmax = parse_arg_type(plot_kwargs.get('vmax', None), int)
    # number of discrete colors; stays None when vmin/vmax are not both
    # known, in which case the colorbar ticks are left untouched
    custom_div_cmap_arg = None
    if (vmin is not None) and (vmax is not None):
        custom_div_cmap_arg = abs(vmin) + abs(vmax)
        if (vmin <= 0) and (vmax >= 0):
            custom_div_cmap_arg = custom_div_cmap_arg + 1
        if cmap_custom is None:
            mincol = plot_kwargs.get('mincol', None)
            midcol = plot_kwargs.get('midcol', None)
            maxcol = plot_kwargs.get('maxcol', None)
            if ((mincol is not None) and
                    (midcol is not None) and
                    (maxcol is not None)):
                cmap_custom = custom_div_cmap(
                    numcolors=custom_div_cmap_arg,
                    mincol=mincol, midcol=midcol, maxcol=maxcol)
            else:
                cmap_custom = custom_div_cmap(
                    numcolors=custom_div_cmap_arg)

    # initialize directories
    MainDataDir = set_path(os.path.join(script_path, '..', 'data'))

    # data input
    data_fpaths = _split_argument_to_list(set_up_kwargs,
                                          'files_to_combine_samples',
                                          asPath=True,
                                          MainDataDir=MainDataDir)

    # data output
    _output_directory = set_up_kwargs.get('output_directory')
    if _output_directory is None:
        raise TypeError('combine_cohorts() missing required set_up_kwargs: '
                        'output_directory')
    if ',' in _output_directory:
        _output_directory = os.path.join(*_output_directory.rsplit(','))
    _output_directory = os.path.join(MainDataDir, _output_directory)
    output_directory = set_directory(
        os.path.join(_output_directory, reportName))

    # save the set_up_kwargs in the output dir for reproducibility
    fname = 'set_up_kwargs.json'
    f = os.path.join(output_directory, fname)
    if toPrint:
        logger.info('-save set_up_kwargs dictionary for reproducibility in: '
                    + f)
    with open(f, 'w') as fp:
        json.dump(set_up_kwargs, fp, indent=4)

    # sample_info params
    sample_info_kwargs = set_up_kwargs.get('sample_info_kwargs', {})
    save_new_sample_info = False
    # NOTE(review): the source indentation was lost; everything below is
    # only used when sample_info_kwargs is non-empty, so it is kept inside
    # this branch - confirm against the original layout.
    if sample_info_kwargs:
        save_new_sample_info = True
        # sample info input
        sample_info_fpaths = _split_argument_to_list(
            sample_info_kwargs, 'sample_info_fpaths',
            asPath=True, MainDataDir=MainDataDir)
        sample_info_read_csv_kwargs = sample_info_kwargs.get(
            'sample_info_read_csv_kwargs', {})
        sample_final_id = _split_argument_to_list(
            sample_info_kwargs, 'sample_final_id', asPath=False)
        sample_info_new_label = _split_argument_to_list(
            sample_info_kwargs, 'sample_info_new_label', asPath=False)
        sample_info_combine_labels = _split_argument_to_list(
            sample_info_kwargs, 'sample_info_combine_labels', asPath=False)
        sample_info_swap_class_label = _split_argument_to_list(
            sample_info_kwargs, 'sample_info_swap_class_label', asPath=False)

        # new sample_info output dir
        new_sample_info_fpath = sample_info_kwargs.get(
            'new_sample_info_fpath')
        if new_sample_info_fpath is None:
            new_sample_info_fpath = _output_directory
        else:
            if ',' in new_sample_info_fpath:
                new_sample_info_fpath = os.path.join(
                    *new_sample_info_fpath.rsplit(','))
            new_sample_info_fpath = os.path.join(MainDataDir,
                                                 new_sample_info_fpath)

    data_dfs = []
    if save_new_sample_info:
        sample_info_tables = []

    # load info table of samples
    for i, fpath in enumerate(data_fpaths):
        if save_new_sample_info:
            try:
                # work on a copy so the caller's nested kwargs dict is not
                # modified as a side effect of this call
                read_csv_kwargs = dict(sample_info_read_csv_kwargs[str(i)])
                read_csv_kwargs['col_as_index'] = sample_final_id[i]
                info_table = load_clinical(sample_info_fpaths[i],
                                           **read_csv_kwargs)
            except Exception as ex:
                logger.error('Load info table of samples FAILED!')
                logger.error(ex)
                raise

            swap_labels = sample_info_swap_class_label[i]
            if not isinstance(swap_labels, list):
                swap_labels = [swap_labels]

            for swap_label in swap_labels:
                if swap_label == '':
                    continue
                logger.warning('The user requested to swap the ' +
                               str(swap_label) +
                               ' label in the ' + str(i) + ' dataset')
                # flip the binary class label: truthy -> 0, falsy -> 1
                info_table[swap_label] = (
                    ~info_table[swap_label].astype(bool)).astype(int)

            info_table['dataset'] = i

        # load data
        try:
            df = pd.read_csv(fpath, sep='\t', header=0, index_col=0)
            logger.info('loaded data file with shape: ' + str(df.shape))
        except Exception as ex:
            logger.error('failed to read data file from: ' + str(fpath))
            logger.error(ex)
            raise

        data_dfs.append(df)
        if save_new_sample_info:
            sample_info_tables.append(info_table)

    # now we join the cohort samples from the multiple datasets
    # on the common features (inner join)
    data = pd.concat(data_dfs, axis=0, join='inner', sort=False)

    # gene info input
    gene_info_fpath = set_up_kwargs.get('gene_info_fpath')
    if gene_info_fpath is not None:
        if ',' in gene_info_fpath:
            gene_info_fpath = os.path.join(*gene_info_fpath.rsplit(','))
        gene_info_fpath = os.path.join(MainDataDir, gene_info_fpath)
        chr_col = set_up_kwargs.get('chr_col', 'chr_int')
        gene_id_col = set_up_kwargs.get('gene_id_col', 'gene')

        # load gene info
        try:
            genes_positions_table = pd.read_csv(gene_info_fpath, sep='\t',
                                                header=0, index_col=0)
            # get gene chrom position
            # NOTE(review): id_col is hard-coded to 'gene' here while
            # gene_id_col is used for the ordering below - confirm whether
            # gene_id_col should be passed instead.
            xlabels, xpos = get_chr_ticks(genes_positions_table, data,
                                          id_col='gene', chr_col=chr_col)
        except Exception as ex:
            logger.warning('could not get genes position info')
            logger.warning(ex)
            xlabels, xpos = None, None
    else:
        xlabels, xpos = None, None

    if save_new_sample_info:
        # do the same for the info_tables
        # but keep all collumns (outer join)
        sample_info = pd.concat(sample_info_tables, axis=0, join='outer',
                                sort=False)
        sample_info.index.name = 'patientID'

        # create new label name by merging existing labels
        # (when no common label name between cohorts)
        if sample_info_new_label is not None:
            if isinstance(sample_info_new_label, list):
                new_label_list = sample_info_new_label
                combine_labels_list = sample_info_combine_labels
            else:
                new_label_list = [sample_info_new_label]
                combine_labels_list = [sample_info_combine_labels]

            for label_pos, new_label in enumerate(new_label_list):
                combine_labels = combine_labels_list[label_pos]
                sample_info[new_label] = \
                    sample_info[combine_labels].sum(axis=1)
                logger.info('combined labels: ' + str(combine_labels) +
                            ' into the new label: ' + str(new_label))

    # sort the samples by name
    all_samples = natsorted(data.index.values)
    data = data.loc[all_samples, :]

    # heatmap of combined data (on samples)
    _figure_x_size, _figure_y_size, _show_gene_names, _show_sample_names = \
        set_heatmap_size(data)
    plt.figure(figsize=(_figure_x_size, _figure_y_size))
    ax = sns.heatmap(data, vmin=vmin, vmax=vmax,
                     yticklabels=_show_sample_names,
                     xticklabels=_show_gene_names,
                     cmap=cmap_custom, cbar=False)
    plt.xticks(rotation=90)
    cbar = ax.figure.colorbar(ax.collections[0])
    if custom_div_cmap_arg is not None:
        set_cbar_ticks(cbar, function_dict, custom_div_cmap_arg)
    plt.title(txt_label)

    if saveReport:
        logger.info('Save heatmap')
        fpath = os.path.join(output_directory, 'Fig_heatmap' + img_ext)
        # `frameon` was dropped from savefig in matplotlib 3.3;
        # transparent=True already makes the figure patch transparent.
        plt.savefig(fpath, transparent=True, bbox_inches='tight',
                    pad_inches=0.1)
        plt.close("all")
    else:
        plt.show()

    #########################################
    if (xlabels is not None) and (xpos is not None):
        # ORDER genes
        if toPrint:
            logger.info('Order data according to genomic position')

        # extract the gene relative order
        gene_order = genes_positions_table.set_index(
            gene_id_col).loc[:, 'order'].copy()
        # keep only the order of the genes that exist in the data; a
        # boolean mask is used because pandas >= 2.0 rejects a set as a
        # .loc indexer
        gene_order = gene_order[
            gene_order.index.isin(data.columns.values)].copy()
        gene_order = gene_order.sort_values()
        # then keep only these genes from the data
        data2plot = data.loc[:, gene_order.index].copy()

        # PLOT heatmap after gene ordering
        if toPrint:
            logger.info('Plot heatmap after gene ordering')
        _figure_x_size, _figure_y_size, \
            _show_gene_names, _show_sample_names = \
            set_heatmap_size(data2plot)
        plt.figure(figsize=(_figure_x_size, _figure_y_size))
        ax = sns.heatmap(data2plot, vmin=vmin, vmax=vmax,
                         xticklabels=_show_gene_names,
                         yticklabels=_show_sample_names,
                         cmap=cmap_custom, cbar=False)
        ax.set_xticks(xpos)
        ax.set_xticklabels(xlabels, rotation=0)
        cbar = ax.figure.colorbar(ax.collections[0])
        if custom_div_cmap_arg is not None:
            set_cbar_ticks(cbar, function_dict, custom_div_cmap_arg)
        if saveReport:
            if toPrint:
                logger.info('Save heatmap')
            plt.savefig(os.path.join(output_directory,
                                     'Fig_heatmap_ordered' + img_ext),
                        transparent=True, bbox_inches='tight',
                        pad_inches=0.1)
            plt.close("all")
        else:
            plt.show()

    #########################################
    # save the combined data
    fname = 'integrated_data.csv'
    fpath = os.path.join(output_directory, fname)
    logger.info('-save the combined data from different cohorts ' +
                'with shape:\n' + str(data.shape))
    data.to_csv(fpath, sep='\t')

    if save_new_sample_info:
        # save the sample_info
        fname = 'integrated_sample_info.csv'
        fpath = os.path.join(new_sample_info_fpath, fname)
        logger.info('-save the combined sample_info from different cohorts '
                    + 'with shape:\n' + str(sample_info.shape))
        sample_info.to_csv(fpath, sep='\t')
def set_up_data(**set_up_kwargs):
    """Build one samples-by-genes table from the raw input and save it.

    Every option is read from ``set_up_kwargs``; the dictionary itself is
    dumped to ``set_up_kwargs.json`` in the output directory. Directory and
    file-name options may be given as comma-separated strings, which are
    joined into a path with ``os.path.join``.

    Two scenarios exist, chosen by ``editWith``:

    * ``'genepanel'`` in ``editWith``: the first data file is read as a
      tab-separated table and handed to ``edit_genepanel``.
    * otherwise: the per-sample tables returned by
      ``load_and_process_files`` / ``load_and_process_summary_file`` are
      concatenated, the start/end/chr rows are split off, genes that appear
      on more than one chromosome are dropped, and the genes are ordered by
      genomic position.

    Keyword Args:
        saveReport, toPrint: flags parsed with ``parse_arg_type``.
        reportName: sub-directory name of the output directory.
        editWith: scenario selector (default ``'Oncoscan'``).
        txt_label: label used in log messages.
        chr_col, gene_id_col: column names of the gene-position table.
        sample_info_fname, sample_info_read_csv_kwargs: sample info input.
        sample_info_table_sortLabels: comma-separated sample-info columns
            used to sort the samples in the abundance bar plots; when
            missing or empty the samples keep their original order.
        plot_kwargs: only ``highRes`` is read here (pdf vs png output).
        input_directory, output_directory, data_directory, data_files:
            locations of the input/output, relative to the main data dir.
        edit_kwargs: forwarded to ``edit_genepanel`` (genepanel only).
        load_files: choose between the two loader helpers.

    Raises:
        ValueError: if gene ids are duplicated after merging all samples.
    """
    # initialize script params
    saveReport = parse_arg_type(set_up_kwargs.get('saveReport', False), bool)
    toPrint = parse_arg_type(set_up_kwargs.get('toPrint', False), bool)
    reportName = set_up_kwargs.get('reportName', script_fname)
    editWith = set_up_kwargs.get('editWith', 'Oncoscan')
    txt_label = set_up_kwargs.get('txt_label', 'test_txt_label')
    chr_col = set_up_kwargs.get('chr_col', 'chr_int')
    gene_id_col = set_up_kwargs.get('gene_id_col', 'gene')
    sample_info_fname = set_up_kwargs.get('sample_info_fname',
                                          '20180704_emca.csv')
    if ',' in sample_info_fname:
        sample_info_fname = os.path.join(*sample_info_fname.rsplit(','))
    sample_info_read_csv_kwargs = set_up_kwargs.get(
        'sample_info_read_csv_kwargs', {})
    # the sort labels are optional: calling .rsplit() on a missing (None)
    # value used to crash with an AttributeError
    sample_info_table_sortLabels = \
        set_up_kwargs.get('sample_info_table_sortLabels', None)
    if sample_info_table_sortLabels:
        sample_info_table_sortLabels_list = \
            sample_info_table_sortLabels.rsplit(',')
    else:
        sample_info_table_sortLabels_list = []

    # plotting params
    plot_kwargs = set_up_kwargs.get('plot_kwargs', {})
    highRes = parse_arg_type(plot_kwargs.get('highRes', False), bool)
    img_ext = '.pdf' if highRes else '.png'

    # initialize directories
    MainDataDir = set_path(os.path.join(script_path, '..', 'data'))
    input_directory = set_up_kwargs.get('input_directory')
    if ',' in input_directory:
        input_directory = os.path.join(*input_directory.rsplit(','))
    input_directory = os.path.join(MainDataDir, input_directory)
    output_directory = set_up_kwargs.get('output_directory')
    if output_directory is None:
        output_directory = set_directory(
            os.path.join(input_directory, reportName))
    else:
        if ',' in output_directory:
            output_directory = os.path.join(*output_directory.rsplit(','))
        output_directory = set_directory(
            os.path.join(MainDataDir, output_directory, reportName))

    # save the set_up_kwargs in the output dir for reproducibility
    fname = 'set_up_kwargs.json'
    f = os.path.join(output_directory, fname)
    if toPrint:
        logger.info('-save set_up_kwargs dictionary for reproducibility in: '
                    + f)
    with open(f, 'w') as fp:
        json.dump(set_up_kwargs, fp, indent=4)

    data_directory = set_up_kwargs.get('data_directory', None)
    if data_directory is None:
        data_directory = input_directory
    else:
        if ',' in data_directory:
            data_directory = os.path.join(*data_directory.rsplit(','))
    data_files = set_up_kwargs.get('data_files', '')
    data_files_list = data_files.rsplit(',')
    # NOTE(review): ''.rsplit(',') returns [''], so this list is never empty
    # and the else branch below is never taken. Left as is because the
    # loaders that consume `fpaths` are not visible from here - confirm what
    # they expect before changing it.
    if len(data_files_list) > 0:
        fpaths = [
            os.path.join(input_directory, data_directory, aFile)
            for aFile in data_files_list
        ]
    else:
        fpaths = os.path.join(input_directory, data_directory)

    # load info table of samples
    if toPrint:
        logger.info('Load info table of samples')
    fpath = os.path.join(input_directory, sample_info_fname)
    info_table = load_clinical(fpath, **sample_info_read_csv_kwargs)

    if toPrint:
        logger.info('Missing values for each column:\n')
        info_table_isna_sum = info_table.isna().sum()
        for _col_name, _n_missing in info_table_isna_sum.items():
            logger.info(str(_col_name) + '\t' + str(_n_missing))

    #########################################
    if 'genepanel' in editWith:
        edit_kwargs = set_up_kwargs.get('edit_kwargs', {})

        # load data table; only the first file is used, so complain only
        # when more than one file was actually given
        if len(fpaths) > 1:
            logger.error('more than one data files were given!\n' +
                         str(fpaths))
        variants = pd.read_csv(fpaths[0], sep='\t', header=0)

        # EDIT:
        #   map function impact to value with function_dict
        #   substitute allele frequencies with impact values
        #   aggregate rows to unique genes, choose how to merge
        #   remove patients
        data = edit_genepanel(variants, **edit_kwargs)
        xlabels, xpos = None, None
    else:
        load_files = parse_arg_type(set_up_kwargs.get('load_files', False),
                                    bool)
        # load data/files from each patient
        if load_files:
            if toPrint:
                logger.info(txt_label + ': load files from all patients\n')
            pat_data_list, pat_data_or_dict, dropped_rows_filter, \
                dropped_rows_process, dropped_rows_edit, info_table = \
                load_and_process_files(
                    fpaths, info_table, **set_up_kwargs)
        else:
            if toPrint:
                logger.info(txt_label + ': load data from all patients\n')
            pat_data_list, pat_data_or_dict, dropped_rows_filter, \
                dropped_rows_process, dropped_rows_edit, info_table = \
                load_and_process_summary_file(
                    fpaths, info_table, **set_up_kwargs)

        if (dropped_rows_filter.shape[0] > 0) and (saveReport):
            f_new = 'allsamples__dropped_rows_filter.txt'
            if toPrint:
                logger.info('-save dropped rows from filtering in:\n' + f_new)
            dropped_rows_filter.to_csv(os.path.join(output_directory, f_new),
                                       sep='\t', header=True, index=True)

        if (dropped_rows_process.shape[0] > 0) and (saveReport):
            f_new = 'allsamples__dropped_rows_process.txt'
            if toPrint:
                logger.info('-save dropped rows from processing in:\n' +
                            f_new)
            dropped_rows_process.to_csv(os.path.join(output_directory, f_new),
                                        sep='\t', header=True, index=True)

        if (dropped_rows_edit.shape[0] > 0) and (saveReport):
            f_new = 'allsamples__dropped_rows_edit.txt'
            if toPrint:
                logger.info('-save dropped rows from editing in:\n' + f_new)
            dropped_rows_edit.to_csv(os.path.join(output_directory, f_new),
                                     sep='\t', header=True, index=True)

        ##################################################
        # concat all samples in one table and keep union of all genes
        if toPrint:
            logger.info('Concantanate all ' + editWith +
                        ' samples in 2 tables (with position, only values)\n')
        # samples in rows, genes in columns
        table_withPos = pd.concat(pat_data_list, join='outer',
                                  axis=1, sort=False).T

        # CLEAN THE data FROM ALL SAMPLES
        # extract the start, end and chrom info from the table
        # and keep only the functions values
        start_table = \
            table_withPos[table_withPos.index.str.contains('start')].copy()
        end_table = \
            table_withPos[table_withPos.index.str.contains('end')].copy()
        chr_table = \
            table_withPos[table_withPos.index.str.contains('chr')].copy()
        data = table_withPos.drop(np.concatenate(
            [start_table.index, end_table.index, chr_table.index],
            axis=0), axis=0)
        if toPrint:
            logger.info('Dimensions of data (samples,genes): ' +
                        str(data.shape))

        # keep only the part of each row label that precedes the first ':'
        for _table in (data, start_table, end_table, chr_table):
            _table.index = [
                index_name.rsplit(':')[0] for index_name in _table.index
            ]

        # remove genes that exist in multiple chromosomes across samples
        ll = [
            list(chr_table[col].dropna().unique())
            for col in chr_table.columns
        ]
        n = max(map(len, ll))
        uniq_chr_per_gene = pd.DataFrame(
            [[_uniq_chr_per_gene(j, i) for j in ll] for i in range(n)],
            columns=chr_table.columns)
        genes2drop = uniq_chr_per_gene.columns[(
            ~uniq_chr_per_gene.isnull()).sum() > 1].values
        if toPrint:
            logger.info('Remove ' + str(genes2drop.shape[0]) +
                        ' genes that exist in multiple chromosomes ' +
                        'across samples:\n' + str(genes2drop))

        if (genes2drop.shape[0] > 0):
            start_table.drop(genes2drop, axis=1, inplace=True)
            end_table.drop(genes2drop, axis=1, inplace=True)
            chr_table.drop(genes2drop, axis=1, inplace=True)
            data.drop(genes2drop, axis=1, inplace=True)
            uniq_chr_per_gene.drop(genes2drop, axis=1, inplace=True)
            if toPrint:
                logger.info('Dimensions of data (samples,genes):' +
                            str(data.shape))
        # every remaining gene has a single chromosome: keep the first row
        uniq_chr_per_gene = uniq_chr_per_gene.iloc[0, :].copy()

        # ORDER THE GENES FROM ALL SAMPLES (SLOW?)
        if toPrint:
            logger.info('Create a Dataframe with the genes ' +
                        'and their genomic positions')
        gene_pos = pd.concat([
            start_table.apply(lambda x: pd.to_numeric(
                x, errors='ignore', downcast='integer')).min().astype(int),
            end_table.apply(lambda x: pd.to_numeric(
                x, errors='ignore', downcast='integer')).max().astype(int),
            uniq_chr_per_gene
        ], axis=1, sort=False)
        gene_pos.columns = ['start', 'end', 'chr']
        gene_pos.index.name = gene_id_col
        gene_pos.reset_index(inplace=True)
        gene_pos['chr_gene'] = gene_pos['chr'] + ':' + gene_pos[gene_id_col]
        # `n` must be passed by keyword: it is keyword-only in pandas >= 2.0
        gene_pos[chr_col] = gene_pos['chr'].str.split('chr', n=2).str[1]
        gene_pos['toNatSort'] = [
            ':'.join([str(_chrom), str(_start), str(_end)])
            for _chrom, _start, _end in zip(
                gene_pos[chr_col], gene_pos['start'], gene_pos['end'])
        ]
        if toPrint:
            logger.info('Dataframes agree (?): ' +
                        str(gene_pos.shape[0] == data.shape[1]))

        # are the genes duplicated ?
        dupl_genes = gene_pos[gene_id_col].duplicated()
        if dupl_genes.any():
            logger.error('genes are duplicated, check your data first!')
            logger.info('duplicated genes:' +
                        str(gene_pos[gene_id_col][dupl_genes].values))
            # `raise ()` is not a valid raise and ended in a TypeError
            raise ValueError('genes are duplicated, check your data first!')
        else:
            if toPrint:
                logger.info('gene names are unique, continue..')

        if toPrint:
            logger.info('Order genes according to genomic position')
        gene_order = index_natsorted(gene_pos['toNatSort'])
        gene_pos = gene_pos.iloc[gene_order, :].copy()
        gene_pos.reset_index(drop=True, inplace=True)
        gene_pos.index.name = 'order'
        gene_pos.reset_index(inplace=True)

        #########################################
        # CREATE dictionary of gene names and their order
        gene_order_dict = {
            _gene: int(_order)
            for _gene, _order in zip(gene_pos[gene_id_col],
                                     gene_pos['order'])
        }

        xlabels, xpos = get_chr_ticks(gene_pos, data,
                                      id_col=gene_id_col, chr_col=chr_col)

        # SAVE ordered table and gene pos info table
        if saveReport:
            fname = 'genes_info.csv'
            f = os.path.join(output_directory, fname)
            if toPrint:
                logger.info('-save genes info: ' + f)
            gene_pos.to_csv(f, sep='\t', header=True, index=True)

            fname = 'gene_order_dict.json'
            f = os.path.join(output_directory, fname)
            if toPrint:
                logger.info('-save genes order dictionary: ' + f)
            with open(f, 'w') as fp:
                json.dump(gene_order_dict, fp, indent=4)

        #########################################
        for label in [
                'rows_in_sample', 'rows_in_sample_filt',
                'rows_in_sample_processed', 'rows_in_sample_editted'
        ]:
            if label in info_table.columns:
                # PLOT Abundance of gene data per sample
                if toPrint:
                    logger.info('Plot ' + label + ' for each sample')
                mutCount = info_table[[label]].copy()
                if sample_info_table_sortLabels_list:
                    patient_new_order = \
                        info_table.loc[mutCount.index].sort_values(
                            by=sample_info_table_sortLabels_list)
                    xticklabels = list(
                        zip(
                            patient_new_order.index.values,
                            info_table.loc[
                                patient_new_order.index,
                                sample_info_table_sortLabels_list].values))
                else:
                    # no sort labels given: keep the samples as they are
                    patient_new_order = info_table.loc[mutCount.index]
                    xticklabels = list(patient_new_order.index.values)
                mutCount = mutCount.loc[patient_new_order.index]
                # rank of each sample's count, used to pick its bar color
                rank = mutCount[label].argsort().argsort().values
                pal = sns.cubehelix_palette(mutCount.shape[0],
                                            reverse=True,
                                            dark=.40, light=.95)
                plt.figure(figsize=(10, 5))
                g = sns.barplot(np.arange(mutCount.shape[0]),
                                mutCount[label],
                                palette=np.array(pal[::-1])[rank])
                g.set_xticklabels(xticklabels, rotation=90)
                g.set(xlabel='samples', ylabel='count')
                g.set_title('Abundance of ' + label + ' per sample: ' +
                            str((mutCount[label] <= 0).sum()) +
                            ' empty samples')
                if saveReport:
                    logger.info('Save figure')
                    plt.savefig(os.path.join(output_directory,
                                             'Fig_samples_' + label +
                                             img_ext),
                                transparent=True, bbox_inches='tight',
                                pad_inches=0.1, frameon=False)
                    plt.close("all")
                else:
                    plt.show()
    # -- END IF -- #

    #########################################
    # SAVE data and sample_info
    if saveReport:
        # save files
        fname = 'data.csv'
        f = os.path.join(output_directory, fname)
        if toPrint:
            logger.info('-save ordered data: ' + f)
        data.to_csv(f, sep='\t', header=True, index=True)

        fname = 'sample_info.csv'
        f = os.path.join(output_directory, fname)
        if toPrint:
            logger.info('-save sample_info: ' + f)
        info_table.to_csv(f, sep='\t', header=True, index=True)
def process_data(**set_up_kwargs):
    """Filter, re-map and re-order a samples-by-genes table, then plot it.

    Every option is read from ``set_up_kwargs``; the dictionary itself is
    dumped to ``set_up_kwargs.json`` in the output directory. Directory and
    file-name options may be given as comma-separated strings, which are
    joined into a path with ``os.path.join``.

    Steps: load the data (NaNs are replaced by 0), the sample info and -
    when available - the gene position table; optionally keep only
    ``select_genes``; optionally re-map the values (``map_values``); select
    and order the samples with ``choose_samples``; plot a heatmap before
    and, when gene positions are known, after ordering the genes by genomic
    position; save the result as ``data_processed.csv`` when ``saveReport``
    is set.

    Keyword Args:
        saveReport, toPrint: flags parsed with ``parse_arg_type``.
        reportName: sub-directory name of the output directory.
        input_fname, gene_info_fname, sample_info_fname: input file names.
        sample_info_read_csv_kwargs: forwarded to ``load_clinical``.
        chr_col, gene_id_col: column names of the gene-position table.
        remove_patients, select_genes: comma-separated ids.
        old_data_sample_id: sample-info column holding the ids used in the
            data; when given, the data index is replaced by the sample-info
            index.
        select_samples_from, select_samples_which, select_samples_sort_by:
            forwarded to ``choose_samples``.
        map_values: a dict of old -> new values, or one of ``'bin'``,
            ``'binary'``, ``'binarize'``.
        plot_kwargs: ``function_dict``, ``cmap_custom``, ``vmin``, ``vmax``,
            ``mincol``, ``midcol``, ``maxcol``, ``highRes``.
        input_directory, sample_info_directory, gene_info_directory,
        output_directory: locations relative to the main data directory.
    """
    # initialize script params
    saveReport = parse_arg_type(set_up_kwargs.get('saveReport', False), bool)
    toPrint = parse_arg_type(set_up_kwargs.get('toPrint', False), bool)
    reportName = set_up_kwargs.get('reportName', script_fname)
    input_fname = set_up_kwargs.get('input_fname', 'data_processed.csv')
    gene_info_fname = set_up_kwargs.get('gene_info_fname',
                                        'gene_info_fname.csv')
    chr_col = set_up_kwargs.get('chr_col', 'chr_int')
    gene_id_col = set_up_kwargs.get('gene_id_col', 'gene')
    # NOTE(review): the list of patients to remove is parsed here but never
    # applied anywhere in this function - confirm whether that is intended.
    remove_patients = set_up_kwargs.get('remove_patients', None)
    if remove_patients is None or remove_patients == "":
        remove_patients_list = []
    else:
        remove_patients_list = remove_patients.rsplit(',')
    select_genes = set_up_kwargs.get('select_genes', None)
    if select_genes is None or select_genes == "":
        select_genes_list = []
    else:
        select_genes_list = select_genes.rsplit(',')
    sample_info_fname = set_up_kwargs.get('sample_info_fname',
                                          '20180704_emca.csv')
    if ',' in sample_info_fname:
        sample_info_fname = os.path.join(*sample_info_fname.rsplit(','))
    sample_info_read_csv_kwargs = set_up_kwargs.get(
        'sample_info_read_csv_kwargs', {})
    old_data_sample_id = set_up_kwargs.get('old_data_sample_id', None)
    change_id = old_data_sample_id is not None

    # chose sample set from data
    # function: choose_samples()
    select_samples_from = set_up_kwargs.get('select_samples_from', None)
    select_samples_which = parse_arg_type(
        set_up_kwargs.get('select_samples_which', None), int)
    select_samples_sort_by = set_up_kwargs.get('select_samples_sort_by',
                                               None)
    if select_samples_sort_by is not None:
        select_samples_sort_by = select_samples_sort_by.rsplit(',')

    # map_values_dict
    map_values = set_up_kwargs.get('map_values', None)
    map_values_dict = None
    if isinstance(map_values, dict):
        map_values_dict = {int(k): int(v) for k, v in map_values.items()}

    # plotting params
    plot_kwargs = set_up_kwargs.get('plot_kwargs', {})
    function_dict = plot_kwargs.get('function_dict', None)
    cmap_custom = plot_kwargs.get('cmap_custom', None)
    vmin = parse_arg_type(plot_kwargs.get('vmin', None), int)
    vmax = parse_arg_type(plot_kwargs.get('vmax', None), int)
    # defined up front so that the set_cbar_ticks() calls below do not fail
    # on an unbound name when the colormap is not built here.
    # NOTE(review): confirm that set_cbar_ticks() accepts None.
    custom_div_cmap_arg = None
    if (cmap_custom is None) and (vmin is not None) and (vmax is not None):
        custom_div_cmap_arg = abs(vmin) + abs(vmax)
        if (vmin <= 0) and (vmax >= 0):
            custom_div_cmap_arg = custom_div_cmap_arg + 1
        mincol = plot_kwargs.get('mincol', None)
        midcol = plot_kwargs.get('midcol', None)
        maxcol = plot_kwargs.get('maxcol', None)
        if ((mincol is not None) and (midcol is not None)
                and (maxcol is not None)):
            cmap_custom = custom_div_cmap(numcolors=custom_div_cmap_arg,
                                          mincol=mincol, midcol=midcol,
                                          maxcol=maxcol)
        else:
            cmap_custom = custom_div_cmap(numcolors=custom_div_cmap_arg)
    highRes = parse_arg_type(plot_kwargs.get('highRes', False), bool)
    img_ext = '.pdf' if highRes else '.png'

    # initialize directories
    MainDataDir = set_path(os.path.join(script_path, '..', 'data'))

    # data input
    input_directory = set_up_kwargs.get('input_directory')
    if ',' in input_directory:
        input_directory = os.path.join(*input_directory.rsplit(','))
    input_directory = os.path.join(MainDataDir, input_directory)

    # sample info input
    sample_info_directory = set_up_kwargs.get('sample_info_directory')
    if ',' in sample_info_directory:
        sample_info_directory = os.path.join(
            *sample_info_directory.rsplit(','))
    sample_info_directory = os.path.join(MainDataDir, sample_info_directory)

    # gene info input
    gene_info_directory = set_up_kwargs.get('gene_info_directory')
    if gene_info_directory is None:
        gene_info_directory = input_directory
    else:
        if ',' in gene_info_directory:
            gene_info_directory = os.path.join(
                *gene_info_directory.rsplit(','))
        gene_info_directory = os.path.join(MainDataDir, gene_info_directory)

    # data output
    output_directory = set_up_kwargs.get('output_directory')
    if output_directory is None:
        output_directory = set_directory(
            os.path.join(input_directory, reportName))
    else:
        if ',' in output_directory:
            output_directory = os.path.join(*output_directory.rsplit(','))
        output_directory = set_directory(
            os.path.join(MainDataDir, output_directory, reportName))

    # save the set_up_kwargs in the output dir for reproducibility
    fname = 'set_up_kwargs.json'
    f = os.path.join(output_directory, fname)
    if toPrint:
        logger.info('-save set_up_kwargs dictionary for reproducibility in: '
                    + f)
    with open(f, 'w') as fp:
        json.dump(set_up_kwargs, fp, indent=4)

    #########################################
    # load input_data
    fpath = os.path.join(input_directory, input_fname)
    data = pd.read_csv(fpath, sep='\t', header=0, index_col=0)
    data = data.fillna(0)

    # load info table of samples
    if toPrint:
        logger.info('Load info table of samples')
    fpath = os.path.join(sample_info_directory, sample_info_fname)
    info_table = load_clinical(fpath, **sample_info_read_csv_kwargs)

    # load gene info (optional: without it the genes are not re-ordered)
    fpath = os.path.join(gene_info_directory, gene_info_fname)
    # bound before the try so that a failed load leaves it as None instead
    # of an unbound name that is read further down
    genes_positions_table = None
    try:
        genes_positions_table = pd.read_csv(fpath, sep='\t', header=0,
                                            index_col=0)
        # get gene chrom position
        xlabels, xpos = get_chr_ticks(genes_positions_table, data,
                                      id_col=gene_id_col, chr_col=chr_col)
    except Exception:
        logger.warning('could not get genes position info')
        xlabels, xpos = None, None

    #########################################
    # CHECK if there are empty genes and remove them
    # NOTE(review): the NaNs were already replaced by 0 right after loading,
    # so neither of the two emptiness checks below can find anything.
    is_empty = (data.isnull()).all(axis=0)
    if is_empty.any():
        genes2remove = data.columns[is_empty]
        data.drop(genes2remove, axis=1, inplace=True)
        if toPrint:
            logger.info('remove the following genes because ' +
                        'they have no values in the table: ' +
                        str(genes2remove))

    # CHECK if there are empty patients BUT don't remove them
    empty_pat = data.sum(axis=1).isnull()
    if empty_pat.any():
        logger.info('Patients with missing values in all genes: ' +
                    str(data.index[empty_pat]))

    # SELECT specific genes (optional)
    if len(select_genes_list) > 0:
        # first take intersection of with data
        select_genes_list = list(
            set(data.columns.values).intersection(set(select_genes_list)))
        # then keep only these genes from in the data
        data = data.loc[:, select_genes_list].copy()
        if genes_positions_table is not None:
            xlabels, xpos = get_chr_ticks(genes_positions_table, data,
                                          id_col=gene_id_col,
                                          chr_col=chr_col)

    # MAP values with a dictionary (optional)
    if map_values is not None:
        if map_values_dict is not None:
            _diff_set = set(np.unique(data.values.flatten().astype(int)))\
                .difference({0})\
                .difference(map_values_dict)
            if _diff_set:
                logger.warning(
                    "the user\'s dict to replace data values is incomplete "
                    + "the following values in the data are not accounted for "
                    + "and will remain the same:\n" + str(_diff_set))
            logger.info("replacing data values with user\'s dictionary:\n" +
                        str(map_values_dict))
            data.replace(map_values_dict, inplace=True)
        elif map_values in ['bin', 'binary', 'binarize']:
            # there is no dictionary in this branch, so do not append it
            logger.info("binarizing data values")
            binarize(data, copy=False)
        else:
            logger.warning(
                "invalid map_values argument, no action will be taken: \n" +
                str(map_values))

    # SELECT sample groups (optional)
    temp = info_table.index.name
    if change_id:
        info_table = (
            info_table
            # delete NaNs in the data id
            .dropna(subset=[old_data_sample_id])
            # do not lose original index
            .reset_index()
            # same index as data
            .set_index(old_data_sample_id, drop=False)
            # keep same as data ids
            .loc[data.index]
            # return back to original index
            .set_index(temp)
        )
    else:
        info_table = info_table.loc[data.index].copy()
        info_table.index.name = temp

    ids_tmp = choose_samples(info_table.reset_index(),
                             info_table.index.name,
                             choose_from=select_samples_from,
                             choose_what=select_samples_which,
                             sortby=select_samples_sort_by,
                             ascending=False)
    info_table = info_table.loc[ids_tmp, :].copy()
    if change_id:
        old_index_sorted = info_table[old_data_sample_id].values.copy()
        data = data.loc[old_index_sorted, :].copy()
        new_ids = info_table.index.values
        # reindexing by the new ids gave NaNs, so the index is overwritten
        data.index = new_ids
        # in case there are patients with no data
        data.dropna(axis=0, inplace=True)
        info_table = info_table.loc[data.index, :].copy()
    else:
        data = data.loc[ids_tmp, :].copy()

    pat_labels = info_table[select_samples_sort_by].copy()
    try:
        pat_labels_txt = pat_labels.astype(int).reset_index().values
    except (ValueError, TypeError):
        # labels that cannot be cast to int are shown as they are
        pat_labels_txt = pat_labels.reset_index().values
    pat_labels_title = str(pat_labels.reset_index().columns.values)

    # PLOT heatmap without gene ordering
    if toPrint:
        logger.info('Plot heatmap before gene ordering')
    _figure_x_size, _figure_y_size, _show_gene_names, _ = \
        set_heatmap_size(data)
    plt.figure(figsize=(_figure_x_size, _figure_y_size))
    ax = sns.heatmap(data, vmin=vmin, vmax=vmax,
                     xticklabels=_show_gene_names,
                     yticklabels=pat_labels_txt,
                     cmap=cmap_custom, cbar=False)
    ax.set_ylabel(pat_labels_title)
    plt.xticks(rotation=90)
    cbar = ax.figure.colorbar(ax.collections[0])
    set_cbar_ticks(cbar, function_dict, custom_div_cmap_arg)
    if saveReport:
        if toPrint:
            logger.info('Save heatmap')
        plt.savefig(os.path.join(output_directory, 'Fig_heatmap' + img_ext),
                    transparent=True, bbox_inches='tight',
                    pad_inches=0.1, frameon=False)
        plt.close("all")
    else:
        plt.show()

    #########################################
    if (xlabels is not None) and (xpos is not None):
        # ORDER genes
        if toPrint:
            logger.info('Order data according to genomic position')

        # extract the gene relative order
        gene_order = genes_positions_table.set_index(
            gene_id_col).loc[:, 'order'].copy()
        # keep only gene_order with data; .loc needs a list because set
        # indexers are not supported by pandas >= 2.0
        ids_tmp = list(set(gene_order.index.values).intersection(
            set(data.columns.values)))
        # keep only the order of these genes
        gene_order = gene_order.loc[ids_tmp].copy()
        gene_order = gene_order.sort_values()
        # then keep only these genes from the data
        data = data.loc[:, gene_order.index].copy()

        # PLOT heatmap after gene ordering
        if toPrint:
            logger.info('Plot heatmap after gene ordering')
        _figure_x_size, _figure_y_size, _show_gene_names, _ = \
            set_heatmap_size(data)
        plt.figure(figsize=(_figure_x_size, _figure_y_size))
        ax = sns.heatmap(data, vmin=vmin, vmax=vmax,
                         xticklabels=_show_gene_names,
                         yticklabels=pat_labels_txt,
                         cmap=cmap_custom, cbar=False)
        ax.set_xticks(xpos)
        ax.set_xticklabels(xlabels, rotation=90)
        ax.set_ylabel(pat_labels_title)
        cbar = ax.figure.colorbar(ax.collections[0])
        set_cbar_ticks(cbar, function_dict, custom_div_cmap_arg)
        if saveReport:
            if toPrint:
                logger.info('Save heatmap')
            plt.savefig(os.path.join(output_directory,
                                     'Fig_heatmap_ordered' + img_ext),
                        transparent=True, bbox_inches='tight',
                        pad_inches=0.1, frameon=False)
            plt.close("all")
        else:
            plt.show()

    #########################################
    # SAVE filtered data
    if saveReport:
        # save files
        fname = 'data_processed.csv'
        f = os.path.join(output_directory, fname)
        if toPrint:
            logger.info('-save ordered data: ' + f)
        data.to_csv(f, sep='\t', header=True, index=True)
def remove_duplicate_genes(**set_up_kwargs):
    """Remove duplicated gene columns from the data and plot the result.

    Every option is read from ``set_up_kwargs``; the dictionary itself is
    dumped to ``set_up_kwargs.json`` in the output directory. Directory and
    file-name options may be given as comma-separated strings, which are
    joined into a path with ``os.path.join``.

    Steps: load the sample info, the data (NaNs replaced by 0) and the gene
    position table; select and order the samples with ``choose_samples``;
    drop the genes whose column is all zeros; call
    ``remove_andSave_duplicates``; then, for both the selected data and the
    de-duplicated data, plot the value distribution (only when
    ``select_samples_which`` is None), a heatmap and the pairwise sample
    correlations.

    Keyword Args:
        select_samples_from, select_samples_which, select_samples_sort_by:
            forwarded to ``choose_samples``.
        select_samples_title: label used in file names and figure titles.
        saveReport, toPrint: flags parsed with ``parse_arg_type``.
        reportName: sub-directory name of the output directory.
        txt_label: label used in figure titles.
        input_fname, gene_info_fname, sample_info_fname: input file names.
        sample_info_read_csv_kwargs: forwarded to ``load_clinical``.
        chr_col, gene_id_col: column names of the gene-position table.
        plot_kwargs: ``function_dict``, ``cmap_custom``, ``vmin``, ``vmax``,
            ``mincol``, ``midcol``, ``maxcol``, ``highRes``.
        input_directory, sample_info_directory, gene_info_directory,
        output_directory: locations relative to the main data directory.
        compute_pdist: forwarded to ``remove_andSave_duplicates``; forced
            to True when the pairwise-distance file does not exist yet in
            the input directory.
    """
    # chose sample set from data
    select_samples_from = set_up_kwargs.get('select_samples_from', None)
    select_samples_which = parse_arg_type(
        set_up_kwargs.get('select_samples_which', None), int)
    select_samples_sort_by = set_up_kwargs.get('select_samples_sort_by',
                                               None)
    if select_samples_sort_by is not None:
        select_samples_sort_by = select_samples_sort_by.rsplit(',')
    select_samples_title = set_up_kwargs.get('select_samples_title',
                                             'select_all')

    # initialize script params
    saveReport = parse_arg_type(set_up_kwargs.get('saveReport', False), bool)
    toPrint = parse_arg_type(set_up_kwargs.get('toPrint', False), bool)
    reportName = set_up_kwargs.get('reportName', script_fname)
    txt_label = set_up_kwargs.get('txt_label', 'test_txt_label')
    input_fname = set_up_kwargs.get('input_fname', 'data_processed.csv')
    gene_info_fname = set_up_kwargs.get('gene_info_fname', None)
    chr_col = set_up_kwargs.get('chr_col', 'chr_int')
    gene_id_col = set_up_kwargs.get('gene_id_col', 'gene')
    sample_info_fname = set_up_kwargs.get('sample_info_fname', None)
    if ',' in sample_info_fname:
        sample_info_fname = os.path.join(*sample_info_fname.rsplit(','))
    sample_info_read_csv_kwargs = set_up_kwargs.get(
        'sample_info_read_csv_kwargs', {})

    # plotting params
    plot_kwargs = set_up_kwargs.get('plot_kwargs', {})
    function_dict = plot_kwargs.get('function_dict', None)
    cmap_custom = plot_kwargs.get('cmap_custom', None)
    vmin = parse_arg_type(plot_kwargs.get('vmin', None), int)
    vmax = parse_arg_type(plot_kwargs.get('vmax', None), int)
    # defined up front so that the set_cbar_ticks() call below does not fail
    # on an unbound name when the colormap is not built here.
    # NOTE(review): confirm that set_cbar_ticks() accepts None.
    custom_div_cmap_arg = None
    if (cmap_custom is None) and (vmin is not None) and (vmax is not None):
        custom_div_cmap_arg = abs(vmin)+abs(vmax)
        if (vmin <= 0) and (vmax >= 0):
            custom_div_cmap_arg = custom_div_cmap_arg + 1
        mincol = plot_kwargs.get('mincol', None)
        midcol = plot_kwargs.get('midcol', None)
        maxcol = plot_kwargs.get('maxcol', None)
        if ((mincol is not None) and (midcol is not None)
                and (maxcol is not None)):
            cmap_custom = custom_div_cmap(
                numcolors=custom_div_cmap_arg,
                mincol=mincol, midcol=midcol, maxcol=maxcol)
        else:
            cmap_custom = custom_div_cmap(numcolors=custom_div_cmap_arg)
    highRes = parse_arg_type(plot_kwargs.get('highRes', False), bool)
    img_ext = '.pdf' if highRes else '.png'

    # initialize directories
    MainDataDir = set_path(os.path.join(script_path, '..', 'data'))

    # data input
    input_directory = set_up_kwargs.get('input_directory')
    if ',' in input_directory:
        input_directory = os.path.join(*input_directory.rsplit(','))
    input_directory = os.path.join(MainDataDir, input_directory)

    # sample info input
    sample_info_directory = set_up_kwargs.get('sample_info_directory')
    if ',' in sample_info_directory:
        sample_info_directory = os.path.join(
            *sample_info_directory.rsplit(','))
    sample_info_directory = os.path.join(MainDataDir, sample_info_directory)

    # gene info input
    gene_info_directory = set_up_kwargs.get('gene_info_directory')
    if gene_info_directory is None:
        gene_info_directory = input_directory
    else:
        if ',' in gene_info_directory:
            gene_info_directory = os.path.join(
                *gene_info_directory.rsplit(','))
        gene_info_directory = os.path.join(
            MainDataDir, gene_info_directory)

    # data output
    output_directory = set_up_kwargs.get('output_directory')
    if output_directory is None:
        output_directory = set_directory(
            os.path.join(input_directory, reportName))
    else:
        if ',' in output_directory:
            output_directory = os.path.join(*output_directory.rsplit(','))
        output_directory = set_directory(
            os.path.join(MainDataDir, output_directory, reportName))

    # save the set_up_kwargs in the output dir for reproducibility
    fname = 'set_up_kwargs.json'
    f = os.path.join(output_directory, fname)
    if toPrint:
        logger.info(
            '-save set_up_kwargs dictionary for reproducibility in: '+f)
    with open(f, 'w') as fp:
        json.dump(set_up_kwargs, fp, indent=4)

    # pairwise distances params
    compute_pdist = parse_arg_type(
        set_up_kwargs.get('compute_pdist', False), bool)
    pdist_fname = 'data_'+select_samples_title+'__genes_pdist.h5'
    pdist_fpath = os.path.join(input_directory, pdist_fname)
    if not os.path.exists(pdist_fpath):
        compute_pdist = True

    # load info table of samples
    if toPrint:
        logger.info('Load info table of samples')
    fpath = os.path.join(sample_info_directory, sample_info_fname)
    info_table = load_clinical(fpath, **sample_info_read_csv_kwargs)

    # load input_data
    fpath = os.path.join(input_directory, input_fname)
    input_data = pd.read_csv(fpath, sep='\t', header=0, index_col=0)
    empty_pat = input_data.sum(axis=1).isnull()
    if empty_pat.any():
        logger.info('Patients with missing values in all genes: ' +
                    str(input_data.index[empty_pat]))
    input_data = input_data.fillna(0)

    # keep only info_table with input_data; .loc needs a list because set
    # indexers are not supported by pandas >= 2.0
    ids_tmp = list(set(info_table.index.values).intersection(
        set(input_data.index.values)))
    info_table = info_table.loc[ids_tmp].copy()

    # load gene info
    fpath = os.path.join(gene_info_directory, gene_info_fname)
    genes_positions_table = pd.read_csv(fpath, sep='\t', header=0,
                                        index_col=0)
    # get gene chrom position
    xlabels, xpos = get_chr_ticks(genes_positions_table, input_data,
                                  id_col=gene_id_col, chr_col=chr_col)

    logger.info('select_samples_from: '+str(select_samples_from) +
                ', select_samples_which: '+str(select_samples_which) +
                ', select_samples_sort_by: '+str(select_samples_sort_by) +
                ', select_samples_title: '+str(select_samples_title))

    # keep only info_table with data
    temp = info_table.index.name
    info_table = info_table.loc[input_data.index].copy()
    info_table.index.name = temp
    ids_tmp = choose_samples(info_table.reset_index(),
                             info_table.index.name,
                             choose_from=select_samples_from,
                             choose_what=select_samples_which,
                             sortby=select_samples_sort_by,
                             ascending=False)
    # keep a subpart of the info_table (rows and columns)
    info_table = info_table.loc[ids_tmp, select_samples_sort_by].copy()
    # keep only these samples from the data
    data = input_data.loc[ids_tmp, :].copy()

    try:
        pat_labels_txt = info_table.astype(int).reset_index().values
    except (ValueError, TypeError):
        # labels that cannot be cast to int are shown as they are
        pat_labels_txt = info_table.reset_index().values
    pat_labels_title = str(info_table.reset_index().columns.values)

    # remove all zero columns!
    orphancols = np.where(abs(data).sum(axis=0) == 0)[0]
    if len(orphancols) > 0:
        logger.warning(
            'removing '+str(len(orphancols)) +
            ' genes from data with zero columns!')
        cols2drop = data.columns.values[orphancols]
        data = data.drop(cols2drop, axis=1).copy()

    # REMOVE DUPLICATES!!!!
    uniqdata, _, _, _ = remove_andSave_duplicates(
        data, to_compute_euclidean_distances=compute_pdist,
        to_save_euclidean_distances=saveReport,
        to_save_output=saveReport,
        output_filename=input_fname.rsplit('.')[0]+'__'+select_samples_title,
        output_directory=output_directory)

    # get gene chrom position
    xlabels_uniq, xpos_uniq = get_chr_ticks(
        genes_positions_table, uniqdata,
        id_col=gene_id_col, chr_col=chr_col)

    # per-iteration settings: index 0 is the selected data, index 1 is the
    # data without duplicated genes
    fext = ['', '_uniq']
    title_ext = ['', ' (uniq genes)']
    xlabels_choose = [xlabels, xlabels_uniq]
    xpos_choose = [xpos, xpos_uniq]
    for i_data, choose_data in enumerate([data, uniqdata]):
        if select_samples_which is None:
            # distplot DO NOT break Y-axis
            logger.info('Plotting distplot..')
            sns.distplot(choose_data.values.flatten(),
                         hist=True, kde=False, color='b')
            # only the de-duplicated data is labelled as "uniq genes"
            plt.title("Copy number abundance in "+txt_label +
                      title_ext[i_data])
            if saveReport:
                logger.info('Save distplot')
                plt.savefig(os.path.join(
                    output_directory, 'Fig_distplot_' +
                    select_samples_title+fext[i_data]+img_ext),
                    transparent=True, bbox_inches='tight',
                    pad_inches=0.1, frameon=False)
                plt.close("all")
            else:
                plt.show()

            # distplot break Y-axis
            _, uniq_count = np.unique(choose_data.values.flatten(),
                                      return_counts=True)
            if len(uniq_count) < 2:
                # the axis limits come from the two largest counts, so a
                # single distinct value leaves nothing to crop
                logger.warning('skipping break Y-axis distplot: ' +
                               'the data has a single distinct value')
            else:
                logger.info('Plotting break Y-axis distplot..')
                sorted_count = np.sort(uniq_count)
                ymax_bottom = int(math.ceil(
                    sorted_count[-2] / 1000.0)) * 1000
                ymax_top = int(math.ceil(
                    sorted_count[-1] / 10000.0)) * 10000
                distplot_breakYaxis(
                    choose_data.values, ymax_bottom, ymax_top,
                    color='b', d=0.005, pad=1.5, figsize=(10, 6),
                    mytitle='Copy number abundance in '+txt_label +
                    ' with cropped y axis'+title_ext[i_data])
                if saveReport:
                    logger.info('Save distplot')
                    plt.savefig(os.path.join(
                        output_directory, 'Fig_distplot_breakYaxis_' +
                        select_samples_title+fext[i_data]+img_ext),
                        transparent=True, bbox_inches='tight',
                        pad_inches=0.1, frameon=False)
                    plt.close("all")
                else:
                    plt.show()

        # Plot heatmap (sized for the table that is actually plotted)
        _figure_x_size, _figure_y_size, _show_gene_names, _ = \
            set_heatmap_size(choose_data)
        plt.figure(figsize=(_figure_x_size, _figure_y_size))
        ax = sns.heatmap(choose_data, vmin=vmin, vmax=vmax,
                         yticklabels=pat_labels_txt, xticklabels=False,
                         cmap=cmap_custom, cbar=False)
        _has_chr_ticks = ((xpos_choose[i_data] is not None) and
                          (xlabels_choose[i_data] is not None))
        if _show_gene_names and not _has_chr_ticks:
            plt.xticks(rotation=90)
        elif _has_chr_ticks:
            plt.xticks(xpos_choose[i_data], xlabels_choose[i_data],
                       rotation=0)
        plt.xlabel('chromosomes (the number is aligned at the end ' +
                   'of the chr region)')
        plt.ylabel('samples '+select_samples_title+'\n'+pat_labels_title)
        cbar = ax.figure.colorbar(ax.collections[0])
        set_cbar_ticks(cbar, function_dict, custom_div_cmap_arg)
        plt.title(
            txt_label+'\nheatmap of ' +
            select_samples_title+' samples')
        if saveReport:
            logger.info('Save heatmap')
            plt.savefig(os.path.join(
                output_directory, 'Fig_heatmap_'+select_samples_title +
                fext[i_data]+img_ext),
                transparent=True, bbox_inches='tight',
                pad_inches=0.1, frameon=False)
            plt.close("all")
        else:
            plt.show()

        # Plot pairwise sample correlations
        # (pdist returns the correlation distance, i.e. 1 - correlation)
        data_cor = 1-squareform(pdist(choose_data, 'correlation'))
        plt.figure(figsize=(15, 10))
        sns.heatmap(data_cor, vmin=-1, vmax=1,
                    yticklabels=pat_labels_txt,
                    xticklabels=pat_labels_txt,
                    cmap='PiYG', square=True)
        plt.xlabel("samples "+select_samples_title)
        plt.ylabel(pat_labels_title)
        plt.title("Auto-corerelation of "+select_samples_title +
                  " samples - "+txt_label)
        if saveReport:
            logger.info('Save heatmap')
            plt.savefig(os.path.join(
                output_directory, 'Fig_corr_'+select_samples_title +
                fext[i_data]+img_ext),
                transparent=True, bbox_inches='tight',
                pad_inches=0.1, frameon=False)
            plt.close("all")
        else:
            plt.show()