def gen_dcna(exports_config: Config.Config, study_config: Config.Config, verb):
    """Generate the discrete CNA (dCNA) data file from the continuous CNA file.

    Requires the continuous (log2) CNA file to have been generated already.
    Each numeric value is collapsed to a discrete call via ``collapse``,
    which reads the module-level ``thresholds`` set here.

    :param exports_config: config for the DISCRETE copy-number export
        (provides ``thresholds`` and the export type name)
    :param study_config: study-level config (provides the output folder)
    :param verb: verbosity flag passed through to ``helper.working_on``
    """
    helper.working_on(verb, message='Gathering files ...')
    # Input: continuous copy-number matrix produced by the log2CNA step.
    l_o_file = os.path.join(
        study_config.config_map['output_folder'],
        'data_{}.txt'.format(
            constants.config2name_map['CONTINUOUS_COPY_NUMBER']))
    # Output: discrete copy-number matrix for this export type.
    c_o_file = os.path.join(
        study_config.config_map['output_folder'],
        'data_{}.txt'.format(
            constants.config2name_map[exports_config.type_config]))

    # ``collapse`` reads this module-level variable, so it must stay global.
    global thresholds
    thresholds = [
        float(x) for x in exports_config.config_map['thresholds'].split(',')
    ]

    if os.path.exists(l_o_file):
        helper.working_on(verb, message='Generating dCNA (CNA)...')
        data = pd.read_csv(l_o_file, sep='\t')
        # Skip the first column (gene identifier); collapse every sample column.
        cols = data.columns.values.tolist()[1:]
        # Column-wise Series.apply gave an astonishing 5500x improvement
        # compared to traversal over the table as a 2D array.
        for c in cols:
            # Pass ``collapse`` directly; wrapping it in a lambda added
            # nothing but per-element call overhead.
            data[c] = data[c].apply(collapse)
        data.to_csv(c_o_file, sep='\t', index=None)
    else:
        print(
            'ERROR:: Cannot generate dCNA file because log2CNA file does not exist ...'
        )
        print(
            'ERROR:: Either remove the DISCRETE data config file, or add a CONTINUOUS data config file '
        )
        helper.stars()
        helper.stars()
        exit(1)
def gen_log2cna(exports_config: Config.Config, study_config: Config.Config, janus_path, verb):
    """Generate the continuous (log2) CNA data file via the seg2gene R script.

    :param exports_config: config providing ``bed_file`` and the export type
    :param study_config: study-level config (provides the output folder)
    :param janus_path: unused  # TODO remove; replace verb with logger
    :param verb: verbosity flag passed through to ``helper.working_on``
    :raises ValueError: if the R script exits with a non-zero return code
    :raises FileNotFoundError: if the R script cannot be located
    """
    helper.working_on(verb, message='Gathering files ...')
    seg_file = os.path.join(
        study_config.config_map['output_folder'],
        'data_{}.txt'.format(constants.config2name_map['SEG']))
    bed_file = exports_config.config_map['bed_file']
    l_o_file = os.path.join(
        study_config.config_map['output_folder'],
        'data_{}.txt'.format(
            constants.config2name_map[exports_config.type_config]))

    helper.working_on(verb, message='Generating log2CNA...')
    executable = 'Rscript'
    # BUGFIX: os.dirname does not exist; use os.path.dirname.
    r_script_path = os.path.join(os.path.dirname(__file__), R_SCRIPT_DIRNAME,
                                 'seg2gene.r')
    if os.path.exists(r_script_path):
        # BUGFIX: pass the command as an argument list. The previous
        # ', '.join(...) built a single comma-separated string, which
        # subprocess.call (without shell=True) cannot execute.
        cmd = [executable, r_script_path, seg_file, bed_file, l_o_file]
        cmd_str = ' '.join(cmd)
        logger.debug('Running R script command: ' + cmd_str)
        rc = subprocess.call(cmd)
        if rc != 0:
            msg = "Non-zero exit code %i from R script command '%s'" % (rc,
                                                                        cmd_str)
            raise ValueError(msg)
    else:
        raise FileNotFoundError(
            'Cannot find R script path {}'.format(r_script_path))
def main():
    """Entry point: generate the discrete CNA data file for the study."""
    global meta_config
    global study_config
    global janus_path
    global verb

    helper.working_on(verb, message='Generating CNA files ...')
    discrete_copy_number_data.gen_dcna(meta_config, study_config, verb)
    # Signal completion of this step.
    helper.working_on(verb)
def main():
    """Entry point: generate the continuous (log2) CNA data file."""
    global meta_config
    global study_config
    global janus_path
    global verb

    helper.working_on(verb, message='Generating log2CNA files ...')
    continuous_copy_number_data.gen_log2cna(meta_config, study_config,
                                            janus_path, verb)
    # Signal completion of this step.
    helper.working_on(verb)
def get_sample_ids(exports_config: Config.Config, verb) -> pd.Series:
    """Return the de-duplicated 'ID' column from the first importable file."""
    input_path = os.path.join(exports_config.config_map['input_folder'],
                              exports_config.data_frame['FILE_NAME'][0])
    data = pd.read_csv(input_path, sep='\t', usecols=['ID'])
    helper.working_on(verb,
                      message='Parsing importable {} file ...'.format(
                          exports_config.type_config))
    # Keep only the first occurrence of each sample ID.
    return data['ID'].drop_duplicates(keep='first', inplace=False)
def fix_hmmcopy_tsv(exports_config: Config.Config, study_config: Config.Config, verb):
    """Rewrite HMMCopy seg files into the standard SEG header/column layout.

    For each input file, keeps only rows whose chromosome appears in the
    configured .bed file, renames the sample column to the configured
    SAMPLE_ID, and writes the result into a temporary 'seg' folder, which
    then replaces the export's input folder.

    :param exports_config: export config (provides input folder, file list,
        sample IDs and ``bed_file``)
    :param study_config: study-level config (provides the output folder)
    :param verb: verbosity flag passed through to ``helper.working_on``
    :raises ValueError: if any of the spawned shell pipelines fails
    """
    calls = []
    output_folder = study_config.config_map['output_folder']
    input_folder = exports_config.config_map['input_folder']
    export_data = exports_config.data_frame
    seg_temp = helper.get_temp_folder(output_folder, 'seg')

    # Collect the chromosome names used in the .bed file; accept both the
    # bare name and the 'chr'-prefixed form, anchored by tabs.
    bed_filter = subprocess.check_output([
        'awk "NR>1" {} | '
        'awk -F"\\t" \'{{print $1}}\' | '
        'uniq'.format(exports_config.config_map['bed_file'])
    ], shell=True).decode("utf-8")
    bed_filter = bed_filter.strip().split('\n')
    bed_filter = bed_filter + ['chr' + a for a in bed_filter]
    bed_filter = ['\\t' + a + '\\t' for a in bed_filter]

    header = 'ID\\tchrom\\tloc.start\\tloc.end\\tnum.mark\\tseg.mean'
    for i in range(len(export_data)):
        input_file = os.path.join(input_folder, export_data['FILE_NAME'][i])
        output_file = os.path.join(seg_temp, export_data['FILE_NAME'][i])
        sample_id = export_data['SAMPLE_ID'][i]
        helper.working_on(
            verb, 'Refactoring cols: {}'.format(export_data['FILE_NAME'][i]))
        output_temp = output_file + '.temp'
        # Get all the genes in the .bed; save each line with a matching gene;
        # rename the Sample_ID.
        # TODO get rid of this ugly & fragile bash script, rewrite using Python
        # See comments by LEH in earlier commit
        columns = '1'  # placeholder for num.mark columns
        # BUGFIX: the format arguments were (sample_id, output_temp, columns),
        # which wrote the temp-file path into the num.mark column and
        # redirected output to a file literally named '1'. Also, brace
        # doubling ('{{'/'}}') in the string pieces that never went through
        # .format() left unbalanced braces in the awk programs.
        cmd = 'echo "{}" > {}; '.format(header, output_temp) +\
              'cat {} | '.format(input_file) +\
              'awk \'BEGIN{{split("{}",t); '.format('|'.join(bed_filter)) +\
              'for (i in t) vals[t[i]]} ($2 in vals)\' | ' +\
              'awk -F"\\t" \'{ OFS="\\t"; ' +\
              'print "{}", $2, $3, $4, {}, $5}}\' >> {}; '.format(sample_id, columns, output_temp) +\
              'mv {} {}'.format(output_temp, output_file)
        calls.append(subprocess.Popen(cmd, shell=True))

    # Downstream steps must read the fixed files from the temp folder.
    exports_config.config_map['input_folder'] = seg_temp
    exit_codes = [p.wait() for p in calls]
    if any(exit_codes):
        raise ValueError(
            'ERROR:: Something went wrong when parsing HMMCopy format file? '
            'Please resolve the issue')
    if verb:
        print(exit_codes)
def verify_final_seg_file(exports_config: Config.Config, verb):
    """Assert that the final SEG file contains the minimum required header.

    Prints the missing column names and exits with status 1 when any
    required column is absent.

    :param exports_config: export config providing the input folder and
        the file name to verify
    :param verb: verbosity flag passed through to ``helper.working_on``
    """
    seg_path = os.path.join(exports_config.config_map['input_folder'],
                            exports_config.data_frame['FILE_NAME'][0])
    # BUGFIX: the file was opened with mode 'w', which truncated it and made
    # readline() return ''. Open read-only, and close the handle via 'with'.
    with open(seg_path, 'r') as seg:
        header = seg.readline().strip().split('\t')
    minimum_header = [
        'ID', 'chrom', 'loc.start', 'loc.end', 'num.mark', 'seg.mean'
    ]
    helper.working_on(verb, message='Asserting minimum header is in SEG file.')
    if not all([a in header for a in minimum_header]):
        # Show only the names that are missing (present ones print as '').
        print([a if a not in header else '' for a in minimum_header])
        print(
            'Missing headers from SEG file have been printed above, please ensure the data is not missing.'
        )
        exit(1)
def verify_final_discrete_file(exports_config: Config.Config, verb):
    """Assert that the final discrete data file has at least one gene column.

    Prints the missing column names and exits with status 1 when neither
    'Entrez_Gene_Id' nor 'Hugo_Symbol' is present in the header.

    :param exports_config: export config providing the input folder, the
        file name to verify and the export type name
    :param verb: verbosity flag passed through to ``helper.working_on``
    """
    data_path = os.path.join(exports_config.config_map['input_folder'],
                             exports_config.data_frame['FILE_NAME'][0])
    t_config = exports_config.type_config
    # BUGFIX: the file was opened with mode 'w', which truncated it and made
    # readline() return ''. Open read-only, and close the handle via 'with'.
    with open(data_path, 'r') as data:
        header = data.readline().strip().split('\t')
    minimum_header = ['Entrez_Gene_Id', 'Hugo_Symbol']
    helper.working_on(
        verb,
        message='Asserting minimum header is in {} file.'.format(t_config))
    # any(): at least one of the two gene-identifier columns must exist.
    if not any([a in header for a in minimum_header]):
        print([a if a not in header else '' for a in minimum_header])
        print(
            'Missing header(s) from {} file have been printed above, ensure data isn\'t missing.'
            .format(t_config))
        exit(1)
def generate_expression_zscore(meta_config: Config.Config, input_file, outputPath, gepcomp, tcga, verb):
    """Compute per-gene (row-wise) z-scores of an FPKM matrix and write them out.

    :param meta_config: config whose data_frame lists the study SAMPLE_IDs
        (used to restrict columns for gepcomp/TCGA runs)
    :param input_file: path to the tab-separated FPKM matrix with a
        'Hugo_Symbol' column; the file is deleted afterwards for
        gepcomp/TCGA runs
    :param outputPath: folder the z-score file is written into
    :param gepcomp: True when scoring the comparison (gepcomp) matrix
    :param tcga: True when scoring the TCGA matrix
    :param verb: verbosity flag passed through to ``helper.working_on``
    """
    # Z-Scores written by Dr. L Heisler
    helper.working_on(verb, message='Reading FPKM Matrix ...')
    try:
        raw_data = pd.read_csv(input_file, sep='\t')
    except FileNotFoundError:
        print('{} wrong file or file path'.format(input_file))
        raise
    helper.working_on(verb, message='Processing FPKM Matrix ...')
    # Per-gene mean and standard deviation across samples (axis=1);
    # transpose so subtraction/division broadcast per row, then transpose back.
    raw_scores = raw_data.drop(['Hugo_Symbol'], axis=1)
    means = raw_scores.mean(axis=1)
    sds = raw_scores.std(axis=1)
    z_scores = ((raw_scores.transpose() - means) / sds).transpose()
    # Genes with zero variance divide by 0 and become NaN -> force to 0.
    z_scores = z_scores.fillna(0)
    z_scores_data = z_scores.round(decimals=4)
    z_scores_data = pd.concat([raw_data['Hugo_Symbol'], z_scores_data], axis=1)
    helper.working_on(verb, message='Writing FPKM Z-Scores Matrix ...')
    # Reformat the columns for comparison and TCGA data to keep only the
    # columns used in the study mRNA expression continuous data
    if gepcomp or tcga:
        study_columns = []
        for k in range(meta_config.data_frame.shape[0]):
            study_columns.append(meta_config.data_frame['SAMPLE_ID'][k])
        study_columns.insert(0, 'Hugo_Symbol')
        z_scores_data = z_scores_data[study_columns]
        # Create the supplementary_data directory if it doesn't exist
        if not os.path.exists(os.path.join(outputPath, 'supplementary_data')):
            os.makedirs(os.path.join(outputPath, 'supplementary_data'), exist_ok=True)
    if gepcomp:
        # Output comparison Z scores
        output_file_z_scores = os.path.join(outputPath, 'supplementary_data', 'data_{}_comparison.txt'.format(config2name_map[meta_config.alterationtype + ":" + 'Z-SCORE']))
        z_scores_data.to_csv(output_file_z_scores, sep="\t", index=False)
        # Delete all gepcomp files that are not in the supplementary folder
        os.remove(input_file)
    elif tcga:
        # Output TCGA Z scores
        output_file_z_scores = os.path.join(outputPath, 'supplementary_data', 'data_{}_tcga.txt'.format(config2name_map[meta_config.alterationtype + ":" + 'Z-SCORE']))
        z_scores_data.to_csv(output_file_z_scores, sep="\t", index=False)
        # Delete all TCGA files that are not in the supplementary folder
        os.remove(input_file)
    else:
        # Output study Z scores (written directly into the output folder)
        output_file_z_scores = os.path.join(outputPath, 'data_{}.txt'.format(config2name_map[meta_config.alterationtype + ":" + 'Z-SCORE']))
        z_scores_data.to_csv(output_file_z_scores, sep="\t", index=False)
def main():
    """Entry point: normalize HMMCopy SEG files and concatenate them."""
    global meta_config
    global study_config
    global janus_path
    global verb

    # Stage the raw inputs in a temporary folder.
    helper.working_on(
        verb,
        message='Gathering and decompressing SEG files into temporary folder')
    helper.decompress_to_temp(meta_config, study_config, verb)
    helper.working_on(verb)

    helper.working_on(
        verb,
        message=
        'Fixing HMMCopy formatting, chromosome, and chromosome max-length ...')
    fix_hmmcopy_tsv(meta_config, study_config, verb)
    fix_chrom(meta_config, study_config, verb)
    # fix_hmmcopy_max_chrom fixes the maximum chromosome length AND imputes
    # the num.mark value.
    fix_hmmcopy_max_chrom(meta_config, study_config, janus_path, verb)
    helper.working_on(verb)

    helper.working_on(verb, message='Fixing .SEG IDs')
    fix_seg_id(meta_config, study_config, verb)
    helper.working_on(verb)

    helper.working_on(verb, message='Concating SEG Files to export folder')
    helper.concat_files(meta_config, study_config, verb)
    helper.working_on(verb)
def main():
    """Entry point: build the expression matrix and (optionally) z-scores."""
    global meta_config
    global study_config
    global janus_path
    global verb

    helper.working_on(
        verb,
        message=
        'Gathering and decompressing MRNA_EXPRESSION files into temporary folder'
    )
    helper.decompress_to_temp(meta_config, study_config, verb)
    helper.working_on(verb)

    helper.working_on(verb, message='Alpha sorting each file ...')
    alpha_sort(meta_config, verb)
    helper.working_on(verb)

    helper.working_on(verb, message='Generating expression matrix ...')
    generate_expression_matrix(meta_config, study_config, verb)
    helper.working_on(verb)

    # A missing 'zscores' key defaults to '', so the comparison is simply
    # False (same effect as the key-membership short-circuit it replaces).
    if meta_config.config_map.get('zscores', '').lower() == 'true':
        helper.working_on(verb,
                          message='Generating expression Z-Score Meta ...')
        meta.generate_meta_type(meta_config, study_config, verb)
        helper.working_on(verb)

        helper.working_on(verb,
                          message='Generating expression Z-Score Data ...')
        generate_expression_zscore(meta_config, study_config, verb)
        helper.working_on(verb)
def main():
    """Run the CAP_expression pipeline: relocate inputs, build the expression
    matrix, pre-process continuous data, optionally generate z-score and
    percentile data, then write the metadata files."""
    global meta_config
    global study_config
    global janus_path
    global logger
    # imports are moved into the main (and only) method to work with the legacy component class
    import logging
    import os
    from support import helper
    from generate import meta
    from generate.analysis_pipelines.MRNA_EXPRESSION.support_functions import alpha_sort, generate_expression_matrix, generate_expression_percentile, generate_expression_zscore, preProcRNA
    from constants.constants import config2name_map
    from utilities.constants import DATA_DIRNAME
    verb = logger.isEnabledFor(logging.INFO)  # TODO replace the 'verb' switch with logger
    # Fall back to the bundled data files when the config does not supply
    # 'genelist' / 'enscon' paths.
    if meta_config.config_map.get('genelist'):
        genelist = meta_config.config_map.get('genelist')
    else:
        genelist = os.path.join(os.path.dirname(__file__), DATA_DIRNAME, 'targeted_genelist.txt')
    if meta_config.config_map.get('enscon'):
        enscon = meta_config.config_map.get('enscon')
    else:
        enscon = os.path.join(os.path.dirname(__file__), DATA_DIRNAME, 'ensemble_conversion.txt')
    logger.info('Started processing data for CAP_expression pipeline')
    logger.info('Decompressing MRNA_EXPRESSION files to temporary folder')
    meta_config = helper.relocate_inputs(meta_config, study_config, verb)
    logger.info('Alpha sorting each file ...')
    alpha_sort(meta_config, verb)
    logger.info('Generating expression matrix ...')
    generate_expression_matrix(meta_config, study_config, verb)
    # preProcRNA - generate processed continuous data using the generated expression matrix - one for study and one for study comparison and one for TCGA data
    preProcRNA(meta_config, study_config, '/data_{}_gepcomp.txt'.format(config2name_map[meta_config.alterationtype + ":" + meta_config.datahandler]), enscon, genelist, True, False)
    preProcRNA(meta_config, study_config, '/data_{}.txt'.format(config2name_map[meta_config.alterationtype + ":" + meta_config.datahandler]), enscon, genelist, False, True)
    if meta_config.config_map.get('zscores'):
        # Generate the z-scores for mRNA expression data
        logger.info('Generating expression Z-Score Data ...')
        generate_expression_zscore(meta_config, os.path.join(study_config.config_map['output_folder'], 'data_{}.txt'.format(config2name_map[meta_config.alterationtype + ":" + meta_config.datahandler])), study_config.config_map['output_folder'], False, False, verb)
        # Generate the mRNA expression percentile data
        logger.info('Generating expression Percentile Data ...')
        generate_expression_percentile(meta_config, os.path.join(study_config.config_map['output_folder'], 'data_{}.txt'.format(config2name_map[meta_config.alterationtype + ":" + 'Z-SCORE'])), study_config.config_map['output_folder'], False, False, verb)
        # Generate the z-scores for mRNA expression comparison data
        logger.info('Generating expression Z-Score comparison Data ...')
        generate_expression_zscore(meta_config, os.path.join(study_config.config_map['output_folder'], 'data_{}_gepcomp.txt'.format(config2name_map[meta_config.alterationtype + ":" + meta_config.datahandler])), study_config.config_map['output_folder'], True, False, verb)
        # Generate the mRNA expression comparison percentile data
        # NOTE(review): this passes the study z-score file ('data_..._zscores'),
        # not a gepcomp-specific one, while flagging gepcomp=True — confirm
        # this is the intended input.
        logger.info('Generating expression Percentile comparison Data ...')
        generate_expression_percentile(meta_config, os.path.join(study_config.config_map['output_folder'], 'data_{}.txt'.format(config2name_map[meta_config.alterationtype + ":" + 'Z-SCORE'])), study_config.config_map['output_folder'], True, False, verb)
        # Generate the z-scores for mRNA expression TCGA data
        # NOTE(review): helper.working_on is used here while every sibling
        # step uses logger.info — likely a leftover from the verb->logger
        # migration.
        helper.working_on(verb, message='Generating expression TCGA Z-Score Data ...')
        generate_expression_zscore(meta_config, os.path.join(study_config.config_map['output_folder'], 'data_{}_tcga.txt'.format(config2name_map[meta_config.alterationtype + ":" + meta_config.datahandler])), study_config.config_map['output_folder'], False, True, verb)
        # Generate the TCGA mRNA expression percentile data
        logger.info('Generating expression TCGA Percentile Data ...')
        generate_expression_percentile(meta_config, os.path.join(study_config.config_map['output_folder'], 'supplementary_data', 'data_{}_tcga.txt'.format(config2name_map[meta_config.alterationtype + ":" + 'Z-SCORE'])), study_config.config_map['output_folder'], False, True, verb)
    # Generate meta data within the handler and not in generator.py
    # Generate metadata for mRNA expression continuous data
    logger.info('Generating expression Meta ...')
    meta.generate_meta_type(meta_config, study_config, logger)
    # Generate metadata for mRNA expression z-score data
    if meta_config.config_map.get('zscores'):
        logger.info('Generating expression Z-Score Meta ...')
        # Switch the handler so the meta generator names the z-score files.
        meta_config.datahandler = 'Z-SCORE'
        meta.generate_meta_type(meta_config, study_config, logger)
    logger.info('Finished processing data for CAP_expression pipeline')
def generate_expression_matrix(exports_config: Config.Config, study_config: Config.Config, verb):
    """Build the continuous mRNA expression (FPKM) matrix data file.

    Reads one FPKM column per study sample, merges them on Hugo_Symbol and
    writes ``data_<name>.txt`` to the study output folder. When a 'gepfile'
    comparison list is configured and exists, additionally appends those
    samples and writes ``data_<name>_gepcomp.txt``.

    :param exports_config: export config (input folder, file list,
        sample/patient IDs, optional 'gepfile')
    :param study_config: study-level config (provides the output folder)
    :param verb: verbosity flag passed through to ``helper.working_on``
    :raises ImportError: if there are zero expression files to import
    """

    def _read_fpkm(path, sample_name):
        # One per-sample frame: gene_id renamed to Hugo_Symbol, FPKM renamed
        # to the sample name; duplicate genes keep the last occurrence.
        return (pd.read_csv(path, sep='\t', usecols=['gene_id', 'FPKM'])
                .rename(columns={'FPKM': sample_name,
                                 'gene_id': 'Hugo_Symbol'})
                .drop_duplicates(subset='Hugo_Symbol', keep='last',
                                 inplace=False))

    def _merge_fpkm(frames, how):
        # Merge all per-sample frames on Hugo_Symbol; missing values -> 0.
        if len(frames) == 0:
            raise ImportError('Attempting to import zero expression data, please remove expression data from study.')
        merged = frames[0]
        for frame in frames[1:]:
            merged = pd.merge(merged, frame, how=how, on='Hugo_Symbol')
            merged.drop_duplicates(subset='Hugo_Symbol', keep='last',
                                   inplace=True)
        merged.replace(np.nan, 0, inplace=True)
        return merged

    # Output for data_expression_continuous_expression.txt data file
    output_file = os.path.join(
        study_config.config_map['output_folder'],
        'data_{}.txt'.format(config2name_map[exports_config.alterationtype + ":" + exports_config.datahandler]))

    helper.working_on(verb, message='Reading FPKM data ...')
    info = []
    for i in range(exports_config.data_frame.shape[0]):
        info.append(_read_fpkm(
            os.path.join(exports_config.config_map['input_folder'],
                         exports_config.data_frame['FILE_NAME'][i]),
            exports_config.data_frame['SAMPLE_ID'][i]))

    helper.working_on(verb, message='Merging all FPKM data ...')
    # Outer merge: keep the union of genes across all study samples.
    result = _merge_fpkm(info, how='outer')

    helper.working_on(verb, message='Writing all FPKM data ...')
    result.to_csv(output_file, sep='\t', index=None)

    # Append the gepcomp datafiles (if any)
    gep_file = exports_config.config_map.get('gepfile')
    # Idiom fix: compare against None with 'is not', not '!='.
    if gep_file is not None and os.path.exists(gep_file):
        geplist = pd.read_csv(gep_file, sep=',')
        geplist.columns = ['patient_id', 'file_name']
        # Filter out the patient ID's that have already been included in the study
        indices = []
        for i in range(exports_config.data_frame.shape[0]):
            for a, elem in enumerate(geplist.patient_id.tolist()):
                if exports_config.data_frame['PATIENT_ID'][i] in elem:
                    indices.append(a)
        geplist = geplist.drop(indices)
        for index, row in geplist.iterrows():
            info.append(_read_fpkm(row.file_name, row.patient_id))

        helper.working_on(verb, message='Merging all FPKM data ...')
        # Left merge: comparison samples are restricted to the study's genes.
        result = _merge_fpkm(info, how='left')

        helper.working_on(verb, message='Writing all FPKM data ...')
        # Output the gepcomp data
        output_file_comp = os.path.join(
            study_config.config_map['output_folder'],
            'data_{}_gepcomp.txt'.format(config2name_map[exports_config.alterationtype + ":" + exports_config.datahandler]))
        result.to_csv(output_file_comp, sep='\t', index=None)
def main():
    """Entry point: normalize plain SEG files and concatenate them."""
    global meta_config
    global study_config
    global janus_path
    global verb

    # Stage the raw inputs in a temporary folder.
    helper.working_on(
        verb,
        message='Gathering and decompressing SEG files into temporary folder')
    helper.decompress_to_temp(meta_config, study_config, verb)
    helper.working_on(verb)

    helper.working_on(verb, message='Fixing Chromosome numbering ...')
    fix_chrom(meta_config, study_config, verb)
    helper.working_on(verb)

    helper.working_on(verb, message='Fixing .SEG IDs')
    fix_seg_id(meta_config, study_config, verb)
    helper.working_on(verb)

    helper.working_on(verb, message='Concating SEG Files to export folder')
    helper.concat_files(meta_config, study_config, verb)
    helper.working_on(verb)