def main():
    """Run ``multiprocess_zscores`` over every clinical file using a pool.

    ``get_options()`` supplies the data directory, the clinical directory and
    the output directory. Each clinical file is paired with the data file
    whose ``util.get_cancer_type`` value equals the clinical file name up to
    its first ``'.'``.

    Raises:
        KeyError: if a clinical file has no data file with a matching
            cancer type.
    """
    basedir, clinical_dir, outdir = get_options()

    data_files = util.remove_extraneous_files(os.listdir(basedir))
    data_files_by_cancer_type = {
        util.get_cancer_type(f): f for f in data_files
    }

    clinical_files = util.remove_extraneous_files(os.listdir(clinical_dir))

    inputs = []
    for clinical in clinical_files:
        cancer_type = clinical.split('.')[0]
        data_file = data_files_by_cancer_type[cancer_type]
        inputs.append((os.path.join(basedir, data_file),
                       os.path.join(clinical_dir, clinical),
                       outdir))

    p = Pool(10)
    try:
        p.map(multiprocess_zscores, inputs)
    finally:
        # Always release the worker processes, even when a task raises.
        p.close()
        p.join()
def main(): clinical_dir, output_dir, header_file = get_options() headers = pd.read_csv(header_file, index_col=0, header=None) clinical_files = os.listdir(clinical_dir) clinical_files = util.remove_extraneous_files(clinical_files) zscore_data = {} for f in clinical_files: clinical_path = os.path.join(clinical_dir, f) cancer_type = util.get_cancer_type(f) purity_header = headers.get_value(cancer_type, 1) clinical = util.get_clinical_data(clinical_path, extra_rows=[purity_header]) cox_dict = analysis.do_cox(clinical.time, clinical.censor, clinical[purity_header]) zscore_data[cancer_type] = cox_dict purity_mean = clinical[purity_header].mean() print purity_mean clinical['km_split'] = np.where(clinical[purity_header] <= purity_mean, 0, 1) analysis.do_km(cancer_type, clinical.time, clinical.censor, clinical.km_split, output_dir) out_df = pd.DataFrame(zscore_data).transpose() out_df.to_csv(os.path.join(output_dir, 'simple_purity_zscores.csv'))
def main(): indir, outdir = get_options() directories = os.listdir(indir) print indir, outdir directories = util.remove_extraneous_files(directories) directories.remove('output') for d in directories[2:]: print d cna_glob = os.path.join(indir, d, '*.cnv.*') print cna_glob cna_file = glob.glob(cna_glob)[0] cna = pd.read_csv(cna_file, index_col=0, sep=util.get_sep_from_filename(cna_file)) clinical_glob = os.path.join(indir, d, '*clinical.*') clinical_file = glob.glob(clinical_glob)[0] clinical = pd.read_csv(clinical_file, sep=util.get_sep_from_filename(clinical_file), index_col=0) clinical = clinical[['Time', 'Censor']] mut_glob = os.path.join(indir, d, '*mutations*') mut_file = glob.glob(mut_glob)[0] mut = pd.read_csv(mut_file, sep=util.get_sep_from_filename(mut_file), low_memory=False) mutations = prep_mutations(d, mut, clinical) do_single_cancer_type_cna(d, clinical, cna, outdir) do_single_cancer_type_mutation(d, clinical, mutations, outdir)
def main(argv=None): mutation_dir, clinical_dir, outdir = get_options() clinical_files = os.listdir(clinical_dir) clinical_files = util.remove_extraneous_files(clinical_files) clinical_files = [os.path.join(clinical_dir, f) for f in clinical_files] p = Pool(1) args = [] pancan = {} for clinical in clinical_files: cancer_type = util.get_cancer_type(clinical) print cancer_type mutation = glob.glob( os.path.join(mutation_dir, '*' + cancer_type + '*'))[0] clinical_data = util.get_clinical_data(clinical) #args.append((mutation, clinical_data, outdir)) pancan[cancer_type] = calculate_cox(mutation, clinical_data, outdir) #print args #p.map(multiprocess_zscores, args) pancan_df = pd.DataFrame(pancan) pancan_df = pancan_df.transpose() pancan_df.to_csv(os.path.join(outdir, 'pancan.csv'))
def main(argv=None): cn_change_size_dir, clinical_dir, outdir = get_options() input_files = os.listdir(cn_change_size_dir) input_files = util.remove_extraneous_files(input_files) input_files = [os.path.join(cn_change_size_dir, i) for i in input_files] zscore_inputs = [] results = [] for input_file in input_files: cancer_type = os.path.split(input_file)[1].split('_')[0] gene = os.path.split(input_file)[1].split('_')[1].split('.')[0] print cancer_type, gene # zscore_inputs.append([input_file, cancer_type, gene]) results.append(multiprocess_zscores([input_file, cancer_type, gene])) #p = Pool(4) #results = p.map(multiprocess_zscores, zscore_inputs) with open(os.path.join(outdir, 'cox_any_change_results.csv'), 'w') as out: formatstr = '{},{},{},{}\n' out.write('Cancer Type,Gene,Z Score,Count\n') for cox_dict in results: cancer_type_gene = cox_dict.keys()[0] print cancer_type_gene print cox_dict[cancer_type_gene] d = cox_dict[cancer_type_gene] out.write( formatstr.format( cancer_type_gene.split('_')[0], cancer_type_gene.split('_')[1], d['z'], d['any_change_count']))
def main(): clinical_dir, output_dir, extra_data_dir = get_options() clinical_files = os.listdir(clinical_dir) clinical_files = util.remove_extraneous_files(clinical_files) zscore_data = {} for f in clinical_files: clinical_path = os.path.join(clinical_dir, f) cancer_type = util.get_cancer_type(f) if cancer_type == 'COADREAD': extra_data = prep_extra_data(extra_data_dir, 'COAD') else: extra_data = prep_extra_data(extra_data_dir, cancer_type) clinical = util.get_clinical_data(clinical_path) clinical = clinical.join(extra_data) purity_header = 'Purity_InfiniumPurify' cox_dict = analysis.do_cox(clinical.time, clinical.censor, clinical[purity_header]) zscore_data[cancer_type] = cox_dict purity_mean = clinical[purity_header].mean() print purity_mean clinical['km_split'] = np.where(clinical[purity_header] <= purity_mean, 0, 1) analysis.do_km(cancer_type, clinical.time, clinical.censor, clinical.km_split, output_dir) out_df = pd.DataFrame(zscore_data).transpose() out_df.to_csv( os.path.join(output_dir, 'add_data_simple_purity_zscores.csv'))
def main(argv=None): mutation_dir, clinical_dir, outdir, univariate_output = get_options() clinical_files = os.listdir(clinical_dir) clinical_files = util.remove_extraneous_files(clinical_files) clinical_files = [os.path.join(clinical_dir, f) for f in clinical_files] p = Pool(16) args = [] for clinical in clinical_files: cancer_type = util.get_cancer_type(clinical) print cancer_type mutation = glob.glob(os.path.join(mutation_dir, '*' + cancer_type + '*'))[0] univariate_file = None if univariate_output: univariate_file = glob.glob(os.path.join(univariate_output, cancer_type, cancer_type + '.zscores.out.csv'))[0] print univariate_file clinical_data = util.get_clinical_data(clinical) cancer_type_outdir = os.path.join(outdir, cancer_type) if not os.path.isdir(cancer_type_outdir): os.makedirs(cancer_type_outdir) args.append((mutation, clinical_data, cancer_type_outdir, univariate_file)) # calculate_cox(mutation, clinical_data, cancer_type_outdir, univariate_file=univariate_file) print args p.map(multiprocess_zscores, args)
def main(argv=None): mutation_dir, clinical_dir, outdir, tumor_stage_dir = get_options() clinical_files = os.listdir(clinical_dir) clinical_files = util.remove_extraneous_files(clinical_files) clinical_files = [os.path.join(clinical_dir, f) for f in clinical_files] p = Pool(16) args = [] for clinical in clinical_files: cancer_type = util.get_cancer_type(clinical) print cancer_type mutation = glob.glob( os.path.join(mutation_dir, '*' + cancer_type + '*'))[0] tumor_stage = os.path.join(tumor_stage_dir, cancer_type + '_clinical.csv') if not os.path.isfile(tumor_stage): continue clinical_data = util.get_clinical_data(clinical) cancer_type_outdir = os.path.join(outdir, cancer_type) if not os.path.isdir(cancer_type_outdir): os.makedirs(cancer_type_outdir) args.append((mutation, clinical_data, tumor_stage, cancer_type_outdir)) # calculate_cox(mutation, clinical_data, tumor_stage, cancer_type_outdir) p.map(multiprocess_zscores, args)
def main(): indir, outdir, split_files = get_options() files = os.listdir(os.path.join(indir, 'cnas')) files = util.remove_extraneous_files(files) criteria_list = [stouffer_sig_and_two_zscore_sig] for criteria in criteria_list: if not split_files: outfile_c = open(os.path.join(outdir, criteria.__name__ + '.out.csv'), 'w') outfile_m = outfile_c for f in files: cancer_type = f.split('_')[0] if split_files: outfile_c = os.path.join(outdir, cancer_type + '_CNA_.criteria_met.out.csv') outfile_m = os.path.join(outdir, cancer_type + '_MUT_.criteria_met.out.csv') cna_cancer_type_criteria_met = apply_criteria(os.path.join(indir, 'cnas', f), criteria, 'cna') cna_cancer_type_criteria_met.index = 'cna_' + cna_cancer_type_criteria_met.index cna_cancer_type_criteria_met.to_csv(outfile_c, index_label='CNA_'+cancer_type) mut_cancer_type_criteria_met = apply_criteria(os.path.join(indir, 'mutations', f), criteria, 'mut') print f, mut_cancer_type_criteria_met.index mut_cancer_type_criteria_met.index = 'mut_' + mut_cancer_type_criteria_met.index mut_cancer_type_criteria_met.to_csv(outfile_m, index_label='MUT_'+cancer_type) if not split_files: outfile.close()
def main(): copy_number_loc, clinical, outdir = get_options() cnas = os.listdir(copy_number_loc) cnas = util.remove_extraneous_files(cnas) results = pd.DataFrame() for c in cnas: cancer_type = util.get_cancer_type(c) print cancer_type clinical_file = glob.glob( os.path.join(clinical, '*' + cancer_type + '*.txt'))[0] clin = util.get_clinical_data(clinical_file) patient_breaks = count_breaks(os.path.join(copy_number_loc, c)) patient_breaks = patient_breaks.reset_index() patient_breaks = util.maybe_clear_non_01s(patient_breaks, 'Sample', cancer_type) patient_breaks = util.add_identifier_column(patient_breaks, 'Sample') patient_breaks = patient_breaks.set_index('identifier') patient_breaks = patient_breaks.drop('Sample', axis=1) breaks_and_clin = patient_breaks.join(clin, how='inner') breaks_and_clin.to_csv( os.path.join(outdir, cancer_type + '_breaks.csv')) cox = analysis.do_cox(breaks_and_clin.time, breaks_and_clin.censor, breaks_and_clin.breaks) cox['cancer_type'] = cancer_type results = results.append(cox, ignore_index=True) results.to_csv(os.path.join(outdir, 'cox_results.csv'))
def main(argv=None): cnv_dir, mutation_dir, clinical_dir, gene_list, outdir = get_options() gene_list = pd.read_csv(gene_list, header=None) gene_list = '\'' + gene_list[0] cnv_files = os.listdir(cnv_dir) cnv_files = util.remove_extraneous_files(cnv_files) cnv_files = [os.path.join(cnv_dir, i) for i in cnv_files] mutation_results = pd.DataFrame() cnv_results = pd.DataFrame() for cnv in cnv_files: cancer_type = util.get_cancer_type(cnv) print cancer_type mutation = glob.glob(os.path.join(mutation_dir, cancer_type + '*'))[0] clinical = glob.glob( os.path.join(clinical_dir, '*' + cancer_type + '*'))[0] cnv_results = cnv_results.append( make_cnv_zscores(cnv, clinical, gene_list)) mutation_results = mutation_results.append( make_mutation_zscores(mutation, clinical, gene_list)) mutation_results.to_csv(os.path.join( outdir, 'mutation_zscores_w_hazards_fig1.csv'), index=False) cnv_results.to_csv(os.path.join(outdir, 'cnv_zscores_w_hazards_fig1.csv'), index=False)
def main(argv=None): cnv_dir, rna, mutation_dir, clinical_dir, outdir, input_file = get_options() cnv_files = os.listdir(cnv_dir) cnv_files = util.remove_extraneous_files(cnv_files) cnv_files = [os.path.join(cnv_dir, i) for i in cnv_files] interesting_genes = pd.read_csv(input_file, comment='#') interesting_genes['Gene'] = '\'' + interesting_genes['Gene'] zscore_inputs = [] corr_results = [] for cnv in cnv_files: cancer_type = util.get_cancer_type(cnv) cancer_type_genes = interesting_genes[interesting_genes['Cancer Type'] == cancer_type] if len(cancer_type_genes) == 0: continue print cancer_type print cancer_type_genes rnaseq = glob.glob(os.path.join(rna, cancer_type + '*'))[0] mutation = glob.glob(os.path.join(mutation_dir, cancer_type + '*'))[0] clinical = glob.glob(os.path.join(clinical_dir, '*' + cancer_type + '*'))[0] zscore_inputs.append([cnv, rnaseq, mutation, clinical, outdir, cancer_type_genes]) # corr_results.append(multiprocess_data([cnv, rnaseq, mutation, clinical, outdir, cancer_type_genes])) p = Pool(4) corr_results = p.map(multiprocess_data, zscore_inputs) df = pd.concat(corr_results, verify_integrity=True, axis=1) print df df.to_csv(os.path.join(outdir, 'corr_results.csv'))
def all_cancer_types(copy_number_dir, annotation_file, outdir, parallel_workers=0):
    """Convert every copy-number file in a directory to ``<type>.cnv.csv``.

    The output name is the input file name up to its first ``'.'`` plus
    ``.cnv.csv``, placed in ``outdir``.

    Args:
        copy_number_dir: Directory holding the copy-number input files.
        annotation_file: Path passed to ``process_annotation_file``.
        outdir: Directory that receives the output files.
        parallel_workers: ``0`` processes the files serially in this
            process; a positive value maps ``multiprocess`` over a pool of
            that many workers.
    """
    copy_number_files = util.remove_extraneous_files(
        os.listdir(copy_number_dir))

    # returns a dataframe indexed by gene name, with chr number and txstart
    annotation_data = process_annotation_file(annotation_file)

    args = []
    for c in copy_number_files:
        infile = os.path.join(copy_number_dir, c)
        type_name = os.path.basename(infile).split('.')[0]
        outfile = os.path.join(outdir, type_name + '.cnv.csv')
        if parallel_workers == 0:
            # returns a dict of patient_ids => lists of interval trees
            # containing range data for each chromosome
            patient_data = process_input_file(infile)
            process_and_write_data(outfile, annotation_data, patient_data)
        else:
            args.append((infile, outfile, annotation_data))

    # NOTE(review): a negative parallel_workers queues tasks but never runs
    # them (neither branch applies) -- confirm whether that should be an
    # error.
    if parallel_workers > 0:
        p = multiprocessing.Pool(parallel_workers)
        try:
            p.map(multiprocess, args)
        finally:
            # Callers may invoke this repeatedly; do not leak worker
            # processes.
            p.close()
            p.join()
def main():
    """Call ``count_tumor_groups`` for every clinical file.

    ``get_options()`` supplies the clinical directory and the tumor-groups
    directory. Each clinical file is paired with ``<cancer type>.csv`` in
    the tumor-groups directory.
    """
    clinical_dir, tumor_groups = get_options()
    for name in util.remove_extraneous_files(os.listdir(clinical_dir)):
        cancer_type = util.get_cancer_type(name)
        clinical_path = os.path.join(clinical_dir, name)
        group_path = os.path.join(tumor_groups, cancer_type + '.csv')
        count_tumor_groups(clinical_path, group_path)
def main():
    """Run ``make_zscores`` per clinical file, passing the hypermutated patients.

    ``get_options()`` supplies the data directory, the clinical directory,
    the hypermutated-patients file and the output directory. The patients
    file is read as a header-less CSV with a single ``patients`` column.
    Each clinical file is paired with the data file whose
    ``util.get_cancer_type`` value equals the clinical file name up to its
    first ``'.'``.

    Raises:
        KeyError: if a clinical file has no data file with a matching
            cancer type.
    """
    basedir, clinical_dir, hypermutated_patients, outdir = get_options()
    hypermutated = pd.read_csv(hypermutated_patients, header=None,
                               names=['patients'])

    data_names = util.remove_extraneous_files(os.listdir(basedir))
    data_files_by_cancer_type = {}
    for name in data_names:
        data_files_by_cancer_type[util.get_cancer_type(name)] = name

    clinical_names = util.remove_extraneous_files(os.listdir(clinical_dir))
    for clinical in clinical_names:
        cancer_type = clinical.split('.')[0]
        data_path = os.path.join(basedir,
                                 data_files_by_cancer_type[cancer_type])
        clinical_path = os.path.join(clinical_dir, clinical)
        make_zscores(data_path, clinical_path, hypermutated, outdir)
def main(argv=None):
    """Run ``multiprocess_zscores`` for every CNV file on a 4-worker pool.

    ``get_options()`` supplies the CNV, mutation and clinical directories
    and the output directory. Each CNV file is paired with the first
    mutation file whose name starts with the cancer type and the first
    clinical file whose name contains it.

    Args:
        argv: Unused.

    Raises:
        IndexError: if no mutation or clinical file matches a cancer type.
    """
    cnv_dir, mutation_dir, clinical_dir, outdir = get_options()

    cnv_files = util.remove_extraneous_files(os.listdir(cnv_dir))
    cnv_files = [os.path.join(cnv_dir, i) for i in cnv_files]

    zscore_inputs = []
    for cnv in cnv_files:
        cancer_type = util.get_cancer_type(cnv)
        mutation = glob.glob(os.path.join(mutation_dir, cancer_type + '*'))[0]
        clinical = glob.glob(
            os.path.join(clinical_dir, '*' + cancer_type + '*'))[0]
        zscore_inputs.append([cnv, mutation, clinical, outdir])

    p = Pool(4)
    try:
        p.map(multiprocess_zscores, zscore_inputs)
    finally:
        # Always release the worker processes, even when a task raises.
        p.close()
        p.join()
def main(): indir, clinical_dir, outdir = get_options() files = os.listdir(indir) files = util.remove_extraneous_files(files) for f in files: cancer_type = get_cbioportal_cancer_type(f) print cancer_type clinical_file = os.path.join(clinical_dir, cancer_type + '_clinical.csv') cancer_type_outdir = os.path.join(outdir, cancer_type) if not os.path.isdir(cancer_type_outdir): os.makedirs(cancer_type_outdir) clinical = get_cbioportal_clinical(clinical_file) calculate_zscores_for_file(os.path.join(indir, f), clinical, cancer_type_outdir, cancer_type)
def main(argv=None): if argv is None: argv = sys.argv input_directory, clinical_directory, outdir = get_options() cnv_files = os.listdir(input_directory) cnv_files = util.remove_extraneous_files(cnv_files) for cnv in cnv_files: cancer_type = get_cbioportal_cancer_type(cnv) print cancer_type clinical_file = glob.glob(os.path.join(clinical_directory, '*' + cancer_type + '*'))[0] outglob = glob.glob(os.path.join(outdir, cancer_type + '*')) if len(outglob) == 0: print cancer_type make_zscores(os.path.join(input_directory, cnv), clinical_file, outdir)
def main(argv=None): indir, clinical_dir, outdir = get_options() files = os.listdir(indir) files = util.remove_extraneous_files(files) for copy_number in files: cancer_type = get_icgc_cancer_type(copy_number) print cancer_type clinical_file = os.path.join(clinical_dir, cancer_type + '.csv') relevant_clinical = pd.read_csv(clinical_file, index_col=0, low_memory=False)[['Time', 'Censor' ]].astype(float) make_zscores(os.path.join(indir, copy_number), relevant_clinical, outdir)
def main(argv=None): cnv_dir, mutation_dir, clinical_dir, outdir, input_file = get_options() cnv_files = os.listdir(cnv_dir) cnv_files = util.remove_extraneous_files(cnv_files) cnv_files = [os.path.join(cnv_dir, i) for i in cnv_files] interesting_genes = pd.read_csv(input_file, comment='#') print interesting_genes interesting_genes['Gene'] = '\'' + interesting_genes['Gene'] zscore_inputs = [] for cnv in cnv_files: cancer_type = util.get_cancer_type(cnv) cancer_type_genes = interesting_genes[interesting_genes['Cancer Type'] == cancer_type] if len(cancer_type_genes) == 0: continue print cancer_type print cancer_type_genes mutation = glob.glob(os.path.join(mutation_dir, cancer_type + '*'))[0] clinical = glob.glob( os.path.join(clinical_dir, '*' + cancer_type + '*'))[0] zscore_inputs.append( [cnv, mutation, clinical, outdir, cancer_type_genes]) #multiprocess_zscores([cnv, mutation, clinical, outdir, cancer_type_genes]) p = Pool(4) results = p.map(multiprocess_zscores, zscore_inputs) print results with open(os.path.join(outdir, 'cox_results.csv'), 'w') as out: formatstr = '{},{},{},{},{},{},{},{}\n' out.write( 'Cancer Type,Gene,CNA Z Score, CNA P value, Mutation Z score, Mutation P Value, Mutation Count, n\n' ) for coxs in results: cancer_type = coxs.keys()[0] print cancer_type for gene, cox_dict in coxs[cancer_type].iteritems(): print gene, cox_dict out.write( formatstr.format(cancer_type, gene, cox_dict['var-z'], cox_dict['var-p'], cox_dict[gene + '_mutations-z'], cox_dict[gene + '_mutations-p'], cox_dict['mutation_count'], cox_dict['var-n']))
def main(): indir, outdir = get_options() clinical_files = os.listdir(indir) clinical_files = util.remove_extraneous_files(clinical_files) stage_row = 'patient.stage_event.pathologic_stage' for clinical_f in clinical_files: f = os.path.join(indir, clinical_f) cancer_type = util.get_cancer_type(clinical_f) stage_row = tumor_stage_util.TUMOR_STAGE[cancer_type] if stage_row: clinical = util.get_clinical_data(f, extra_rows=[stage_row], extra_rows_numeric=False) clinical[stage_row] = clinical[stage_row].str.strip() print cancer_type print clinical[stage_row].value_counts()
def main(argv=None):
    """Run ``multiprocess_copy_number_changes`` for cancer types with genes of interest.

    ``get_options()`` supplies the CNV directory, the clinical directory,
    the output directory and an input file of interesting genes. The gene
    file is read as CSV (lines starting with ``#`` are comments) and every
    ``Gene`` entry is prefixed with a single quote. CNV files whose cancer
    type has no row in that file are skipped; the rest are processed
    serially.

    Args:
        argv: Unused.

    Raises:
        IndexError: if no clinical file name starts with the cancer type.
    """
    cnv_dir, clinical_dir, outdir, input_file = get_options()

    cnv_names = util.remove_extraneous_files(os.listdir(cnv_dir))
    cnv_paths = [os.path.join(cnv_dir, name) for name in cnv_names]

    interesting_genes = pd.read_csv(input_file, comment='#')
    interesting_genes['Gene'] = '\'' + interesting_genes['Gene']

    for cnv in cnv_paths:
        cancer_type = util.get_cancer_type(cnv)
        is_this_type = interesting_genes['Cancer Type'] == cancer_type
        cancer_type_genes = interesting_genes[is_this_type]
        if len(cancer_type_genes) != 0:
            clinical_pattern = os.path.join(clinical_dir, cancer_type + '*')
            clinical = glob.glob(clinical_pattern)[0]
            multiprocess_copy_number_changes(
                [cnv, clinical, outdir, cancer_type_genes])
def count_codons(data, outdir):
    """Combine per-file codon counts into a single ``codon_counts.csv``.

    Every entry of ``data`` except ``HG36_HG37`` is passed to
    ``count_codons_in_file``. The results are concatenated column-wise
    (duplicate column labels are rejected), a ``sum`` column with the row
    totals is added, and the table is written to ``codon_counts.csv``.

    Args:
        data: Directory containing the input files.
        outdir: Currently unused.
            NOTE(review): the CSV is written to the current working
            directory, not to ``outdir`` -- confirm whether that is
            intended.

    Raises:
        ValueError: raised by ``pd.concat`` when there are no input files
            or when results share column labels.
    """
    files = util.remove_extraneous_files(os.listdir(data))
    # Skip the 'HG36_HG37' entry; unlike list.remove() this does not raise
    # ValueError when the entry is absent.
    files = [name for name in files if name != 'HG36_HG37']

    outdata = [count_codons_in_file(os.path.join(data, name))
               for name in files]

    df = pd.concat(outdata, axis=1, verify_integrity=True)
    df['sum'] = df.sum(axis=1)
    df.to_csv('codon_counts.csv', index_label=[
        'Gene', 'Chromosome', 'Start Position', 'Wild Type Allele'
    ])
def main(): clinical_dir, row_names_file, basedir, interesting_genes, outdir = get_options( ) files = os.listdir(clinical_dir) files = util.remove_extraneous_files(files) clinical_by_cancer_type = {util.get_cancer_type(f): f for f in files} row_names = pd.read_csv(row_names_file, header=0) interesting_genes = pd.read_csv(interesting_genes, header=0, index_col=1) for i, row in row_names.iterrows(): cancer_type = row['cancer_type'] cancer_type_fname = cancer_type print cancer_type clinical_file = clinical_by_cancer_type[cancer_type] clinical_file = os.path.join(clinical_dir, clinical_file) if row['histological_subtype_row'] != 'EXTERNAL': clinical_data = make_clinical_data(clinical_file, row['histological_subtype_row'], outdir) else: subtype_data = prep_BRCA_data(row['external_file'], cancer_type) # subtype_data.to_csv(os.path.join(outdir, 'BRCA_annotation_subtype_data.csv')) clinical = util.get_clinical_data(clinical_file) subtype_clinical = clinical.join(subtype_data['subtype'], how='outer') clinical_data = save_subtype_files(subtype_clinical, 'subtype', cancer_type, outdir) cancer_type_fname = 'BRCA_HER2' cna_file = glob.glob(os.path.join(basedir, cancer_type + '*.csv'))[0] cna = pd.read_csv(cna_file, header=0, index_col=0).T genes = '\'' + interesting_genes['Gene'] genes = genes.loc[cancer_type] print genes if type(genes) == str: print cna[[genes]] joined = cna[[genes]].join(clinical_data, how='outer') else: joined = cna[genes].join(clinical_data, how='outer') joined.to_csv(os.path.join(outdir, cancer_type_fname + '.csv'))
def main(): indir, clinical_dir, outdir, hgnc_file = get_options() files = os.listdir(indir) files = util.remove_extraneous_files(files) hgnc = pd.read_csv(hgnc_file, low_memory=False) hgnc = hgnc[['Approved Symbol', 'Ensembl ID(supplied by Ensembl)']] hgnc.columns = ['Symbol', 'Ensembl ID'] hgnc.set_index('Ensembl ID', inplace=True) hgnc['Symbol'] = '\'' + hgnc['Symbol'] for f in files: cancer_type = get_icgc_cancer_type(f) print cancer_type clinical_file = os.path.join(clinical_dir, cancer_type + '.csv') cancer_type_outdir = os.path.join(outdir, cancer_type) if not os.path.isdir(cancer_type_outdir): os.makedirs(cancer_type_outdir) calculate_zscores_for_file(os.path.join(indir, f), clinical_file, cancer_type_outdir, hgnc)
def main(argv=None):
    """Compute copy-number z-scores for every CNV file with its breaks file.

    ``get_options()`` supplies the CNV directory, the structural-breaks
    directory, an optional interesting-genes file and the output directory.
    Each CNV file is paired with the first breaks file whose name contains
    the cancer type. ``make_cn_zscores`` is called for each pair in this
    process, and afterwards ``multiprocess_cn_zscores`` is mapped over the
    same inputs on a 4-worker pool.

    Args:
        argv: Unused.

    Raises:
        IndexError: if no breaks file matches a cancer type.
    """
    cnv_dir, structural_breaks, interesting_genes_file, outdir = get_options()

    cnv_files = util.remove_extraneous_files(os.listdir(cnv_dir))
    cnv_files = [os.path.join(cnv_dir, i) for i in cnv_files]

    interesting_genes = None
    if interesting_genes_file:
        interesting_genes = pd.read_csv(interesting_genes_file)

    zscore_inputs = []
    for cnv in cnv_files:
        cancer_type = util.get_cancer_type(cnv)
        breaks = glob.glob(
            os.path.join(structural_breaks, '*' + cancer_type + '*'))[0]
        zscore_inputs.append([cnv, breaks, interesting_genes, outdir])
        # NOTE(review): the same inputs are also mapped over the pool below,
        # so the work presumably runs twice -- confirm whether this serial
        # call is a debugging leftover.
        make_cn_zscores(cnv, breaks, interesting_genes, outdir)

    p = Pool(4)
    try:
        p.map(multiprocess_cn_zscores, zscore_inputs)
    finally:
        # Always release the worker processes, even when a task raises.
        p.close()
        p.join()
def main(argv=None): if argv is None: argv = sys.argv input_directory, clinical, outdir, extra_data_dir = get_options() clinical_files = os.listdir(clinical) clinical_files = util.remove_extraneous_files(clinical_files) args = [] for c in clinical_files: cancer_type = util.get_cancer_type(c) print cancer_type clinical_data = util.get_clinical_data(os.path.join(clinical, c)) copy_number = glob.glob( os.path.join(input_directory, cancer_type + '*.csv'))[0] args.append((copy_number, clinical_data, extra_data_dir, outdir)) # make_zscores(copy_number, clinical_data, extra_data_dir, outdir) p = Pool(4) p.map(multiprocess_zscores, args)
def main(argv=None): input_directory, clinical_directory, gene_file, outdir = get_options() gene_list = pd.read_csv(gene_file, header=None)[0].values cnv_files = os.listdir(input_directory) cnv_files = util.remove_extraneous_files(cnv_files) cnv_file = [f for f in cnv_files if 'METABRIC' in f] cnv = cnv_file[0] cancer_type = get_cbioportal_cancer_type(cnv) clinical_file = glob.glob( os.path.join(clinical_directory, '*' + cancer_type + '*'))[0] print cancer_type results = make_zscores(os.path.join(input_directory, cnv), clinical_file, gene_list) results_df = pd.DataFrame(results) results_df = results_df.set_index('gene') results_df.to_csv(os.path.join(outdir, 'metabric_copy_number.csv'))
def main(argv=None): if argv is None: argv = sys.argv input_directory, clinical, outdir, extra_clinical_rows_file = get_options() clinical_files = os.listdir(clinical) clinical_files = util.remove_extraneous_files(clinical_files) all_extra_clinical_rows = pd.read_csv(extra_clinical_rows_file, index_col=0, header=None) for c in clinical_files: cancer_type = util.get_cancer_type(c) extra_rows = [all_extra_clinical_rows.loc[cancer_type][1]] print cancer_type clinical_data = util.get_clinical_data(os.path.join(clinical, c), extra_rows=extra_rows) print clinical_data copy_number = glob.glob(os.path.join(input_directory, cancer_type + '*.csv'))[0] print copy_number make_zscores(copy_number, clinical_data, outdir, extra_rows)
def main(): clinical_dir, row_names_file, outdir = get_options() files = os.listdir(clinical_dir) files = util.remove_extraneous_files(files) clinical_by_cancer_type = {util.get_cancer_type(f): f for f in files} row_names = pd.read_csv(row_names_file, header=0) for i, row in row_names.iterrows(): cancer_type = row['cancer_type'] print cancer_type clinical_file = clinical_by_cancer_type[cancer_type] clinical_file = os.path.join(clinical_dir, clinical_file) if row['histological_subtype_row'] != 'EXTERNAL': make_clinical_data(clinical_file, row['histological_subtype_row'], outdir) else: subtype_data = prep_BRCA_data(row['external_file'], cancer_type) subtype_data.to_csv(os.path.join(outdir, 'BRCA_annotation_subtype_data.csv')) clinical = util.get_clinical_data(clinical_file) subtype_clinical = clinical.join(subtype_data['subtype'], how='outer') save_subtype_files(subtype_clinical, 'subtype', cancer_type, outdir)