def load_matrices(self, load_directory, geneFile=None, patientFile=None, minFreq=0): matrices = os.listdir(load_directory) for file in matrices: _, _, newGenes, newPatients, newGeneToCases, newPatientToGenes = mex.load_mutation_data(load_directory + '/' + file, geneFile=geneFile, patientFile=patientFile, minFreq=minFreq) # check for differences in loaded genes/patient if set.difference(set(newGenes), set(self.genes)): print "Error: loaded genes different from original matrix" print "in file ", file, " in matrix directory ", load_directory exit(1) if set.difference(set(newPatients), set(self.patients)): print "Error: loaded genes different from original matrix" print "in file ", file, " in matrix directory ", load_directory exit(1) # load the new ones in for gene in newGeneToCases: self.geneToCases_perm[gene].append(newGeneToCases[gene]) for patient in newPatientToGenes: self.patientToGenes_perm[patient].append(newPatientToGenes[patient]) print "Number of loaded matrices: ", len(matrices) self.num_permutations = len(matrices) return
def main():
    """Compute gene-frequency association p-values for the BRCA cohort and
    write one CSV row per gene (Gene, Pvalue, Frequency, InAvg, OutAvg)."""
    mutationmatrix = '/Users/jlu96/maf/new/BRCA_wustl/BRCA_wustl-som-cna-jl.m2'
    patientFile = '/Users/jlu96/maf/new/BRCA_wustl/shared_patients.plst'
    geneFile = None
    minFreq = 0
    outpatientfile = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LorenzoModel/Heatmaps/BRCA_patientmatrix_log.csv'

    # Load the mutation matrix; only the two mapping dicts are consumed below.
    numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data(
        mutationmatrix, patientFile, geneFile, minFreq)

    # ---- All Gene Frequency Association ----
    genePvalue_file = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LorenzoModel/geneMakeBin/BRCA_genemakebin.csv'
    gene_pvalues = all_gene_freq_association(geneToCases, patientToGenes)

    with open(genePvalue_file, 'w') as out_fh:
        table = csv.writer(out_fh)
        table.writerow(['Gene', 'Pvalue', 'Frequency', 'InAvg', 'OutAvg'])
        table.writerows(gene_pvalues)
def main():
    """Run the all-gene frequency association on the BRCA matrix and dump
    the resulting per-gene p-value entries to a CSV report."""
    mutationmatrix = '/Users/jlu96/maf/new/BRCA_wustl/BRCA_wustl-som-cna-jl.m2'
    patientFile = '/Users/jlu96/maf/new/BRCA_wustl/shared_patients.plst'
    geneFile = None
    minFreq = 0
    outpatientfile = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LorenzoModel/Heatmaps/BRCA_patientmatrix_log.csv'

    loaded = mex.load_mutation_data(mutationmatrix, patientFile, geneFile, minFreq)
    numGenes, numCases, genes, patients, geneToCases, patientToGenes = loaded

    # ---- All Gene Frequency Association ----
    genePvalue_file = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LorenzoModel/geneMakeBin/BRCA_genemakebin.csv'
    association_rows = all_gene_freq_association(geneToCases, patientToGenes)

    header = ['Gene', 'Pvalue', 'Frequency', 'InAvg', 'OutAvg']
    with open(genePvalue_file, 'w') as handle:
        report = csv.writer(handle)
        report.writerow(header)
        for association in association_rows:
            report.writerow(association)
def run(args):
    """Group genes into segment bins and write a new, binned mutation matrix.

    Two binning modes:
      * load_gene_segments: read an existing gene->segment mapping from
        gene_segment_file, dropping genes missing from it (unless
        no_throw_out_extras is set).
      * otherwise: derive bins from co-occurring gene pairs that lie within
        bin_distance_threshold of each other.

    Outputs: a SEGMENTINFO file, gene positions appended to
    gene_positions.txt, an optional per-gene bin-entry table, and the binned
    matrix written to newmutationmatrix.
    """
    # --- Unpack command-line arguments ------------------------------------
    # NOTE(review): mdictfile, cdictfile, mprob, cprob, min_cooccurrence_ratio,
    # top_percentile and top_number are unpacked but never used in this
    # function — confirm they are dead before deleting.
    mdictfile = args.mdictfile
    cdictfile = args.cdictfile
    mprob = args.mprob
    cprob = args.cprob
    cooccur_distance_threshold = args.cooccur_distance_threshold
    bin_distance_threshold = args.bin_distance_threshold
    mutationmatrix = args.mutation_matrix
    newmutationmatrix = args.newmutationmatrix
    file_prefix = args.output_prefix
    # Derived output names default to being based on the new matrix path.
    if not file_prefix:
        file_prefix = newmutationmatrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    gene_blacklist = args.gene_blacklist_file
    patient_blacklist = args.patient_blacklist_file
    minFreq = args.min_freq
    minCooccur = args.min_cooccur
    min_cooccurrence_ratio = args.min_cooccurrence_ratio
    top_percentile = args.top_percentile
    top_number = args.top_number
    parallel_compute_number = args.parallel_compute_number
    filter_cooccur_same_segment = args.filter_cooccur_same_segment
    fcss_cratiothresh = args.fcss_cratiothresh
    fcss_mutfreqdiffthresh = args.fcss_mutfreqdiffthresh
    fcss_mutfreqdiffratiothresh = args.fcss_mutfreqdiffratiothresh
    fcss_coveragethresh = args.fcss_coveragethresh
    fcss_probabilitythresh = args.fcss_probabilitythresh
    gene_segment_file = args.gene_segment_file
    load_gene_segments = args.load_gene_segments
    is_gene2seg = args.is_gene2seg
    gene_bin_entries_file = args.gene_bin_entries_file
    no_throw_out_extras = args.no_throw_out_extras
    segment_info_file = args.segment_info_file
    if not gene_bin_entries_file:
        gene_bin_entries_file = file_prefix + '_binnedgenes.tsv'
    if not segment_info_file:
        segment_info_file = file_prefix + '_SEGMENTINFO.tsv'
    #-----------------------------------------------------

    # Load the matrix and strip blacklisted genes/patients in one pass.
    mutations = mex.remove_blacklists(gene_blacklist, patient_blacklist,
                                      *mex.load_mutation_data(mutationmatrix, patientFile, geneFile, minFreq))
    numGenes, numCases, genes, patients, geneToCases, patientToGenes = mutations
    print 'Filtered Mutation data: %s genes x %s patients' % (numGenes, numCases)

    # Load segment info
    if load_gene_segments:
        # extra_genes is the genes not found in the segment file.
        # If throw_out_extras is False, extra_genes will be empty.
        geneToBin, extra_genes = load_gene_to_bin(gene_segment_file, geneToCases,
                                                  no_throw_out_extras=no_throw_out_extras, is_gene2seg=is_gene2seg)
        numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.remove_extra_genes(
            extra_genes, numGenes, numCases, genes, patients, geneToCases, patientToGenes)
    else:
        print "Beginning bin genes by co-occurring pairs. "
        # Candidate pairs are restricted to genes close enough to share a bin.
        genepairs = getgenepairs(geneToCases, genes, closer_than_distance=bin_distance_threshold)
        print "Pairs retrieved. Calculating cooccurring pairs to make bins."
        # NOTE(review): fcss_cratiothresh is passed twice in this call while
        # fcss_mutfreqdiffthresh is never passed — verify against the
        # signature of met.complete_cooccurpairs.
        cpairsdict, cgenedict = met.complete_cooccurpairs(numCases, geneToCases, patientToGenes, genepairs,
                                                          fcss_probabilitythresh, minCooccur,
                                                          cooccur_distance_threshold, fcss_cratiothresh,
                                                          parallel_compute_number, filter_cooccur_same_segment,
                                                          fcss_cratiothresh, fcss_mutfreqdiffratiothresh,
                                                          fcss_coveragethresh, fcss_probabilitythresh)
        print "Cooccurring pairs calculated."
        geneToBin = get_gene_bins_cooccur_same_segment(cpairsdict, geneToCases, fcss_cratiothresh,
                                                       fcss_mutfreqdiffthresh, fcss_mutfreqdiffratiothresh,
                                                       fcss_coveragethresh, fcss_probabilitythresh,
                                                       bin_distance_threshold=bin_distance_threshold)

    # Write these new bins out
    new_bins = convert_genes_to_bins(genes, geneToBin)
    write_segment_infos(new_bins, filename=segment_info_file)
    print "New SEGMENTINFO written to ", segment_info_file
    write_gene_positions(new_bins)
    print "New segment positions appended to gene_positions.txt"

    # Update to the new mutation matrix.
    # at_least_half=True suggests a bin counts as mutated in a patient when at
    # least half its genes are — confirm in update_geneToCases_patientToGenes.
    geneToBinSet, bin_setToBin = bin_sets_from_geneToBin(genes, geneToBin)
    newGeneToCases, newPatientToGenes = update_geneToCases_patientToGenes(geneToCases, patientToGenes,
                                                                          bin_setToBin, at_least_half=True)
    gene_bin_entries = get_gene_bin_entries(geneToCases, newGeneToCases, geneToBinSet, bin_setToBin)
    if gene_bin_entries_file:
        met.writeanydict(gene_bin_entries, gene_bin_entries_file)
        print "Gene bin entries written to ", gene_bin_entries_file

    # Write the new mutation matrix out.
    writemutationmatrix(newPatientToGenes, filename=newmutationmatrix)
def main():
    """Per-cohort binomial co-occurrence analysis for the OV cohort.

    Configuration is hard-coded below. In min-cohort mode (do_min_cohort),
    gene pairs are limited to genes mutated in the smallest patient cohort
    and annotated with min-cohort binomial p-values; otherwise all pairs are
    tested and annotated per cohort, loaded from partitionfile or generated
    for each size in num_cohorts_list. Results are written to cpairfile.
    """
    mutationmatrix = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.m2'
    patientFile = '/Users/jlu96/maf/new/OV_broad/shared_patients.plst'
    cpairfile = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LorenzoModel/Binomial/OV_broad-cna-jl-cpairs-min_cohort.txt'
    partitionfile = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.ppf'
    load_partitions = True
    do_min_cohort = True
    geneFile = None
    minFreq = 0
    test_minFreq = 100
    compute_mutex = True
    include_cohort_info = False
    num_cohorts_list = [1,3, 5, 7]

    numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data(mutationmatrix, patientFile, geneFile, minFreq)
    print "number of genes is ", numGenes

    if do_min_cohort:
        cohort_dict, clusterToProp, min_cohort = partition.load_patient_cohorts(partitionfile, patientToGenes)
        # Candidate genes: anything mutated in at least one min-cohort patient.
        min_cohort_genes = set.union(*(patientToGenes[p] for p in min_cohort))
        print "getting pairs"
        genepairs = met.getgenepairs(geneToCases, min_cohort_genes, test_minFreq=test_minFreq)
        print "Number of pairs ", len(genepairs)

        print "Normal cooccur test"
        t = time.time()
        cpairsdict, cgenedict = met.cooccurpairs(numCases, geneToCases, patientToGenes, genepairs, compute_mutex=compute_mutex)
        print "Normal cooccur done in ", time.time() - t

        print "Beginning cohorts"
        t = time.time()
        cpairsdict = add_BinomP_min_cohort_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict, min_cohort)
        print "Cohorts done in ", time.time() - t
    else:
        genepairs = met.getgenepairs(geneToCases, genes, test_minFreq=test_minFreq)
        print "Number of pairs ", len(genepairs)

        print "Normal cooccur test"
        cpairsdict, cgenedict = met.cooccurpairs(numCases, geneToCases, patientToGenes, genepairs, compute_mutex=compute_mutex)

        # print "Add binomial probability"
        # cpairsdict = add_BinomP_all_pairs(cpairsdict, geneToCases, patientToGenes)
        # undo

        print "Beginning cohorts"
        if load_partitions:
            # NOTE(review): in the branch above, load_patient_cohorts returns a
            # 3-tuple; here its result is bound to a single name — confirm the
            # return shape of the one-argument call.
            cohort_dict = partition.load_patient_cohorts(partitionfile)
            cpairsdict = add_BinomP_cohorts_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict)
        else:
            for num_cohorts in num_cohorts_list:
                # get cohorts
                cohort_dict = generate_patient_cohorts(patientToGenes, num_cohorts)
                cpairsdict = add_BinomP_cohorts_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict)
                if include_cohort_info:
                    cpairsdict = add_cohorts_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict)

    print "Writing to file..."
    met.writeanydict(cpairsdict, cpairfile)
def main(): # INDEX BY LOSSES mutationmatrix = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.m2' patientFile = '/Users/jlu96/maf/new/OV_broad/shared_patients.plst' out_file = '/Users/jlu96/conte/jlu/Analyses/CancerMutationDistributions/OV_broad-cna-jl-PMM.csv' partition_file = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.ppf' min_cluster_size = 30 num_init = 9 minComp = 4 maxComp = 6 do_plot = True do_gmm = False do_dna = True num_integrated = 4 do_kmeans = False do_pmm = False geneFile = None minFreq = 0 dna_gene_file = '/Users/jlu96/conte/jlu/Analyses/CancerGeneAnalysis/DNADamageRepair_loss.txt' numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data(mutationmatrix, patientFile, geneFile, minFreq) p_gene_list = [] with open(dna_gene_file, 'rU') as row_file: reader = csv.reader(row_file, delimiter='\t') for row in reader: p_gene_list.append(row[0]) if do_kmeans: datas = [] for i in np.arange(minComp, maxComp, 1): datas.append(partition_gene_kmeans(geneToCases, patientToGenes, p_gene_list, i, num_bins=50, title=None, do_plot=True)) with open(out_file, 'w') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=datas[0].keys()) writer.writeheader() for row in datas: writer.writerow(row) if do_dna: cohort_dict = partition_gene_list(patientToGenes, p_gene_list, binary=not bool(num_integrated)) # Make new cohorts over this if num_integrated: cohort_dict = integrate_cohorts(cohort_dict, numCases, num_integrated) cohort_pairings = [(key, cohort_dict[key]) for key in cohort_dict] draw_partitions_cohorts(geneToCases, patientToGenes, cohort_pairings, title='DNADamageGenes', num_bins=100 if mutationmatrix[-9:] == 'cna-jl.m2' else 50) if do_gmm: datas = [] for i in np.arange(minComp, maxComp, 1): datas.append(partition_GMM(patientToGenes, i, num_bins=50, title='GMM size ' + str(i), do_plot=do_plot)) with open(out_file, 'w') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=datas[0].keys()) writer.writeheader() for row in datas: 
writer.writerow(row) if do_pmm: datas = [] clusters = [] for num_components in np.arange(minComp, maxComp, 1): best_data, clusterToPatient = best_pmm(patientToGenes, num_components, rand_num=5, far_rand_num=5, min_cluster_size=min_cluster_size) datas.append(best_data) clusters.append(clusterToPatient) # data, lls = partition_pmm(patientToGenes, i, num_bins=50, max_iter=20, rand_init=False, do_plot=True) # datas.append(data) # all_lls.append(lls) # for j in range(num_init): # data, lls = partition_pmm(patientToGenes, i, num_bins=50, max_iter=20) # datas.append(data) # all_lls.append(lls) # os.system('say "Jonathan your program has finished"') # get the best BIC combined = zip(datas, clusters) combined = sorted(combined, key=lambda entry: (entry['MoreThanMin'], entry['BIC'])) datas, clusters = zip(*combined) with open(out_file, 'w') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=datas[0].keys()) print datas writer.writeheader() for row in datas: writer.writerow(row) best_data = datas[0] clusterToPatient = clusters[0] # code to parition by best clusters with open(partition_file, 'w') as csvfile: writer = csv.writer(csvfile) writer.writerow(['Likelihood', best_data['Likelihood']]) writer.writerow(['BIC', best_data['BIC']]) writer.writerow(['NumComponents', best_data['Number']]) writer.writerow(['Cluster', 'Lambda', 'Probability', 'Patients']) for k in clusterToPatient: if k != -1: lam = best_data['Means'][k] p_k = best_data['Probabilities'][k] else: lam = None p_k = None writer.writerow([k, lam, p_k] + list(clusterToPatient[k])) load_patient_cohorts(partition_file)
def run(args):
    """Group genes into segment bins and write a new, binned mutation matrix.

    Two binning modes:
      * load_gene_segments: read an existing gene->segment mapping from
        gene_segment_file, dropping genes missing from it (unless
        no_throw_out_extras is set).
      * otherwise: derive bins from co-occurring gene pairs that lie within
        bin_distance_threshold of each other.

    Outputs: a SEGMENTINFO file, gene positions appended to
    gene_positions.txt, an optional per-gene bin-entry table, and the binned
    matrix written to newmutationmatrix.
    """
    # --- Unpack command-line arguments ------------------------------------
    # NOTE(review): mdictfile, cdictfile, mprob, cprob, min_cooccurrence_ratio,
    # top_percentile and top_number are unpacked but never used in this
    # function — confirm they are dead before deleting.
    mdictfile = args.mdictfile
    cdictfile = args.cdictfile
    mprob = args.mprob
    cprob = args.cprob
    cooccur_distance_threshold = args.cooccur_distance_threshold
    bin_distance_threshold = args.bin_distance_threshold
    mutationmatrix = args.mutation_matrix
    newmutationmatrix = args.newmutationmatrix
    file_prefix = args.output_prefix
    # Derived output names default to being based on the new matrix path.
    if not file_prefix:
        file_prefix = newmutationmatrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    gene_blacklist = args.gene_blacklist_file
    patient_blacklist = args.patient_blacklist_file
    minFreq = args.min_freq
    minCooccur = args.min_cooccur
    min_cooccurrence_ratio = args.min_cooccurrence_ratio
    top_percentile = args.top_percentile
    top_number = args.top_number
    parallel_compute_number = args.parallel_compute_number
    filter_cooccur_same_segment = args.filter_cooccur_same_segment
    fcss_cratiothresh = args.fcss_cratiothresh
    fcss_mutfreqdiffthresh = args.fcss_mutfreqdiffthresh
    fcss_mutfreqdiffratiothresh = args.fcss_mutfreqdiffratiothresh
    fcss_coveragethresh = args.fcss_coveragethresh
    fcss_probabilitythresh = args.fcss_probabilitythresh
    gene_segment_file = args.gene_segment_file
    load_gene_segments = args.load_gene_segments
    is_gene2seg = args.is_gene2seg
    gene_bin_entries_file = args.gene_bin_entries_file
    no_throw_out_extras = args.no_throw_out_extras
    segment_info_file = args.segment_info_file
    if not gene_bin_entries_file:
        gene_bin_entries_file = file_prefix + '_binnedgenes.tsv'
    if not segment_info_file:
        segment_info_file = file_prefix + '_SEGMENTINFO.tsv'
    #-----------------------------------------------------

    # Load the matrix and strip blacklisted genes/patients in one pass.
    mutations = mex.remove_blacklists(
        gene_blacklist, patient_blacklist,
        *mex.load_mutation_data(mutationmatrix, patientFile, geneFile, minFreq))
    numGenes, numCases, genes, patients, geneToCases, patientToGenes = mutations
    print 'Filtered Mutation data: %s genes x %s patients' % (numGenes, numCases)

    # Load segment info
    if load_gene_segments:
        # extra_genes is the genes not found in the segment file.
        # If throw_out_extras is False, extra_genes will be empty.
        geneToBin, extra_genes = load_gene_to_bin(
            gene_segment_file, geneToCases, no_throw_out_extras=no_throw_out_extras, is_gene2seg=is_gene2seg)
        numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.remove_extra_genes(
            extra_genes, numGenes, numCases, genes, patients, geneToCases, patientToGenes)
    else:
        print "Beginning bin genes by co-occurring pairs. "
        # Candidate pairs are restricted to genes close enough to share a bin.
        genepairs = getgenepairs(geneToCases, genes, closer_than_distance=bin_distance_threshold)
        print "Pairs retrieved. Calculating cooccurring pairs to make bins."
        # NOTE(review): fcss_cratiothresh is passed twice in this call while
        # fcss_mutfreqdiffthresh is never passed — verify against the
        # signature of met.complete_cooccurpairs.
        cpairsdict, cgenedict = met.complete_cooccurpairs(
            numCases, geneToCases, patientToGenes, genepairs, fcss_probabilitythresh, minCooccur,
            cooccur_distance_threshold, fcss_cratiothresh, parallel_compute_number,
            filter_cooccur_same_segment, fcss_cratiothresh, fcss_mutfreqdiffratiothresh,
            fcss_coveragethresh, fcss_probabilitythresh)
        print "Cooccurring pairs calculated."
        geneToBin = get_gene_bins_cooccur_same_segment(
            cpairsdict, geneToCases, fcss_cratiothresh, fcss_mutfreqdiffthresh,
            fcss_mutfreqdiffratiothresh, fcss_coveragethresh, fcss_probabilitythresh,
            bin_distance_threshold=bin_distance_threshold)

    # Write these new bins out
    new_bins = convert_genes_to_bins(genes, geneToBin)
    write_segment_infos(new_bins, filename=segment_info_file)
    print "New SEGMENTINFO written to ", segment_info_file
    write_gene_positions(new_bins)
    print "New segment positions appended to gene_positions.txt"

    # Update to the new mutation matrix.
    # at_least_half=True suggests a bin counts as mutated in a patient when at
    # least half its genes are — confirm in update_geneToCases_patientToGenes.
    geneToBinSet, bin_setToBin = bin_sets_from_geneToBin(genes, geneToBin)
    newGeneToCases, newPatientToGenes = update_geneToCases_patientToGenes(
        geneToCases, patientToGenes, bin_setToBin, at_least_half=True)
    gene_bin_entries = get_gene_bin_entries(geneToCases, newGeneToCases, geneToBinSet, bin_setToBin)
    if gene_bin_entries_file:
        met.writeanydict(gene_bin_entries, gene_bin_entries_file)
        print "Gene bin entries written to ", gene_bin_entries_file

    # Write the new mutation matrix out.
    writemutationmatrix(newPatientToGenes, filename=newmutationmatrix)
def main(): # INDEX BY LOSSES mutationmatrix = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.m2' patientFile = '/Users/jlu96/maf/new/OV_broad/shared_patients.plst' out_file = '/Users/jlu96/conte/jlu/Analyses/CancerMutationDistributions/OV_broad-cna-jl-PMM.csv' partition_file = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.ppf' min_cluster_size = 30 num_init = 9 minComp = 4 maxComp = 6 do_plot = True do_gmm = False do_dna = True num_integrated = 4 do_kmeans = False do_pmm = False geneFile = None minFreq = 0 dna_gene_file = '/Users/jlu96/conte/jlu/Analyses/CancerGeneAnalysis/DNADamageRepair_loss.txt' numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data( mutationmatrix, patientFile, geneFile, minFreq) p_gene_list = [] with open(dna_gene_file, 'rU') as row_file: reader = csv.reader(row_file, delimiter='\t') for row in reader: p_gene_list.append(row[0]) if do_kmeans: datas = [] for i in np.arange(minComp, maxComp, 1): datas.append( partition_gene_kmeans(geneToCases, patientToGenes, p_gene_list, i, num_bins=50, title=None, do_plot=True)) with open(out_file, 'w') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=datas[0].keys()) writer.writeheader() for row in datas: writer.writerow(row) if do_dna: cohort_dict = partition_gene_list(patientToGenes, p_gene_list, binary=not bool(num_integrated)) # Make new cohorts over this if num_integrated: cohort_dict = integrate_cohorts(cohort_dict, numCases, num_integrated) cohort_pairings = [(key, cohort_dict[key]) for key in cohort_dict] draw_partitions_cohorts( geneToCases, patientToGenes, cohort_pairings, title='DNADamageGenes', num_bins=100 if mutationmatrix[-9:] == 'cna-jl.m2' else 50) if do_gmm: datas = [] for i in np.arange(minComp, maxComp, 1): datas.append( partition_GMM(patientToGenes, i, num_bins=50, title='GMM size ' + str(i), do_plot=do_plot)) with open(out_file, 'w') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=datas[0].keys()) writer.writeheader() for row in datas: 
writer.writerow(row) if do_pmm: datas = [] clusters = [] for num_components in np.arange(minComp, maxComp, 1): best_data, clusterToPatient = best_pmm( patientToGenes, num_components, rand_num=5, far_rand_num=5, min_cluster_size=min_cluster_size) datas.append(best_data) clusters.append(clusterToPatient) # data, lls = partition_pmm(patientToGenes, i, num_bins=50, max_iter=20, rand_init=False, do_plot=True) # datas.append(data) # all_lls.append(lls) # for j in range(num_init): # data, lls = partition_pmm(patientToGenes, i, num_bins=50, max_iter=20) # datas.append(data) # all_lls.append(lls) # os.system('say "Jonathan your program has finished"') # get the best BIC combined = zip(datas, clusters) combined = sorted(combined, key=lambda entry: (entry['MoreThanMin'], entry['BIC'])) datas, clusters = zip(*combined) with open(out_file, 'w') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=datas[0].keys()) print datas writer.writeheader() for row in datas: writer.writerow(row) best_data = datas[0] clusterToPatient = clusters[0] # code to parition by best clusters with open(partition_file, 'w') as csvfile: writer = csv.writer(csvfile) writer.writerow(['Likelihood', best_data['Likelihood']]) writer.writerow(['BIC', best_data['BIC']]) writer.writerow(['NumComponents', best_data['Number']]) writer.writerow(['Cluster', 'Lambda', 'Probability', 'Patients']) for k in clusterToPatient: if k != -1: lam = best_data['Means'][k] p_k = best_data['Probabilities'][k] else: lam = None p_k = None writer.writerow([k, lam, p_k] + list(clusterToPatient[k])) load_patient_cohorts(partition_file)
def main():
    """Permutation-based mutual-exclusivity/co-occurrence p-values for PRAD.

    Builds (or loads) num_permutations permuted mutation matrices, then for
    every pair of frequently mutated genes computes a network-permutation
    p-value alongside the analytic statistics, writing mutex results,
    co-occurrence results, and the permutation seeds to TSV files.
    """
    mutationmatrix = '/Users/jlu96/maf/new/PRAD_broad/PRAD_broad-som.m2'
    patientFile = None #'/Users/jlu96/maf/new/PRAD_broad/shared_patients.plst'
    geneFile = None #'/Users/jlu96/conte/jlu/REQUIREDFILES_OnlyLoss2/COSMICGenes_OnlyLoss.txt'
    load_directory = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LoadMatrices'
    minFreq = 0
    num_permutations = 20
    binary_perm_method = False
    Q = 100
    write_matrices = True
    matrixdirectory = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LoadMatrices' #'/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/SARC_broad-som-jl-' + ('matrix' if binary_perm_method else 'network')

    # Timestamped output names so repeated runs do not clobber each other.
    outmutexfile = matrixdirectory + '/mutex' + str(num_permutations) + str(time.time()) + '.tsv'
    outcooccurfile = matrixdirectory + '/cooccur' + str(num_permutations) + str(time.time()) + '.tsv'
    outseedsfile = matrixdirectory + '/seeds' + str(time.time()) + '.tsv'

    if not os.path.exists(os.path.dirname(matrixdirectory)):
        os.makedirs(os.path.dirname(matrixdirectory))

    numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data(mutationmatrix, patientFile, geneFile, minFreq)
    print "numGenes ", numGenes, " and numCases ", numCases

    # Drop patients with no mutations; they contribute nothing to the tests.
    for patient in patients:
        if not patientToGenes[patient]:
            patientToGenes.pop(patient)
            print patient, "popped"

    # Generate Permutation Matrices
    pm = PermutationMatrices(geneToCases, patientToGenes, num_permutations, Q=Q, matrixdirectory=matrixdirectory,
                             binary_perm_method=binary_perm_method, write_matrices=write_matrices,
                             load_directory=load_directory, geneFile=geneFile, patientFile=patientFile, minFreq=minFreq)

    # Make list of pairs from highly mutated genes (mutated in > 5 patients).
    test_genes = [gene for gene in genes if len(geneToCases[gene]) > 5]
    # for test_gene in test_genes:
    #     print test_gene
    genepairs = met.getgenepairs(geneToCases, test_genes)
    print "Number of pairs to test ", len(genepairs)

    # CALCULATE MUTEX
    # Create a list of ConditionFunctions that you must later initialize...
    # NOTE(review): range() is used here as a mutable placeholder list
    # (Python 2 semantics); every slot is overwritten with a Condition below.
    ConditionFunctions = range(len(genepairs))
    mutex_set_condition_function_list = []
    # Generate set_condition_function_list
    for i in range(len(genepairs)):
        genepair = genepairs[i]
        condition_dict = {}
        condition_dict['Genes'] = tuple(genepair)
        condition_dict['Overlap'] = len(set.intersection(*[geneToCases[gene] for gene in condition_dict['Genes']]))
        condition_dict['Mutex'] = True
        ConditionFunctions[i] = Condition([condition_dict])
        # Sanity check: the Condition must store exactly the conditions given.
        if [condition_dict] != ConditionFunctions[i].conditions:
            print condition_dict, ConditionFunctions[i].conditions
        mutex_set_condition_function_list.append((genepair, ConditionFunctions[i]))
    print "Finished mutex condition function list"

    t= time.time()
    # Calculate pvalues for mutual exclusivity
    pair_to_mutex = {}
    pair_to_mutex_network_pvalue = pm.set_to_pvalue(mutex_set_condition_function_list)
    print "mutex pair network pvalues finished in ", time.time() - t
    for genepair in genepairs:
        pair_to_mutex[genepair] = mex.analyze_mutex_set_new(numCases, geneToCases, patientToGenes, genepair)
        pair_to_mutex[genepair]['NetworkProbability'] = pair_to_mutex_network_pvalue[genepair]

    # Write to output
    with open(outmutexfile, 'w') as csvfile:
        fieldnames = pair_to_mutex[genepairs[0]].keys()
        writer = csv.DictWriter(csvfile, delimiter='\t', fieldnames=fieldnames)
        writer.writeheader()
        for genepair in pair_to_mutex:
            writer.writerow(pair_to_mutex[genepair])

    # CALCULATE COOCCUR
    cooccur_set_condition_function_list = []
    # Generate set_condition_function_list (Mutex=False marks co-occurrence).
    for genepair in genepairs:
        ConditionFunction = Condition(None)
        condition_dict = {}
        condition_dict['Genes'] = tuple(genepair)
        condition_dict['Overlap'] = len(set.intersection(*[geneToCases[gene] for gene in condition_dict['Genes']]))
        condition_dict['Mutex'] = False
        ConditionFunction.set_params([condition_dict])
        cooccur_set_condition_function_list.append((genepair, ConditionFunction))

    t= time.time()
    # Calculate pvalues for co-occurrence
    pair_to_cooccur = {}
    pair_to_cooccur_network_pvalue = pm.set_to_pvalue(cooccur_set_condition_function_list)
    print "cooccur pair network pvalues finished in ", time.time() - t
    for genepair in genepairs:
        pair_to_cooccur[genepair] = mex.analyze_cooccur_set_new(numCases, geneToCases, patientToGenes, genepair)
        pair_to_cooccur[genepair]['NetworkProbability'] = pair_to_cooccur_network_pvalue[genepair]

    # Write to output
    with open(outcooccurfile, 'w') as csvfile:
        fieldnames = pair_to_cooccur[genepairs[0]].keys()
        writer = csv.DictWriter(csvfile, delimiter='\t', fieldnames=fieldnames)
        writer.writeheader()
        for genepair in pair_to_cooccur:
            writer.writerow(pair_to_cooccur[genepair])

    # Write seeds to output so the permutation run is reproducible.
    with open(outseedsfile, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        for seed in pm.seeds:
            writer.writerow([seed])
def main():
    """Per-cohort binomial co-occurrence analysis for the OV cohort.

    Configuration is hard-coded below. In min-cohort mode (do_min_cohort),
    gene pairs are limited to genes mutated in the smallest patient cohort
    and annotated with min-cohort binomial p-values; otherwise all pairs are
    tested and annotated per cohort, loaded from partitionfile or generated
    for each size in num_cohorts_list. Results are written to cpairfile.
    """
    mutationmatrix = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.m2'
    patientFile = '/Users/jlu96/maf/new/OV_broad/shared_patients.plst'
    cpairfile = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LorenzoModel/Binomial/OV_broad-cna-jl-cpairs-min_cohort.txt'
    partitionfile = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.ppf'
    load_partitions = True
    do_min_cohort = True
    geneFile = None
    minFreq = 0
    test_minFreq = 100
    compute_mutex = True
    include_cohort_info = False
    num_cohorts_list = [1, 3, 5, 7]

    numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data(
        mutationmatrix, patientFile, geneFile, minFreq)
    print "number of genes is ", numGenes

    if do_min_cohort:
        cohort_dict, clusterToProp, min_cohort = partition.load_patient_cohorts(
            partitionfile, patientToGenes)
        # Candidate genes: anything mutated in at least one min-cohort patient.
        min_cohort_genes = set.union(*(patientToGenes[p] for p in min_cohort))
        print "getting pairs"
        genepairs = met.getgenepairs(geneToCases, min_cohort_genes, test_minFreq=test_minFreq)
        print "Number of pairs ", len(genepairs)

        print "Normal cooccur test"
        t = time.time()
        cpairsdict, cgenedict = met.cooccurpairs(numCases, geneToCases, patientToGenes, genepairs, compute_mutex=compute_mutex)
        print "Normal cooccur done in ", time.time() - t

        print "Beginning cohorts"
        t = time.time()
        cpairsdict = add_BinomP_min_cohort_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict, min_cohort)
        print "Cohorts done in ", time.time() - t
    else:
        genepairs = met.getgenepairs(geneToCases, genes, test_minFreq=test_minFreq)
        print "Number of pairs ", len(genepairs)

        print "Normal cooccur test"
        cpairsdict, cgenedict = met.cooccurpairs(numCases, geneToCases, patientToGenes, genepairs, compute_mutex=compute_mutex)

        # print "Add binomial probability"
        # cpairsdict = add_BinomP_all_pairs(cpairsdict, geneToCases, patientToGenes)
        # undo

        print "Beginning cohorts"
        if load_partitions:
            # NOTE(review): in the branch above, load_patient_cohorts returns a
            # 3-tuple; here its result is bound to a single name — confirm the
            # return shape of the one-argument call.
            cohort_dict = partition.load_patient_cohorts(partitionfile)
            cpairsdict = add_BinomP_cohorts_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict)
        else:
            for num_cohorts in num_cohorts_list:
                # get cohorts
                cohort_dict = generate_patient_cohorts(patientToGenes, num_cohorts)
                cpairsdict = add_BinomP_cohorts_all_pairs(
                    cpairsdict, geneToCases, patientToGenes, cohort_dict)
                if include_cohort_info:
                    cpairsdict = add_cohorts_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict)

    print "Writing to file..."
    met.writeanydict(cpairsdict, cpairfile)