def main():
    """Sanity-check genome loading, PSSM construction and site scoring.

    Everything is printed to stdout; the function takes no arguments and
    returns nothing. Three checks are run against ``NC_000913.fna``:

    1. The genome returned by ``load_scaffolds`` is compared with the one
       returned by ``metagenomics.load_scaffolds``.
    2. The first three entries of ``top_sites`` are searched for in the
       genome, first as given and then via ``wc()``; the first hit of each
       search is recorded unless that start position was already recorded.
    3. A PSSM is built with ``create_pssm`` and ``gpu_pssm.create_pssm``
       (compared for equality), and every recorded site is scored with
       ``score_site``, ``cpu_score_site`` and ``gpu_score_site``.
    """
    motif_filename = "Gamma_collection.txt"
    genome_filename = "NC_000913.fna"  # Escherichia coli str. K-12 substr. MG1655

    top_sites = [
        "TACTGTATAAATAAACAGTA", "TACTGTATATAAAAACAGTA", "TACTGTATATAAAAACAGTA",
        "AACTGTATATAAATACAGTT", "TACTGTATATAAAACCAGTT", "TACTGGATAAAAAAACAGTT",
        "TACTGTATAAAATCACAGTT", "AACTGGATAATCATACAGTA", "CACTGTATAAATAAACAGCT",
        "TACTGTACATCCATACAGTA", "ACCTGTATAAATAACCAGTA", "TACTGTATGAGCATACAGTA",
        "TACTGGATATTTAAACAGGT", "AACTGTATATACACCCAGGG", "TGCTGTATATACTCACAGCA",
        "AACTGGATAAAATTACAGGG", "TACTGTATATTCATTCAGGT", "TACTGTACACAATAACAGTA",
        "AGCTGAATAAATATACAGCA", "ATCTGTATATATACCCAGCT", "CACTGGATAGATAACCAGCA",
        "TACTGCATATACAACCAGAA", "CACTGTATACTTTACCAGTG", "AACTGTCGATACGTACAGTA",
        "TGCTGTACAAACGTCCAGTT", "CGCTGGATATCTATCCAGCA", "TGCTGTTTATTAAACCAGAA",
        "CATTGTTTATAAAAACAGCA", "TACTGGAGACAAATACAGCT", "ACCTGTATATATCATCAGTA",
        "TACTGTATAAACAGCCAATA", "TACTGTTTATCTTCCCAGCG", "GACTGTATAAAACCACAGCC",
        "TATTGTATATATTCACATTA", "AAGTGTATTTACACACAGCG", "AACTGGATAATCATACCGTT",
        "TGCTGTATGGATTAACAGGA", "TCCTGTATGAAAAACCATTA", "GCCTGTCTGAACAAACAGTA",
        "AAGTGTATATATATCCATCG", "AATTGTTTAAAAAACCAGAA", "TATTGTATTTATAAACATTA",
        "AACTGATTAAAAACCCAGCG", "GAGTGTATATAAAGCCAGAA", "TTCTGGATAAGCATCCAGAA",
        "ACCTGAATATTCAAACAGCG", "TACTGTCTACCAAAACAGAG", "TCCTGTATAAATTAACCGTT",
        "TAATGAATATAAAACCAGGA", "AACTGTAAATAATTACATGA", "TACTGAATAAAAAAGCAGAA",
        "TGCTGTACAATCAGCCAGCA", "TGCTGGATTTACGACCAGAA", "TACTGTTTATAAACCGAGCG",
        "TCTTGTATATCCAACCAGTT", "CACTGTATTAAAAACCATTC", "AACTGTATTACCTTCCAGCC",
        "TACTGTTTCCATTTACAGCC", "TAGTGGATGTAAAAACATTT", "CACTGTCTATACTTACATGT",
        "TACTGTTTGTGCAAATAGTA", "AACTGGTTATCAACCCAGAC", "CTCTGTATCTAATTACAGGT",
        "CACTGAATGCTAAAACAGCA", "TGCTGGATGTGAAACCAGCG", "TGCTGATTAAAAAACCAGCG",
        "AACTGGATATCTATCCGGAA", "CACTGTCTGATACAACAGTT", "AATTGTTAATATATCCAGAA",
        "AACTTTATGTACAGCCAGTG", "AACTGGTTATTCCACCAGAA", "GAATGGATAAAAAAACAGCC",
        "AGCTGTATAAAAATCCTGAG", "TCCTGGCTATTTTGCCAGTA", "AGCTGGCTATCTGAACAGTT",
        "CACTGGATATTCCTTCAGGT", "GGCTGGATAAAGAACCAGAA", "AACTGAATAAAAACAAAGGA",
        "TGCTGTACATCAGCACAGAT", "CAGTGGATTAACTTCCAGTT", "TACTGGATACAAAAACGGAT",
        "AACTATATAAATAAACATAA", "TGCTGCATGAAGAAACAGTA", "GTCTGAATGAATACCCAGTA",
        "TACTTTATTTACTCCCAGTG", "TACTGGAAACTAACACAGGC", "TACTGGATATCAAACCTGAA",
        "TACTGTCGGAAAAATCAGTG", "AACTGTATATGTCGCCAGGC", "TACTCTTCATTAAAACAGTG",
        "GACTGGCTTAATACACAGCC", "CAGTGTAAGAATAGACAGTG", "CGCTGGATGAGCGTACAGCA",
        "AACGGGATAATAAAACAGCC", "TACCGTTTGTATTTCCAGCT", "CTCTGGACATTAAACCAGGA",
        "TAGTTTATATAAATTCAGTC", "TACTGTATATTCCTCAAGCG", "AACTGGATATATAAATAATG",
        "AACGGTTTAAACACCCAGCG", "GCCTGGTTAAATGACCAGCA", "TACTGTTTAGCCAGCCAGTC",
        "TGCTGAACATTCTTCCAGCA", "TACTGGAAATAAGATCAGCC", "AACTGGATCTTAAAAAAGTA",
        "GGCTGGACAATTTTACAGCT", "AACTGTCTCTTATGACAGTT", "AACTGGATACAGAAACAATA",
        "TAGTGTTCAGTTTTACAGTA", "TAGTGTTTGTTCATCCATTA", "CAGTGTTGAAAAGTCCAGTA",
        "ATCTGGCTGAAATTACAGAA", "GATTGAATGAATATACAGGG", "CACTGGTTTTCCACCCAGCA",
        "AACTGGTTGAAAAACCATCA", "AACTGGTTTAACTCCCAGGG", "AGCTGGATAAACAAAAAGCG",
        "AACTGGATAAATTACCGGAT", "AACTGGATTTAAATCCTGGT", "AACTGTATTTACAATCATCT",
        "TACTCTCTGTTCATCCAGCA", "TACTCTATGCAATAACAGAA", "TGCTGAATGCATTAACAGCG",
        "CACTGGTTATCTTTACCGTA", "TGGTGGATATTTTTTCAGGA", "AACTGGATATCAATCCGGAT",
        "AAGTGTAAAATTCTACAGAA", "AACCGGACATAAAGCCAGTA", "CACTGTATAAAAATCCTATA",
        "AACTCTATATTACCCCAGTT", "ACCAGAATAAACATCCAGTA", "TACTGGATGCATTACCGGTA",
        "CACTCTTTATAAAACCAGGC", "CAGTGAAAATAAAAACAGGA", "AACGGTTTATCTAGCCAGTA",
        "TCATGTATGATCATACAGAC", "TGCTGAATATATAAAAAGAG", "ACCTGGATGTCACCACAGTT",
        "TACTAAATGAAAAAACAGCG", "AACTGGATGAACAACCGGCG", "TTCTTTATACATATCCAGCG",
        "AAGTGTTTGCTAACACAGCA", "AAGTGTAAAAAATCCCAGCG", "AACTGGATAAAGACACCGCT",
        "TGCTGTATGGTAAATCAGAA", "AACTGTATGATTTAAAAGAT", "TACGGTATAAAAAGACCGTA",
        "TGCTGGATATTATCCCATCA", "TACTGTCTGAAGAAGCAGTG", "AACTGAATAAATACCCCGGT",
        "TGCTGTATGAGTAACCGGTA", "CACTGGAAAAATGCGCAGTA", "AACTGGATAGCTATGCAGAA",
        "AATTGTAAAAAACAACAGCA", "AACTGTTTATCAACACCGCT", "ACCTGTACCTTAAACCAGGA",
        "TACTTTATAGTTTCCCAGTT", "ATCTGCATAAAGAACCAGTA", "CCCTCTTTATATTTCCAGTG",
        "TACTGTTTATTAATGTAGCA", "ATGTGAATGAATATCCAGTT", "AACTGGTTAAAATTAGAGAT",
        "TACTGTAAGAAAAACCCGCA", "ACCTGGAGAAAGAAACAGCG", "CACTGTTTACCCTGACAGTC",
        "AACTGGCTCATAACCCAGAA",
    ]
    top_seqs = [nt2int(seq) for seq in top_sites]
    # Truncate to first three for the sake of less output spam
    top_seqs = top_seqs[0:3]

    # Load genome (validated)
    genome, __ = load_scaffolds(genome_filename)
    _genome, __ = metagenomics.load_scaffolds(genome_filename)
    print("Genomes loaded are the same: %s" % (np.array_equal(genome, _genome),))
    print("")

    # Ensure the sites are in the genome (validated)
    print("Finding the sites in the genome:")
    positions = []
    sites = []
    strands = []
    for query in top_seqs:
        width = query.size
        # Build the windows once per query and reuse them for both searches
        windows = sliding_window(genome, width)
        # strand 0: search for the query itself; strand 1: search for wc(query)
        for strand, target, arrow in ((0, query, "->"), (1, wc(query), "<-")):
            hits = np.where(np.all(windows == target, axis=1))[0]
            # Only the first hit is considered, and a start position is
            # recorded at most once across all queries and both strands
            if hits.size == 0 or hits[0] in positions:
                continue
            pos = hits[0]
            site = genome[pos:pos + width]
            if strand == 1:
                # Reverse hits are stored as wc() of the genome slice
                site = wc(site)
            print("Found: %s %s %s" % (int2nt(site), arrow, pos))
            positions.append(pos)
            sites.append(site)
            strands.append(strand)
    print("")

    # Load PSSM (validated)
    # Builtin float replaces the np.float alias, which newer NumPy removed
    genome_frequencies = np.bincount(genome).astype(float) / genome.size
    _pssm = create_pssm(motif_filename, genome_frequencies=genome_frequencies)
    _pssm2 = gpu_pssm.create_pssm(motif_filename, genome_frequencies=genome_frequencies)
    print("PSSMs are the same: %s" % (np.array_equal(_pssm, _pssm2),))
    print("")

    # Score sites (validated!!)
    print("Scoring sites (method score seq strand pos):")
    for n, site in enumerate(sites):
        site_nt = int2nt(site)
        print("True: %s %s %s %s" % (score_site(site, _pssm), site_nt, strands[n], positions[n]))
        print("CPU:  %s %s %s %s" % (cpu_score_site(site, _pssm), site_nt, strands[n], positions[n]))
        print("GPU:  %s %s %s %s" % (gpu_score_site(site, _pssm), site_nt, strands[n], positions[n]))
        print("")
def main():
    """Score MetaHit patient files with a PSSM and permuted controls.

    Builds a PSSM from the binding-site collection, generates
    ``permutations`` permuted copies with ``mg.permute_pssm``, and scores
    every patient file found on disk with the original and with each
    permuted PSSM via ``score_patient``. The binned score counts are summed
    across patients, plotted (cumulative and per-bin), and converted into a
    per-bin percentage, real / (real + permuted), which is printed and drawn
    as a bar chart. Run time, total metagenome size and scaffold count are
    printed at the end. Figures are created but ``plt.show()`` is not called
    here.

    All settings are local variables in the "Parameters" section; the
    function takes no arguments and returns nothing.

    Raises:
        NotImplementedError: if ``equiprobable_nuc_freqs`` is set to False.
    """
    ##### Parameters #####
    # Want to run with promoter regions
    promoter = True

    # Path to the MetaHit database.
    # The patient files can be downloaded from:
    #   http://www.bork.embl.de/~arumugam/Qin_et_al_2010/
    # Make sure these are extracted from their packages! (i.e.: "gunzip *.gz")
    # Note: After extraction, the 84 patient files occupy a total of 6.56 GB
    # on disk!
    # The original paper can be found at:
    #   http://www.nature.com/nature/journal/v464/n7285/full/nature08821.html
    if promoter:
        metahit_path = "./MetaHit/Pruned"
    else:
        metahit_path = "./MetaHit/Data"

    # The collection of binding sites to generate the PSSM from.
    # LexA.seq.fa is a collection of 115 experimentally-determined binding
    # sites reported in literature. See Table S1 in Cornish et al. (2013).
    # Alternatives: "./LexA.seq.fa", "./LexA_Gamma_collection.fas"
    binding_sites_path = "./LexA_Grampos_collection.fas"

    # Number of permutations to run. The original comment ended mid-sentence
    # at "Note that the COLUMNS of the PSSM"; presumably it is the columns
    # that mg.permute_pssm shuffles -- TODO confirm.
    permutations = 50

    # Scores below this number of bits will not be reported.
    # Lower values will give more (false-positive) results and also slow down
    # the execution of the program since more memory needs to be allocated to
    # store the score values.
    score_threshold = -50.0

    ### Parameters below this line should *probably* not be changed. ###

    # The background frequency of the bases. An equiprobable frequency
    # distribution assumes that each base has an equal probability of
    # occurring, that is: P(A) = P(C) = P(G) = P(T) = 0.25 => GC-content = 0.5
    # Setting this to False was meant to derive the background from each
    # patient's nucleotide composition, but that is not implemented (see
    # below). For better comparison across patients, keep this True.
    equiprobable_nuc_freqs = True

    # The ranges that the scores should be binned into.
    # Since no scores will be saved below the score_threshold, it serves as a
    # lower bound to the bins. A good upper bound is around ~30 bits since the
    # maximum theoretical score a sequence can have from a PSSM is 32.
    # Under equiprobable frequencies, the LexA consensus sequence has a score
    # of ~24 bits, so no sequence should score higher than that.
    # Materialised as a list so that slicing behaves the same everywhere.
    bins = list(range(int(score_threshold), 32, 1))

    # If you'd like to compare the output of this program by scoring the
    # E. coli genome, set this to True.
    # Make sure NC_000913.fna is in the parent directory.
    # The genome sequence can be downloaded from:
    #   ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Escherichia_coli_K_12_substr__MG1655_uid57779/
    # Use this for comparison/debugging.
    score_ecoli_instead = False

    # Running totals of sites, bases and scaffolds scanned
    total_num_sites = 0
    total_size = 0
    total_scaffold = 0
    ##########

    # Find patient files on disk
    if promoter:
        metahit_db = glob.glob(metahit_path + "/Pruned_MH[0-9]*.seq.fa")
    else:
        metahit_db = glob.glob(metahit_path + "/MH[0-9]*.seq.fa")
    if score_ecoli_instead:
        metahit_db = ["../NC_000913.fna"]  # E. coli genome (for debugging)

    # Gather total time
    start = time.time()

    # For debugging, truncate to just the first patient by uncommenting:
    # print("USING ONLY ONE PATIENT!!")
    # metahit_db = ["Eco_300_1_50_P.txt"]
    # metahit_db = [metahit_db[0]]

    if not equiprobable_nuc_freqs:
        # The previous code read `metagenome` here, before any patient file
        # had been loaded, so this branch could only die with an
        # UnboundLocalError. Fail with an explicit message instead.
        raise NotImplementedError(
            "Per-patient background frequencies are not supported: the PSSM "
            "is built once, before any patient file is loaded.")
    # Assume equiprobable mononucleotide frequencies
    mg_frequencies = [0.25] * 4

    # Calculate the original PSSM from binding sites
    original_pssm = gpu_pssm.create_pssm(binding_sites_path,
                                         genome_frequencies=mg_frequencies)

    # Print the unpermuted PSSM
    print("Unpermuted PSSM:")
    mg.print_pssm(original_pssm)

    # Preallocate: one row per permuted PSSM, and one histogram row per PSSM
    # (row 0 = original, rows 1.. = permutations)
    permuted_pssms = np.empty(shape=(permutations, len(original_pssm)),
                              dtype=object)
    patient_scores = np.zeros(shape=(permutations + 1, len(bins) - 1))

    # Generate the permuted PSSMs up front so every patient is scored with
    # the same set
    for permutation in range(permutations):
        permuted_pssms[permutation] = mg.permute_pssm(original_pssm)

    # Cycle through every patient file
    for patient_file in metahit_db:
        # Status update
        print("File:  %s" % (patient_file,))

        # Load the sequence into memory
        metagenome, scaffolds = mg.load_scaffolds(patient_file)
        total_size += metagenome.size
        total_scaffold += scaffolds.size
        print("Genome size: %s | Scaffolds: %s" % (metagenome.size, scaffolds.size))

        # Score the metagenome using the original PSSM
        print("Scoring without permuting...")
        original_scores, partial_num_sites = score_patient(
            metagenome, scaffolds, original_pssm, score_threshold, bins)

        # Keep the distributions of the scores
        patient_scores[0] += original_scores
        total_num_sites += partial_num_sites

        # Re-score with each permuted PSSM
        for permutation in range(permutations):
            print("Permutation %d/%d..." % (permutation + 1, permutations))
            # The site count is only accumulated from the unpermuted run
            perm_scores, _ = score_patient(
                metagenome, scaffolds, permuted_pssms[permutation],
                score_threshold, bins)
            # Save the distribution of the scores
            patient_scores[permutation + 1] += perm_scores

    # Plot results: cumulative counts
    plt.figure()
    for score in patient_scores[1:]:
        cdf = np.cumsum(score)
        plt.plot(bins[1:], cdf, "D-r", alpha=0.5, label="Permutation")
    cdf = np.cumsum(patient_scores[0])
    plt.plot(bins[1:], cdf, "D-b", lw=3, label="Original")
    plt.xlabel("Site score (bits)")
    plt.ylabel("# of Sites Found")
    plt.title("Cumulative Density Function")
    # Keep only the last two legend entries (one permutation + the original)
    handles, labels = plt.gca().get_legend_handles_labels()
    plt.legend(handles[-2:], labels[-2:], loc="best")
    plt.grid()

    # Plot results: per-bin counts
    plt.figure()
    for score in patient_scores[1:]:
        plt.plot(bins[1:], score, "D-r", alpha=0.5, label="Permutation")
    plt.plot(bins[1:], patient_scores[0], 'D-b', lw=3, label="Original")
    plt.xlabel("Site score (bits)")
    plt.ylabel("# of Sites Found ")
    plt.title("Probability Density Function")
    handles, labels = plt.gca().get_legend_handles_labels()
    plt.legend(handles[-2:], labels[-2:], loc="best")
    plt.grid()

    # Calculate p-values
    # Pool the permuted-PSSM histograms into a fresh array; accumulating with
    # += into patient_scores[1] would modify that row of patient_scores.
    total_fake_patient_scores = patient_scores[1:].sum(axis=0)
    # Floor empty bins at a tiny positive count so the ratio below is defined
    total_fake_patient_scores = np.where(total_fake_patient_scores > 0,
                                         total_fake_patient_scores, .000001)
    print("")

    if total_num_sites > 0:
        real_prob = np.float64(patient_scores[0]) / total_num_sites
        fake_prob = total_fake_patient_scores / (total_num_sites * permutations)

        # Print out the p-values
        print("Probability (True Matrix | Score):")
        matrix_prob = (real_prob / (fake_prob + real_prob)) * 100
        # NOTE(review): labels here come from bins (zip stops at the shorter
        # matrix_prob), while the plots label the same values with bins[1:]
        # -- confirm which edge is intended.
        print("\n".join("%d:%.2f" % (s, p) for s, p in zip(bins, matrix_prob)))

        # Plot the probability values
        plt.figure()
        plt.bar(bins[1:], matrix_prob)
        plt.xlabel("Site score (bits)")
        plt.ylabel("Confidence Level (%)")
        plt.grid()
    else:
        # Nothing was scored (e.g. no patient files matched); dividing by
        # the site count would only produce NaNs.
        print("No sites were scored; skipping the probability calculation.")

    # Final diagnostics
    end = time.time()
    print("Total time: %.2f seconds" % (end - start))
    print("Total Metagenome Size: %d bp" % (total_size))
    print("Total Scaffolds Scanned: %d" % (total_scaffold))