示例#1
0
def main():
    """Cross-check genome loading, site lookup, PSSM creation and site scoring.

    Steps, in order:
      1. Load the genome with ``load_scaffolds`` and with
         ``metagenomics.load_scaffolds`` and report whether both agree.
      2. For the first three known sites, look for the first exact occurrence
         on the forward strand and the first occurrence of the reverse
         complement, recording each position at most once.
      3. Build the PSSM with ``create_pssm`` and ``gpu_pssm.create_pssm`` and
         report whether both agree.
      4. Print the score of every located site as returned by ``score_site``,
         ``cpu_score_site`` and ``gpu_score_site``.

    All output goes to stdout; nothing is returned.

    NOTE(review): another ``main`` is defined further down in this file and
    will shadow this one -- confirm which entry point is meant to be run.
    """
    motif_filename = "Gamma_collection.txt"
    genome_filename = "NC_000913.fna"  # Escherichia coli str. K-12 substr. MG1655
    top_sites = [
        "TACTGTATAAATAAACAGTA", "TACTGTATATAAAAACAGTA", "TACTGTATATAAAAACAGTA",
        "AACTGTATATAAATACAGTT", "TACTGTATATAAAACCAGTT", "TACTGGATAAAAAAACAGTT",
        "TACTGTATAAAATCACAGTT", "AACTGGATAATCATACAGTA", "CACTGTATAAATAAACAGCT",
        "TACTGTACATCCATACAGTA", "ACCTGTATAAATAACCAGTA", "TACTGTATGAGCATACAGTA",
        "TACTGGATATTTAAACAGGT", "AACTGTATATACACCCAGGG", "TGCTGTATATACTCACAGCA",
        "AACTGGATAAAATTACAGGG", "TACTGTATATTCATTCAGGT", "TACTGTACACAATAACAGTA",
        "AGCTGAATAAATATACAGCA", "ATCTGTATATATACCCAGCT", "CACTGGATAGATAACCAGCA",
        "TACTGCATATACAACCAGAA", "CACTGTATACTTTACCAGTG", "AACTGTCGATACGTACAGTA",
        "TGCTGTACAAACGTCCAGTT", "CGCTGGATATCTATCCAGCA", "TGCTGTTTATTAAACCAGAA",
        "CATTGTTTATAAAAACAGCA", "TACTGGAGACAAATACAGCT", "ACCTGTATATATCATCAGTA",
        "TACTGTATAAACAGCCAATA", "TACTGTTTATCTTCCCAGCG", "GACTGTATAAAACCACAGCC",
        "TATTGTATATATTCACATTA", "AAGTGTATTTACACACAGCG", "AACTGGATAATCATACCGTT",
        "TGCTGTATGGATTAACAGGA", "TCCTGTATGAAAAACCATTA", "GCCTGTCTGAACAAACAGTA",
        "AAGTGTATATATATCCATCG", "AATTGTTTAAAAAACCAGAA", "TATTGTATTTATAAACATTA",
        "AACTGATTAAAAACCCAGCG", "GAGTGTATATAAAGCCAGAA", "TTCTGGATAAGCATCCAGAA",
        "ACCTGAATATTCAAACAGCG", "TACTGTCTACCAAAACAGAG", "TCCTGTATAAATTAACCGTT",
        "TAATGAATATAAAACCAGGA", "AACTGTAAATAATTACATGA", "TACTGAATAAAAAAGCAGAA",
        "TGCTGTACAATCAGCCAGCA", "TGCTGGATTTACGACCAGAA", "TACTGTTTATAAACCGAGCG",
        "TCTTGTATATCCAACCAGTT", "CACTGTATTAAAAACCATTC", "AACTGTATTACCTTCCAGCC",
        "TACTGTTTCCATTTACAGCC", "TAGTGGATGTAAAAACATTT", "CACTGTCTATACTTACATGT",
        "TACTGTTTGTGCAAATAGTA", "AACTGGTTATCAACCCAGAC", "CTCTGTATCTAATTACAGGT",
        "CACTGAATGCTAAAACAGCA", "TGCTGGATGTGAAACCAGCG", "TGCTGATTAAAAAACCAGCG",
        "AACTGGATATCTATCCGGAA", "CACTGTCTGATACAACAGTT", "AATTGTTAATATATCCAGAA",
        "AACTTTATGTACAGCCAGTG", "AACTGGTTATTCCACCAGAA", "GAATGGATAAAAAAACAGCC",
        "AGCTGTATAAAAATCCTGAG", "TCCTGGCTATTTTGCCAGTA", "AGCTGGCTATCTGAACAGTT",
        "CACTGGATATTCCTTCAGGT", "GGCTGGATAAAGAACCAGAA", "AACTGAATAAAAACAAAGGA",
        "TGCTGTACATCAGCACAGAT", "CAGTGGATTAACTTCCAGTT", "TACTGGATACAAAAACGGAT",
        "AACTATATAAATAAACATAA", "TGCTGCATGAAGAAACAGTA", "GTCTGAATGAATACCCAGTA",
        "TACTTTATTTACTCCCAGTG", "TACTGGAAACTAACACAGGC", "TACTGGATATCAAACCTGAA",
        "TACTGTCGGAAAAATCAGTG", "AACTGTATATGTCGCCAGGC", "TACTCTTCATTAAAACAGTG",
        "GACTGGCTTAATACACAGCC", "CAGTGTAAGAATAGACAGTG", "CGCTGGATGAGCGTACAGCA",
        "AACGGGATAATAAAACAGCC", "TACCGTTTGTATTTCCAGCT", "CTCTGGACATTAAACCAGGA",
        "TAGTTTATATAAATTCAGTC", "TACTGTATATTCCTCAAGCG", "AACTGGATATATAAATAATG",
        "AACGGTTTAAACACCCAGCG", "GCCTGGTTAAATGACCAGCA", "TACTGTTTAGCCAGCCAGTC",
        "TGCTGAACATTCTTCCAGCA", "TACTGGAAATAAGATCAGCC", "AACTGGATCTTAAAAAAGTA",
        "GGCTGGACAATTTTACAGCT", "AACTGTCTCTTATGACAGTT", "AACTGGATACAGAAACAATA",
        "TAGTGTTCAGTTTTACAGTA", "TAGTGTTTGTTCATCCATTA", "CAGTGTTGAAAAGTCCAGTA",
        "ATCTGGCTGAAATTACAGAA", "GATTGAATGAATATACAGGG", "CACTGGTTTTCCACCCAGCA",
        "AACTGGTTGAAAAACCATCA", "AACTGGTTTAACTCCCAGGG", "AGCTGGATAAACAAAAAGCG",
        "AACTGGATAAATTACCGGAT", "AACTGGATTTAAATCCTGGT", "AACTGTATTTACAATCATCT",
        "TACTCTCTGTTCATCCAGCA", "TACTCTATGCAATAACAGAA", "TGCTGAATGCATTAACAGCG",
        "CACTGGTTATCTTTACCGTA", "TGGTGGATATTTTTTCAGGA", "AACTGGATATCAATCCGGAT",
        "AAGTGTAAAATTCTACAGAA", "AACCGGACATAAAGCCAGTA", "CACTGTATAAAAATCCTATA",
        "AACTCTATATTACCCCAGTT", "ACCAGAATAAACATCCAGTA", "TACTGGATGCATTACCGGTA",
        "CACTCTTTATAAAACCAGGC", "CAGTGAAAATAAAAACAGGA", "AACGGTTTATCTAGCCAGTA",
        "TCATGTATGATCATACAGAC", "TGCTGAATATATAAAAAGAG", "ACCTGGATGTCACCACAGTT",
        "TACTAAATGAAAAAACAGCG", "AACTGGATGAACAACCGGCG", "TTCTTTATACATATCCAGCG",
        "AAGTGTTTGCTAACACAGCA", "AAGTGTAAAAAATCCCAGCG", "AACTGGATAAAGACACCGCT",
        "TGCTGTATGGTAAATCAGAA", "AACTGTATGATTTAAAAGAT", "TACGGTATAAAAAGACCGTA",
        "TGCTGGATATTATCCCATCA", "TACTGTCTGAAGAAGCAGTG", "AACTGAATAAATACCCCGGT",
        "TGCTGTATGAGTAACCGGTA", "CACTGGAAAAATGCGCAGTA", "AACTGGATAGCTATGCAGAA",
        "AATTGTAAAAAACAACAGCA", "AACTGTTTATCAACACCGCT", "ACCTGTACCTTAAACCAGGA",
        "TACTTTATAGTTTCCCAGTT", "ATCTGCATAAAGAACCAGTA", "CCCTCTTTATATTTCCAGTG",
        "TACTGTTTATTAATGTAGCA", "ATGTGAATGAATATCCAGTT", "AACTGGTTAAAATTAGAGAT",
        "TACTGTAAGAAAAACCCGCA", "ACCTGGAGAAAGAAACAGCG", "CACTGTTTACCCTGACAGTC",
        "AACTGGCTCATAACCCAGAA",
    ]
    # Truncate to the first three sites for the sake of less output spam.
    top_seqs = [nt2int(site) for site in top_sites][:3]

    # Load genome (validated)
    genome, __ = load_scaffolds(genome_filename)
    _genome, __ = metagenomics.load_scaffolds(genome_filename)
    # Single-argument print(...) emits the same text under the Python 2 print
    # statement and the Python 3 print function.
    print("Genomes loaded are the same: %s" % (np.array_equal(genome, _genome),))
    print("")

    # Ensure the sites are in the genome (validated)
    print("Finding the sites in the genome:")
    positions = []
    sites = []
    strands = []
    for query in top_seqs:
        width = query.size
        # Build the window view once per query and reuse it for both strands.
        windows = sliding_window(genome, width)
        # (strand code, arrow shown in the output, pattern to look for)
        searches = ((0, "->", query), (1, "<-", wc(query)))
        for strand, arrow, pattern in searches:
            hits = np.where(np.all(windows == pattern, axis=1))[0]
            # Only the first hit is considered, and a position that was
            # already recorded (duplicate query or palindromic site) is
            # skipped.
            if hits.size == 0 or hits[0] in positions:
                continue
            pos = hits[0]
            # Slice by the query's own width instead of a hard-coded 20.
            site = genome[pos:pos + width]
            if strand == 1:
                # Report reverse-strand sites in their own orientation.
                site = wc(site)
            print("Found: %s %s %s" % (int2nt(site), arrow, pos))
            positions.append(pos)
            sites.append(site)
            strands.append(strand)
    print("")

    # Load PSSM (validated)
    # Builtin float replaces the np.float alias, which newer NumPy removed.
    genome_frequencies = np.bincount(genome).astype(float) / genome.size
    _pssm = create_pssm(motif_filename, genome_frequencies=genome_frequencies)
    _pssm2 = gpu_pssm.create_pssm(motif_filename, genome_frequencies=genome_frequencies)
    print("PSSMs are the same: %s" % (np.array_equal(_pssm, _pssm2),))
    print("")

    # Score sites (validated!!)
    print("Scoring sites (method score seq strand pos):")
    for site, strand, pos in zip(sites, strands, positions):
        details = "%s %s %s" % (int2nt(site), strand, pos)
        print("True: %s %s" % (score_site(site, _pssm), details))
        print("CPU:  %s %s" % (cpu_score_site(site, _pssm), details))
        print("GPU:  %s %s" % (gpu_score_site(site, _pssm), details))
        print("")
def _pooled_nucleotide_frequencies(paths):
    """Return nucleotide frequencies pooled over every sequence file in paths."""
    # Assumes sequences are integer-coded with codes 0-3, mirroring the
    # four-entry equiprobable default -- TODO confirm against mg.load_scaffolds.
    counts = np.zeros(4, dtype=np.float64)
    for path in paths:
        sequence, __ = mg.load_scaffolds(path)
        file_counts = np.bincount(sequence, minlength=counts.size)
        if file_counts.size > counts.size:
            # A code beyond the current table showed up; grow the table.
            padding = np.zeros(file_counts.size - counts.size)
            counts = np.concatenate((counts, padding))
        counts += file_counts
    return counts / counts.sum()


def _plot_score_curves(edges, original, permuted, ylabel, title):
    """Draw one figure with the permuted curves (red) under the original (blue)."""
    plt.figure()
    for curve in permuted:
        plt.plot(edges, curve, "D-r", alpha=0.5, label="Permutation")
    plt.plot(edges, original, "D-b", lw=3, label="Original")
    plt.xlabel("Site score (bits)")
    plt.ylabel(ylabel)
    plt.title(title)
    # Keep only the last two legend entries: one "Permutation", one "Original".
    handles, labels = plt.gca().get_legend_handles_labels()
    plt.legend(handles[-2:], labels[-2:], loc="best")
    plt.grid()


def main():
    """Compare PSSM score distributions against permuted-PSSM controls.

    Builds a PSSM from the configured binding-site collection, scores every
    patient file with it and with a fixed set of permuted PSSMs, accumulates
    the binned score distributions over all files, plots them, and prints a
    per-bin "Probability (True Matrix | Score)" table followed by run totals.

    All configuration is hard-coded in the parameter section below. Output
    goes to stdout and to matplotlib figures; nothing is returned.

    NOTE(review): no ``plt.show()``/``savefig`` call is made here, so the
    figures only become visible if the caller displays them -- confirm.
    """
    ##### Parameters #####
    # Want to run with promoter regions
    promoter = True

    # Path to the MetaHit database.
    # The patient files can be downloaded from:
    #   http://www.bork.embl.de/~arumugam/Qin_et_al_2010/
    # Make sure these are extracted from their packages! (i.e.: "gunzip *.gz")
    # Note: After extraction, the 84 patient files occupy a total of 6.56 GB on
    # disk!
    # The original paper can be found at:
    #   http://www.nature.com/nature/journal/v464/n7285/full/nature08821.html
    if promoter:
        metahit_path = "./MetaHit/Pruned"
    else:
        metahit_path = "./MetaHit/Data"

    # The collection of binding sites to generate the PSSM from.
    # LexA.seq.fa is a collection of 115 experimentally-determined binding
    # sites reported in literature. See Table S1 in Cornish et al. (2013).
    # binding_sites_path = "./LexA.seq.fa"
    # binding_sites_path = "./LexA_Gamma_collection.fas"
    binding_sites_path = "./LexA_Grampos_collection.fas"

    # Number of permuted PSSMs to score as controls.
    # (The original note here was cut off after "the COLUMNS of the PSSM";
    # presumably the columns are what get shuffled -- verify against
    # mg.permute_pssm.)
    permutations = 50

    # Scores below this number of bits will not be reported.
    # Lower values will give more (false-positive) results and also slow down
    # the execution of the program since more memory needs to be allocated to
    # store the score values.
    score_threshold = -50.0

    ### Parameters below this line should *probably* not be changed. ###

    # The background frequency of the bases. An equiprobable frequency
    # distribution assumes that each base has an equal probability of
    # occurring, that is: P(A) = P(C) = P(G) = P(T) = 0.25 => GC-content = 0.5
    # If set to False, the background frequency is calculated from the pooled
    # nucleotide composition of all input files, so that one PSSM can still be
    # shared by every patient.
    # For better comparison across patients, this should be set to True.
    equiprobable_nuc_freqs = True

    # The ranges that the scores should be binned into.
    # Since no scores will be saved below the score_threshold, it serves as a
    # lower bound to the bins. A good upper bound is around ~30 bits since the
    # maximum theoretical score a sequence can have from a PSSM is 32.
    # Under equiprobable frequencies, the LexA consensus sequence has a score
    # of ~24 bits, so no sequence should score higher than that.
    # list(...) keeps this sliceable as a plain list on Python 2 and 3 alike.
    bins = list(range(int(score_threshold), 32, 1))

    # If you'd like to compare the output of this program by scoring the
    # E. coli genome, set this to True.
    # Make sure NC_000913.fna is in the parent directory.
    # The genome sequence can be downloaded from:
    #  ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Escherichia_coli_K_12_substr__MG1655_uid57779/
    # Use this for comparison/debugging.
    score_ecoli_instead = False

    # Running totals of sites, bases and scaffolds scanned
    total_num_sites = 0
    total_size = 0
    total_scaffold = 0
    ##########

    # Find patient files on disk
    if promoter:
        metahit_db = glob.glob(metahit_path + "/Pruned_MH[0-9]*.seq.fa")
    else:
        metahit_db = glob.glob(metahit_path + "/MH[0-9]*.seq.fa")
        # metahit_db = ["Eco_300_1_50_P.txt"]

    if score_ecoli_instead:
        metahit_db = ["../NC_000913.fna"]  # E. coli genome (for debugging)

    # Without input files the totals stay at zero and the probability step
    # below would divide 0 by 0, so stop early with an explicit message.
    if not metahit_db:
        print("No patient files found in %s; nothing to score." % (metahit_path,))
        return

    # gather total time
    start = time.time()

    # For debugging, truncate to just first patient
    # An alert just in case I am clumsy and forget these lines are uncommented.
    # print "USING ONLY ONE PATIENT!!"
    # metahit_db = ["Eco_300_1_50_P.txt"]
    # metahit_db = [metahit_db[0]]

    if equiprobable_nuc_freqs:
        # Assume equiprobable mononucleotide frequencies
        mg_frequencies = [0.25] * 4
    else:
        # The earlier code read `metagenome` here before any file had been
        # loaded, which raised UnboundLocalError. Pool over all inputs instead.
        mg_frequencies = _pooled_nucleotide_frequencies(metahit_db)

    # Calculate the original PSSM from binding sites
    original_pssm = gpu_pssm.create_pssm(binding_sites_path, genome_frequencies=mg_frequencies)

    # Print the unpermuted PSSM
    print("Unpermuted PSSM:")
    mg.print_pssm(original_pssm)

    # Preallocate the arrays to speed up the process.
    # Row 0 of patient_scores holds the original PSSM; rows 1.. hold the
    # permutations.
    permute_pssm = np.empty(shape=(permutations + 1, len(original_pssm)), dtype=object)
    patient_scores = np.zeros(shape=(permutations + 1, len(bins) - 1))

    # Generate the permuted PSSMs once so every patient is scored with the
    # same set.
    for permutation in range(permutations):
        permute_pssm[permutation] = mg.permute_pssm(original_pssm)

    # Cycle through every patient file
    for patient_file in metahit_db:

        # Status update
        print("File:  %s" % (patient_file,))

        # Load the sequence into memory
        metagenome, scaffolds = mg.load_scaffolds(patient_file)
        total_size += metagenome.size
        total_scaffold += scaffolds.size
        print("Genome size: %s | Scaffolds: %s" % (metagenome.size, scaffolds.size))

        # Score the metagenome using the original PSSM
        print("Scoring without permuting...")
        original_scores, partial_num_sites = score_patient(
            metagenome, scaffolds, original_pssm, score_threshold, bins)

        # Keep the distributions of the scores
        patient_scores[0] += original_scores
        total_num_sites += partial_num_sites

        # For each permuted PSSM
        for permutation in range(permutations):

            # Which permutation it is on
            print("Permutation %d/%d..." % (permutation + 1, permutations))

            # Re-score using the permuted PSSM; its site count is not used.
            perm_scores, __ = score_patient(
                metagenome, scaffolds, permute_pssm[permutation], score_threshold, bins)

            # Save the distribution of the scores
            patient_scores[permutation + 1] += perm_scores

    # Plot results (x positions are the bin edges after the first one)
    edges = bins[1:]
    _plot_score_curves(
        edges,
        np.cumsum(patient_scores[0]),
        np.cumsum(patient_scores[1:], axis=1),
        "# of Sites Found",
        "Cumulative Density Function")
    _plot_score_curves(
        edges,
        patient_scores[0],
        patient_scores[1:],
        "# of Sites Found ",
        "Probability Density Function")

    # Calculate p-values
    # Sum into a new array: the earlier in-place accumulation went through a
    # view of row 1 and silently overwrote patient_scores[1].
    total_fake_patient_scores = patient_scores[1:].sum(axis=0)
    # Floor empty bins at a tiny value so the ratio below never divides by 0.
    total_fake_patient_scores = np.where(
        total_fake_patient_scores > 0, total_fake_patient_scores, .000001)

    print("")
    real_prob = np.float64(patient_scores[0]) / total_num_sites
    fake_prob = total_fake_patient_scores / (total_num_sites * permutations)

    # Print out the p-values
    print("Probability (True Matrix | Score):")
    matrix_prob = (real_prob / (fake_prob + real_prob)) * 100
    # NOTE(review): rows are labelled with bins[0:], while the plots place the
    # same values at bins[1:] -- confirm which edge is the intended label.
    print("\n".join("%d:%.2f" % (s, p) for s, p in zip(bins, matrix_prob)))

    # Plot the probability values
    plt.figure()
    plt.bar(edges, matrix_prob)
    plt.xlabel("Site score (bits)")
    plt.ylabel("Confidence Level (%)")
    plt.grid()

    # Final diagnostics
    end = time.time()
    print("Total time: %.2f seconds" % (end - start))
    print("Total Metagenome Size: %d bp" % (total_size))
    print("Total Scaffolds Scanned: %d" % (total_scaffold))