예제 #1
0
def score_contig(contig, par, genome, genome_index, test):
    """Build a ScoreCollection for one contig.

    The collection holds three kinds of scores, all computed with
    ``mn.log_probability`` on the contig's signature:

    * ``genome`` -- the score against ``genome`` using the parameters ``par``;
    * ``group``  -- one score per genome returned by
      ``test.group.all_genomes_but_index(genome_index)``;
    * ``other``  -- one list of scores per group in ``test.rest_groups``,
      with one score for every genome of that group.

    Returns the populated ScoreCollection.
    """

    def _score_against(parameters, compared_genome):
        # Every score shares the same origin genome and contig id; only the
        # parameters and the genome being compared against vary.
        log_p = mn.log_probability(contig.signature, parameters)
        return Score(log_p, genome, compared_genome, contig.contig_id)

    collection = ScoreCollection()
    collection.genome = _score_against(par, genome)

    # Remaining genomes of the contig's own group.
    for sibling in test.group.all_genomes_but_index(genome_index):
        collection.group.append(_score_against(sibling.par(), sibling))

    # Genomes of every other group, kept as one list per group.
    for other_group in test.rest_groups:
        collection.other.append(
            [_score_against(outsider.par(), outsider) for outsider in other_group.genomes]
        )

    return collection
예제 #2
0
    def test_log_probability(self):
        """Compare exp(ml.log_probability(...)) with known values for small signatures."""
        # Each case: (signature counts, probability vector, expected probability).
        cases = [
            (
                Counter({0: 10, 1: 4, 2: 1, 3: 8, 4: 20}),
                np.array([0.2, 0.1, 0.05, 0.2, 0.45]),
                0.001074701,
            ),
            (Counter({0: 1, 1: 0}), np.array([0.5, 0.5]), 0.5),
            (Counter({0: 0, 1: 1}), np.array([0.5, 0.5]), 0.5),
        ]

        for counts, probabilities, expected in cases:
            # The sequence itself is irrelevant here: the signature is
            # overwritten by hand before the probability is evaluated.
            sequence = dna.DNA(id="hej", seq="AAAA")
            sequence.signature = counts
            probability = np.exp(ml.log_probability(sequence, probabilities))
            assert_almost_equal(probability, expected)
def main(contigs_file, taxonomy_file, dir_path, kmer_length, dir_structure, taxonomy_info_in_contigs):
    """Score every contig against every genome and print the result table.

    Args:
        contigs_file: passed to ``read_contigs_file``.
        taxonomy_file: passed to ``genome_info_from_parsed_taxonomy_file``.
        dir_path: passed to ``read_FASTA_files_no_groups`` together with
            ``dir_structure`` to load the genome sequences.
        kmer_length: passed to ``DNA.generate_kmer_hash``.
        dir_structure: forwarded as ``dir_structure=`` when reading genomes.
        taxonomy_info_in_contigs: forwarded to ``read_contigs_file`` and
            ``Score``; also selects which header line is printed.

    Output is written to ``sys.stdout``: one header line followed by one
    line per (contig, genome) pair, in contig-major order.
    """
    DNA.generate_kmer_hash(kmer_length)

    contigs = read_contigs_file(contigs_file, taxonomy_info=taxonomy_info_in_contigs)

    # Genome metadata parsed from the taxonomy file.
    meta_genomes = genome_info_from_parsed_taxonomy_file(taxonomy_file)

    # Fetch sequence for each genome
    genomes = read_FASTA_files_no_groups(meta_genomes, dir_path, dir_structure=dir_structure)

    for genome in genomes:
        genome.calculate_signature()
        genome.pseudo_par = mn.fit_nonzero_parameters([genome])

    scores = []
    for contig in contigs:
        contig.calculate_signature()
        for genome in genomes:
            if contig.id == genome.id:
                # The ids match, so remove the contig's own counts from a
                # copy of the genome and refit before scoring against it.
                temp_genome = deepcopy(genome)
                temp_genome.signature.subtract(contig.signature)
                temp_pseudo_par = mn.fit_nonzero_parameters([temp_genome])
                p_val = mn.log_probability(contig, temp_pseudo_par)
            else:
                p_val = mn.log_probability(contig, genome.pseudo_par)
            scores.append(Score(p_val, contig, genome, contig.contig_id, taxonomy_info=taxonomy_info_in_contigs))

    # sys.stdout is a text-mode stream, so lines are terminated with "\n"
    # (os.linesep would be translated a second time on Windows).
    if taxonomy_info_in_contigs:
        sys.stdout.write(
            "p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id"
            + "\n"
        )
    else:
        # NOTE(review): this header has an empty second column (two
        # consecutive tabs); presumably it mirrors what str(Score) emits
        # without taxonomy info -- confirm against Score.
        sys.stdout.write(
            "p_value\t\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id"
            + "\n"
        )
    for score in scores:
        sys.stdout.write(str(score) + "\n")
예제 #4
0
 def test_uniform_one_contig_prob(self):
     """Check ml.log_probability for one contig under a uniform probability vector.

     The first FASTA record of the one-contig test file is loaded, and every
     index present in its signature is given the probability 1/4**4.
     """
     f = fileinput.input(os.path.join(data_path,"bambus2.scaffold.linear.fasta.one_contig"))
     try:
         c = list(SeqIO.parse(f,"fasta"))
     finally:
         # Release the input handle even if parsing fails.
         f.close()
     dna_c = dna.DNA(id = c[0].id, seq = str(c[0].seq))
     dna_c.calculate_signature()
     k = 4**4
     # Entries whose index does not occur in the signature stay at 1.0.
     uniform_prob = np.ones((dna.DNA.kmer_hash_count))
     for i, _ in dna_c.signature.items():
         uniform_prob[i] = 1./k
     log_prob = ml.log_probability(dna_c,uniform_prob)
     # Function-call form prints the same text on Python 2 and Python 3.
     print(log_prob)
     assert_almost_equal(log_prob, -3791.05738056)
예제 #5
0
    def test_log_probability(self):
        """Check that model.log_probability equals ig.log_pdf plus ml.log_probability.

        Uses the first FASTA record of the one-contig test file with a
        probability vector of 1/4**4 at every index present in its signature,
        and a single coverage row equal to ``mu``.
        """
        f = fileinput.input(os.path.join(data_path,"bambus2.scaffold.linear.fasta.one_contig"))
        try:
            c = list(SeqIO.parse(f,"fasta"))
        finally:
            # Release the input handle even if parsing fails.
            f.close()

        dna_c = dna.DNA(id = c[0].id, seq = str(c[0].seq))
        dna_c.calculate_signature()
        k = 4**4
        # Entries whose index does not occur in the signature stay at 1.0.
        uniform_prob = np.ones((dna.DNA.kmer_hash_count))
        for i, _ in dna_c.signature.items():
            uniform_prob[i] = 1./k
        log_prob = ml.log_probability(dna_c,uniform_prob)

        mu = np.log(np.array([0.5,3.0,5.0]))
        sigma = 0.5
        x = np.log(np.array([0.5,3.0,5.0]))
        cov_matrix = np.array([x])

        p_ig = ig.log_pdf(x,mu,sigma)
        p_test = model.log_probability(dna_c,cov_matrix,uniform_prob,mu,sigma)
        # NOTE(review): exact float equality; relies on the model combining
        # the two terms with the same arithmetic as below -- confirm.
        assert_equal(p_ig+log_prob,p_test)
예제 #6
0
파일: simple_add.py 프로젝트: BinPro/ProBin
def log_probability(seq,cov_matrix,prob_vector,mu,sigma,factor=1):
    """Return the weighted Gaussian log-density plus the multinomial log-probability.

    ``isotropic_gaussian.log_pdf(cov_matrix, mu, sigma)`` is scaled by
    ``factor`` and added to ``multinomial.log_probability(seq, prob_vector)``.
    With the default ``factor=1`` the result is the plain sum of the two terms.
    """
    gaussian_term = isotropic_gaussian.log_pdf(cov_matrix, mu, sigma)
    multinomial_term = multinomial.log_probability(seq, prob_vector)
    weighted_gaussian = gaussian_term * factor
    return weighted_gaussian + multinomial_term