def test_log_probability_full(self):
    """Compare scaled log-probabilities against stored reference values.

    Parses ``data/generated_contigs_test.fna`` (resolved relative to
    ``cur_dir``), builds one cluster from the first three records and a
    second cluster from the last three, fits parameters to each cluster
    with ``model.fit_nonzero_parameters`` and checks
    ``model.log_probability`` (divided by 10000) for the first contig of
    each cluster under both parameter sets.
    """
    file_name = os.path.join(cur_dir, "..", "data/generated_contigs_test.fna")
    f = fileinput.input(file_name)
    try:
        c = list(SeqIO.parse(f, "fasta"))
    finally:
        # Close the input even when parsing raises, so the handle
        # is never leaked by a failing test.
        f.close()

    # Explicit indices (rather than slices) keep the IndexError when the
    # file holds fewer than three records.
    cluster1 = [dna.DNA(id=c[i].id, seq=str(c[i].seq)) for i in (0, 1, 2)]
    cluster2 = [dna.DNA(id=c[i].id, seq=str(c[i].seq)) for i in (-3, -2, -1)]

    for contig in cluster1 + cluster2:
        contig.calculate_signature()

    parameters1 = model.fit_nonzero_parameters(cluster1)
    parameters2 = model.fit_nonzero_parameters(cluster2)

    # These tests are probably too shaky, due to the
    # numerical optimization for finding the parameters
    s1 = cluster1[0]
    log_prob1 = model.log_probability(s1, parameters1)
    assert_almost_equal(log_prob1 / 10000.0, -0.450, places=1)
    log_prob2 = model.log_probability(s1, parameters2)
    assert_almost_equal(log_prob2 / 10000.0, -0.4676, places=2)

    s2 = cluster2[0]
    log_prob3 = model.log_probability(s2, parameters1)
    assert_almost_equal(log_prob3 / 10000.0, -0.517, places=2)
    log_prob4 = model.log_probability(s2, parameters2)
    assert_almost_equal(log_prob4 / 10000.0, -0.483, places=2)
def main(contigs_file, taxonomy_file, dir_path, kmer_length, contig_length, algorithm):
    """Score every contig against every genome and print a table to stdout.

    Each genome is split into parts of 10000 positions and parameters are
    fitted to those parts with ``model.fit_nonzero_parameters``. Each
    contig is then scored against each genome with
    ``model.log_probability``. When a contig and a genome share the same
    ``id``, the parameters are refitted on the genome parts that do not
    overlap the index range covered by the contig (presumably so the
    contig is not scored against parameters fitted on its own sequence;
    confirm with the authors).

    Parameters:
        contigs_file: passed to ``read_contigs_file``.
        taxonomy_file: passed to ``genome_info_from_parsed_taxonomy_file``.
        dir_path: passed to ``read_FASTA_files_no_groups``.
        kmer_length: passed to ``DNA.generate_kmer_hash``.
        contig_length: added to a contig's start position to find the
            last genome part it touches.
        algorithm: forwarded to ``model.fit_nonzero_parameters``.

    Output:
        A tab-separated header line followed by ``str(score)`` for every
        ``Score``, one per line, on ``sys.stdout``.
    """
    DNA.generate_kmer_hash(kmer_length)

    contigs = read_contigs_file(contigs_file, start_position=True)
    meta_genomes = genome_info_from_parsed_taxonomy_file(taxonomy_file)
    genomes = read_FASTA_files_no_groups(meta_genomes, dir_path)

    genome_part_l = 10000
    for genome in genomes:
        genome.calculate_signature()
        genome.parts = genome.split_seq(genome_part_l)
        for part in genome.parts:
            part.calculate_signature()
        genome.pseudo_par = model.fit_nonzero_parameters(
            genome.parts, algorithm=algorithm)

    scores = []
    for contig in contigs:
        contig.calculate_signature()
        for genome in genomes:
            if contig.id == genome.id:
                s = int(contig.start_position)
                # Floor division keeps the part indices integral on both
                # Python 2 and Python 3 (plain "/" yields floats on 3,
                # which cannot be used as slice bounds).
                first_part = s // genome_part_l
                last_part = (s + contig_length) // genome_part_l
                # Drop parts first_part..last_part inclusive; when both
                # indices coincide this removes exactly one part.
                remaining_parts = (genome.parts[:first_part]
                                   + genome.parts[last_part + 1:])
                temp_pseudo_par = model.fit_nonzero_parameters(
                    remaining_parts, algorithm=algorithm)
                p_val = model.log_probability(contig, temp_pseudo_par)
            else:
                p_val = model.log_probability(contig, genome.pseudo_par)
            scores.append(Score(p_val, contig, genome, contig.contig_id))

    # Use "\n" (not os.linesep) on the text-mode stream so the header is
    # terminated the same way as the data rows on every platform.
    sys.stdout.write("p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id" + '\n')
    for score in scores:
        sys.stdout.write(str(score) + '\n')
def test_log_probability_order(self):
    """Check that a contig scores higher under its own cluster's parameters.

    Parses ``data/generated_contigs_test.fna`` (resolved relative to
    ``cur_dir``), builds one cluster from the first three records and a
    second cluster from the last three, fits parameters to each cluster
    with ``model.fit_nonzero_parameters`` and asserts that the first
    contig of cluster 1 gets a larger ``model.log_probability`` under
    cluster 1's parameters than under cluster 2's.
    """
    file_name = os.path.join(cur_dir, "..", "data/generated_contigs_test.fna")
    f = fileinput.input(file_name)
    try:
        c = list(SeqIO.parse(f, "fasta"))
    finally:
        # Close the input even when parsing raises, so the handle
        # is never leaked by a failing test.
        f.close()

    # Explicit indices (rather than slices) keep the IndexError when the
    # file holds fewer than three records.
    cluster1 = [dna.DNA(id=c[i].id, seq=str(c[i].seq)) for i in (0, 1, 2)]
    cluster2 = [dna.DNA(id=c[i].id, seq=str(c[i].seq)) for i in (-3, -2, -1)]

    for contig in cluster1 + cluster2:
        contig.calculate_signature()

    parameters1 = model.fit_nonzero_parameters(cluster1)
    parameters2 = model.fit_nonzero_parameters(cluster2)

    dna_c1g1 = cluster1[0]
    log_prob1 = model.log_probability(dna_c1g1, parameters1)
    log_prob2 = model.log_probability(dna_c1g1, parameters2)
    assert_equal(log_prob1 > log_prob2, True)
def main(contigs_file, taxonomy_file, dir_path, kmer_length, contig_length):
    """Score every contig against every genome and print a table to stdout.

    Each genome is split into parts of 10000 positions and fitted with
    ``model.fit_nonzero_parameters_full_output``. The full fit result is
    written to ``sys.stderr`` and its first element is stored as the
    genome's ``pseudo_par``. Each contig's ``pseudo_counts`` are packed
    into a ``(1, DNA.kmer_hash_count)`` unsigned 32-bit array, and the
    contig is scored against every genome with ``model.log_probability``
    (called with ``pseudo_counts_supplied=True``).

    Parameters:
        contigs_file: passed to ``read_contigs_file``.
        taxonomy_file: passed to ``genome_info_from_parsed_taxonomy_file``.
        dir_path: passed to ``read_FASTA_files_no_groups``.
        kmer_length: passed to ``DNA.generate_kmer_hash``.
        contig_length: accepted but not used by this function.

    Output:
        A tab-separated header line followed by ``str(score)`` for every
        ``Score``, one per line, on ``sys.stdout``.
    """
    DNA.generate_kmer_hash(kmer_length)

    contigs = read_contigs_file(contigs_file, start_position=True)
    meta_genomes = genome_info_from_parsed_taxonomy_file(taxonomy_file)
    genomes = read_FASTA_files_no_groups(meta_genomes, dir_path)

    genome_part_l = 10000
    for genome in genomes:
        genome.calculate_signature()
        genome.parts = genome.split_seq(genome_part_l)
        for part in genome.parts:
            part.calculate_signature()
        alpha_fit = model.fit_nonzero_parameters_full_output(genome.parts)
        # Log the complete fit result for diagnostics; only its first
        # element is kept on the genome.
        sys.stderr.write(str(alpha_fit) + '\n')
        genome.pseudo_par = alpha_fit[0]

    scores = []
    for contig in contigs:
        contig.calculate_signature()
        counts = np.fromiter(contig.pseudo_counts, np.dtype('u4'),
                             DNA.kmer_hash_count)
        contig.pseudo_counts_array = counts.reshape((1, DNA.kmer_hash_count))
        for genome in genomes:
            p_val = model.log_probability(
                contig, genome.pseudo_par, pseudo_counts_supplied=True)
            scores.append(Score(p_val, contig, genome, contig.contig_id))

    # Use "\n" (not os.linesep) on the text-mode stream so the header is
    # terminated the same way as the data rows on every platform.
    sys.stdout.write("p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id" + '\n')
    for score in scores:
        sys.stdout.write(str(score) + '\n')