def main(contigs_file,taxonomy_file, dir_path, kmer_length, contig_length, algorithm):
    """Score every contig against every reference genome and print a table.

    Each genome is split into parts of ``genome_part_l`` bases and model
    parameters are fitted on the signatures of those parts.  When a contig
    carries the same ``id`` as the genome it is compared with, the parameters
    are refitted without the genome parts overlapping the window
    ``[start_position, start_position + contig_length]`` (presumably so the
    contig is not scored against parameters fitted on its own sequence).

    Args:
        contigs_file: Passed to ``read_contigs_file`` (with
            ``start_position=True``).
        taxonomy_file: Passed to ``genome_info_from_parsed_taxonomy_file``.
        dir_path: Location handed to ``read_FASTA_files_no_groups``.
        kmer_length: k-mer length used to initialise ``DNA``'s k-mer hash.
        contig_length: Length of each contig; must be an integer because it
            is used to compute part indices.
        algorithm: Forwarded to ``model.fit_nonzero_parameters``.

    Output:
        A tab-separated header followed by one ``str(Score)`` line per
        (contig, genome) pair, written to ``sys.stdout``.
    """
    DNA.generate_kmer_hash(kmer_length)

    contigs = read_contigs_file(contigs_file, start_position=True)

    # Parse the taxonomy listing, then fetch the sequence for each genome.
    meta_genomes = genome_info_from_parsed_taxonomy_file(taxonomy_file)
    genomes = read_FASTA_files_no_groups(meta_genomes, dir_path)

    genome_part_l = 10000
    for genome in genomes:
        genome.calculate_signature()
        genome.parts = genome.split_seq(genome_part_l)
        for part in genome.parts:
            part.calculate_signature()
        genome.pseudo_par = model.fit_nonzero_parameters(
            genome.parts, algorithm=algorithm)

    scores = []
    for contig in contigs:
        contig.calculate_signature()
        for genome in genomes:
            if contig.id == genome.id:
                start = int(contig.start_position)
                # Floor division keeps the indices integral on Python 3 as
                # well; true division would yield floats, which cannot be
                # used as slice bounds.
                first_part = start // genome_part_l
                last_part = (start + contig_length) // genome_part_l
                # Drop every part from first_part through last_part.  When
                # both indices coincide this removes exactly one part.
                remaining_parts = (
                    genome.parts[:first_part] + genome.parts[last_part + 1:])
                pseudo_par = model.fit_nonzero_parameters(
                    remaining_parts, algorithm=algorithm)
            else:
                pseudo_par = genome.pseudo_par

            p_val = model.log_probability(contig, pseudo_par)
            scores.append(Score(p_val, contig, genome, contig.contig_id))

    # Use '\n' (not os.linesep) so the header ends the same way as the rows;
    # text-mode stdout already translates '\n' to the platform line ending.
    sys.stdout.write("p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id\n")
    for score in scores:
        sys.stdout.write(str(score) + '\n')
Exemplo n.º 2
0
 def test_read_FASTA_files_no_groups(self):
     """Load the reference genomes listed in a parsed taxonomy fixture.

     Checks the number of genomes returned and the sequence length, id and
     family of the last one.
     """
     cur_dir = os.path.dirname(__file__)
     parsed_file_name = os.path.join(cur_dir,"fixtures/parsed_gen_2_2_test.txt")
     dir_path = os.path.join(cur_dir,"fixtures/reference_genomes")
     # Close the fixture deterministically.  Both calls stay inside the
     # ``with`` block in case the taxonomy parser reads the file lazily.
     with open(parsed_file_name, 'r') as open_file:
         meta_genomes = genome_info_from_parsed_taxonomy_file(open_file)
         real_genomes = read_FASTA_files_no_groups(meta_genomes, dir_path)
     assert_equal(len(real_genomes),7)
     last_genome = real_genomes[-1]
     assert_equal(len(last_genome.full_seq),2612925)
     assert_equal(last_genome.id, "Capnocytophaga_ochracea_DSM_7271_uid59197")
     # A correct family
     assert_equal(last_genome.family, "Flavobacteriaceae")
Exemplo n.º 3
0
 def test_read_signle_FASTA_file_no_groups(self):
     """Load reference genomes from one combined FASTA fixture.

     Uses ``dir_structure='single_fasta_file'`` and checks the number of
     genomes returned and the sequence length, id and family of the last one.
     """
     cur_dir = os.path.dirname(__file__)
     parsed_file_name = os.path.join(cur_dir,"fixtures/parsed_gen_0_0_mock_test_complete.txt")
     dir_path = os.path.join(cur_dir,"fixtures/mock_references_test.fa")
     # Close the fixture deterministically.  Both calls stay inside the
     # ``with`` block in case the taxonomy parser reads the file lazily.
     with open(parsed_file_name, 'r') as open_file:
         meta_genomes = genome_info_from_parsed_taxonomy_file(open_file)
         real_genomes = read_FASTA_files_no_groups(
             meta_genomes, dir_path, dir_structure='single_fasta_file')
     assert_equal(len(real_genomes),15)
     last_genome = real_genomes[-1]
     assert_equal(len(last_genome.full_seq),2222430)
     assert_equal(last_genome.id, "Pyrobaculum_aerophilum_str._IM2")
     # A correct family
     assert_equal(last_genome.family, "Thermoproteaceae")
def main(contigs_file, taxonomy_file, dir_path, kmer_length, dir_structure, taxonomy_info_in_contigs):
    """Score every contig against every reference genome and print a table.

    Parameters are fitted once per genome from its whole-genome signature.
    When a contig carries the same ``id`` as the genome it is compared with,
    the contig's signature is subtracted from a copy of the genome and the
    parameters are refitted on that copy before scoring.

    Args:
        contigs_file: Passed to ``read_contigs_file``.
        taxonomy_file: Passed to ``genome_info_from_parsed_taxonomy_file``.
        dir_path: Location handed to ``read_FASTA_files_no_groups``.
        kmer_length: k-mer length used to initialise ``DNA``'s k-mer hash.
        dir_structure: Forwarded to ``read_FASTA_files_no_groups``.
        taxonomy_info_in_contigs: Forwarded to ``read_contigs_file`` and
            ``Score``; also selects which header line is printed.

    Output:
        A tab-separated header followed by one ``str(Score)`` line per
        (contig, genome) pair, written to ``sys.stdout``.
    """
    DNA.generate_kmer_hash(kmer_length)

    contigs = read_contigs_file(contigs_file, taxonomy_info=taxonomy_info_in_contigs)

    # Parse the taxonomy listing, then fetch the sequence for each genome.
    meta_genomes = genome_info_from_parsed_taxonomy_file(taxonomy_file)
    genomes = read_FASTA_files_no_groups(meta_genomes, dir_path, dir_structure=dir_structure)

    for genome in genomes:
        genome.calculate_signature()
        genome.pseudo_par = mn.fit_nonzero_parameters([genome])

    scores = []
    for contig in contigs:
        contig.calculate_signature()
        for genome in genomes:
            if contig.id == genome.id:
                # Work on a copy so the stored genome signature stays intact
                # for the remaining contigs.
                temp_genome = deepcopy(genome)
                temp_genome.signature.subtract(contig.signature)
                pseudo_par = mn.fit_nonzero_parameters([temp_genome])
            else:
                pseudo_par = genome.pseudo_par

            p_val = mn.log_probability(contig, pseudo_par)
            scores.append(Score(p_val, contig, genome, contig.contig_id, taxonomy_info=taxonomy_info_in_contigs))

    # Use "\n" (not os.linesep) so the header ends the same way as the rows;
    # text-mode stdout already translates "\n" to the platform line ending.
    if taxonomy_info_in_contigs:
        header = "p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id"
    else:
        # NOTE(review): the double tab after p_value leaves an empty column;
        # kept as-is -- confirm it matches what str(Score) emits without
        # taxonomy info.
        header = "p_value\t\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id"
    sys.stdout.write(header + "\n")

    for score in scores:
        sys.stdout.write(str(score) + "\n")
def main(contigs_file,taxonomy_file, dir_path, kmer_length, contig_length):
    """Score every contig against every reference genome and print a table.

    Each genome is split into parts of ``genome_part_l`` bases and model
    parameters are fitted on the signatures of those parts.  The full fit
    result is logged to ``sys.stderr`` and its first element is kept as the
    genome's parameters.  Each contig's pseudo counts are packed into a
    ``(1, DNA.kmer_hash_count)`` unsigned 32-bit array before scoring.

    Args:
        contigs_file: Passed to ``read_contigs_file`` (with
            ``start_position=True``).
        taxonomy_file: Passed to ``genome_info_from_parsed_taxonomy_file``.
        dir_path: Location handed to ``read_FASTA_files_no_groups``.
        kmer_length: k-mer length used to initialise ``DNA``'s k-mer hash.
        contig_length: Accepted for interface compatibility; not used here.

    Output:
        A tab-separated header followed by one ``str(Score)`` line per
        (contig, genome) pair, written to ``sys.stdout``.
    """
    DNA.generate_kmer_hash(kmer_length)

    contigs = read_contigs_file(contigs_file, start_position=True)

    # Parse the taxonomy listing, then fetch the sequence for each genome.
    meta_genomes = genome_info_from_parsed_taxonomy_file(taxonomy_file)
    genomes = read_FASTA_files_no_groups(meta_genomes, dir_path)

    genome_part_l = 10000
    for genome in genomes:
        genome.calculate_signature()
        genome.parts = genome.split_seq(genome_part_l)
        for part in genome.parts:
            part.calculate_signature()
        alpha_fit = model.fit_nonzero_parameters_full_output(genome.parts)
        sys.stderr.write(str(alpha_fit) + '\n')
        genome.pseudo_par = alpha_fit[0]

    count_dtype = np.dtype('u4')
    scores = []
    for contig in contigs:
        contig.calculate_signature()
        # Materialise the pseudo counts once per contig as a single-row
        # array; the model is told they are supplied via the flag below.
        n_kmers = DNA.kmer_hash_count
        contig.pseudo_counts_array = np.fromiter(
            contig.pseudo_counts, count_dtype, n_kmers).reshape((1, n_kmers))
        for genome in genomes:
            p_val = model.log_probability(
                contig, genome.pseudo_par, pseudo_counts_supplied=True)
            scores.append(Score(p_val, contig, genome, contig.contig_id))

    # Use '\n' (not os.linesep) so the header ends the same way as the rows;
    # text-mode stdout already translates '\n' to the platform line ending.
    sys.stdout.write("p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id\n")
    for score in scores:
        sys.stdout.write(str(score) + '\n')
def main(contigs_file,contig_time_series_file, genome_time_series_file, taxonomy_file,dir_path, contig_length, total_read_count,assembly_length,first_data,last_data):
    """Score contigs against genomes using time-series data and print a table.

    Each contig is given the rows of the contig time-series table whose
    ``contig_id`` matches its own.  Genome time series are attached by
    ``read_time_series_file_genomes``, parameters are fitted per genome, and
    every contig is then scored against every genome.

    Args:
        contigs_file: Passed to ``read_contigs_file`` (with
            ``start_position=True``).
        contig_time_series_file: Passed to ``read_time_series``; the result
            must have exactly one index entry per contig.
        genome_time_series_file: Passed to ``read_time_series_file_genomes``.
        taxonomy_file: Passed to ``genome_info_from_parsed_taxonomy_file``.
        dir_path: Location handed to ``read_FASTA_files_no_groups``.
        contig_length: Accepted for interface compatibility; not used here.
        total_read_count: Forwarded to the model's fit and scoring calls.
        assembly_length: Forwarded to ``model.log_probability``.
        first_data: Accepted for interface compatibility; not used here.
        last_data: Accepted for interface compatibility; not used here.

    Raises:
        TypeError: If the number of contigs differs from the number of rows
            in the contig time-series table.

    Output:
        A tab-separated header followed by one ``str(Score)`` line per
        (contig, genome) pair, written to ``sys.stdout``.
    """
    DNA.generate_kmer_hash(2)

    contigs = read_contigs_file(contigs_file, start_position=True)

    contig_time_series_df = read_time_series(contig_time_series_file)

    if len(contigs) != len(contig_time_series_df.index):
        # TypeError is kept (rather than ValueError) so existing callers that
        # catch it keep working.
        raise TypeError("The number of contigs and time series does not match")

    for contig in contigs:
        matching_rows = contig_time_series_df.contig_id == contig.contig_id
        contig.mapping_reads = contig_time_series_df[matching_rows]

    # Parse the taxonomy listing, then fetch the sequence for each genome.
    meta_genomes = genome_info_from_parsed_taxonomy_file(taxonomy_file)
    genomes = read_FASTA_files_no_groups(meta_genomes, dir_path)

    # Fetch time series for each genome
    read_time_series_file_genomes(genomes, genome_time_series_file)

    for genome in genomes:
        genome.pseudo_par = model.fit_nonzero_parameters([genome], total_read_count)

    scores = []
    for contig in contigs:
        for genome in genomes:
            p_val = model.log_probability(
                contig, genome.pseudo_par, total_read_count, assembly_length)
            scores.append(Score(p_val, contig, genome, contig.contig_id))

    # Use '\n' (not os.linesep) so the header ends the same way as the rows;
    # text-mode stdout already translates '\n' to the platform line ending.
    sys.stdout.write("p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id\n")
    for score in scores:
        sys.stdout.write(str(score) + '\n')