Пример #1
0
def GTDB_for_BLCA(args):
    """Split a GTDB SSU FASTA file into a plain FASTA file and a taxonomy table.

    For every record of the FASTA file ``args['GTDB_ssu']`` two outputs are
    written, named ``<path>/<basename>_BLCAparsed.fasta`` and
    ``<path>/<basename>_BLCAparsed.taxonomy`` (path and basename as reported
    by ``sep_path_basename_ext``):

    * the FASTA output holds ``>record_id`` followed by the sequence;
    * the taxonomy output holds ``record_id<TAB>rank:taxon;...`` with the
      ranks listed from species up to superkingdom.

    Args:
        args: Mapping with the input FASTA path under the key ``'GTDB_ssu'``.

    Raises:
        IndexError: If the taxonomy part of a record description has fewer
            than seven ';'-separated fields.
    """
    GTDB_db_file = args['GTDB_ssu']

    GTDB_db_file_path, GTDB_db_file_basename, GTDB_db_file_ext = sep_path_basename_ext(GTDB_db_file)
    file_out_sequence = '%s/%s_BLCAparsed.fasta'    % (GTDB_db_file_path, GTDB_db_file_basename)
    file_out_taxonomy = '%s/%s_BLCAparsed.taxonomy' % (GTDB_db_file_path, GTDB_db_file_basename)

    # ordered from the lowest rank to the highest, i.e. the reverse of the
    # order in which the fields are read from the description
    rank_list = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']

    # context managers make sure both outputs are closed even if parsing fails
    with open(file_out_sequence, 'w') as file_out_sequence_handle, \
            open(file_out_taxonomy, 'w') as file_out_taxonomy_handle:
        for seq_record in SeqIO.parse(GTDB_db_file, 'fasta'):
            # keep what lies between the first space-separated token and the
            # first ' [' of the description
            # NOTE(review): presumably "<id> <taxonomy> [<extra>]" - confirm
            taxon_str = ' '.join(seq_record.description.strip().split(' [')[0].split(' ')[1:])

            # drop the first three characters of every field (presumably a
            # rank prefix such as "d__" - confirm) and reverse the field order
            taxa_reversed = [taxon[3:] for taxon in taxon_str.split(';')][::-1]

            # indexing (rather than zip) keeps the IndexError on short lineages
            GTDB_seq_taxon_str = ''.join(
                '%s:%s;' % (taxon_rank, taxa_reversed[rank_index])
                for rank_index, taxon_rank in enumerate(rank_list))

            # write out to taxonomy file
            file_out_taxonomy_handle.write('%s\t%s\n' % (seq_record.id, GTDB_seq_taxon_str))

            # write out to sequence file
            file_out_sequence_handle.write('>%s\n' % seq_record.id)
            file_out_sequence_handle.write('%s\n' % seq_record.seq)
Пример #2
0
def reads2bam(args):
    """Map reads to a reference with bowtie2 and turn the result into a sorted, indexed BAM.

    The shell commands are run one after another through ``os.system``:
    optional ``bowtie2-build``, ``bowtie2``, ``samtools view``,
    ``samtools sort`` and ``samtools index``.  Unless ``args['tmp']`` is
    something other than ``False``, the intermediate ``.sam`` and unsorted
    ``.bam`` files are removed afterwards.

    Args:
        args: Mapping with the keys ``'p'`` (output prefix), ``'ref'``
            (reference sequence), ``'index_ref'`` (run bowtie2-build when
            ``True``), ``'r1'``/``'r2'`` (paired reads or ``None``), ``'u'``
            (unpaired reads or ``None``), ``'fastq'`` (pass ``-q`` instead of
            ``-f`` to bowtie2 when ``True``), ``'t'`` (thread number) and
            ``'tmp'`` (keep intermediate files unless ``False``).
    """
    prefix = args['p']
    reference = args['ref']
    build_index = args['index_ref']
    mate1 = args['r1']
    mate2 = args['r2']
    singles = args['u']
    reads_are_fastq = args['fastq']
    threads = args['t']
    keep_intermediates = args['tmp']

    _, ref_basename, _ = sep_path_basename_ext(reference)

    build_cmd = 'bowtie2-build -f %s %s --threads %s' % (reference, ref_basename, threads)

    # work out which read options go on the bowtie2 command line
    have_pair = (mate1 is not None) and (mate2 is not None)
    singles_only = (mate1 is None) and (mate2 is None) and (singles is not None)
    if have_pair:
        reads_options = '-1 %s -2 %s' % (mate1, mate2)
        if singles is not None:
            reads_options = '%s -U %s' % (reads_options, singles)
    elif singles_only:
        reads_options = '-U %s' % (singles,)
    else:
        # only one mate given, or no reads at all
        print('Please check your input reads files')
        exit()

    format_flag = '-q' if reads_are_fastq is True else '-f'
    bowtie2_cmd = 'bowtie2 -x %s %s -S %s.sam -p %s %s' % (
        ref_basename, reads_options, prefix, threads, format_flag)

    # assemble the pipeline in execution order
    pipeline = []
    if build_index is True:
        pipeline.append(build_cmd)
    pipeline.append(bowtie2_cmd)
    pipeline.append('samtools view -bS %s.sam -o %s.bam' % (prefix, prefix))
    pipeline.append('samtools sort %s.bam -o %s_sorted.bam' % (prefix, prefix))
    pipeline.append('samtools index %s_sorted.bam' % prefix)
    if keep_intermediates is False:
        pipeline.append('rm %s.sam' % prefix)
        pipeline.append('rm %s.bam' % prefix)

    for shell_cmd in pipeline:
        os.system(shell_cmd)
Пример #3
0
def sam2bam(args):
    """Convert a SAM file into a sorted and indexed BAM file with samtools.

    Writes ``<path>/<basename>_sorted.bam`` (plus the index created by
    ``samtools index``), where path and basename come from
    ``sep_path_basename_ext``; the unsorted intermediate BAM is removed at
    the end.

    Args:
        args: Mapping with the SAM file path under the key ``'sam'``.
    """
    sam_file = args['sam']

    sam_dir, sam_stem, _ = sep_path_basename_ext(sam_file)

    unsorted_bam = '%s/%s.bam' % (sam_dir, sam_stem)
    sorted_bam = '%s/%s_sorted.bam' % (sam_dir, sam_stem)

    # commands in execution order; the last one drops the unsorted BAM
    shell_cmds = (
        'samtools view -bS %s -o %s' % (sam_file, unsorted_bam),
        'samtools sort %s -o %s' % (unsorted_bam, sorted_bam),
        'samtools index %s' % sorted_bam,
        'rm %s' % unsorted_bam,
    )
    for shell_cmd in shell_cmds:
        os.system(shell_cmd)
Пример #4
0
def download_GenBank_genome(args):
    """Download the genomes listed in a CSV file using a pool of worker processes.

    Every line of the CSV file that does not start with ``#Organism Name`` is
    split on ',' and handed, together with the download options, to
    ``genome_download_worker``.  Files are requested into the folder
    ``<csv basename>_genomes``, which is (re)created through
    ``force_create_folder``.

    Args:
        args: Mapping with the keys ``'csv'`` (input CSV file), ``'fna'``,
            ``'faa'`` and ``'gbff'`` (file types to download; the program
            exits if all three are ``False``), ``'name'`` (forwarded to the
            worker unchanged) and ``'t'`` (number of worker processes).
    """
    csv_file = args['csv']
    get_fna = args['fna']
    get_faa = args['faa']
    get_gbff = args['gbff']
    with_name = args['name']
    num_threads = args['t']

    time_format = '[%Y-%m-%d %H:%M:%S] '

    # nothing to do if no file type was requested
    if (get_fna is False) and (get_faa is False) and (get_gbff is False):
        print(
            datetime.now().strftime(time_format) +
            'Please specify at least one file type to download, program exited'
        )
        exit()

    in_file_path, in_file_basename, in_file_extension = sep_path_basename_ext(
        csv_file)
    downloaded_genome_folder = '%s_genomes' % in_file_basename
    force_create_folder(downloaded_genome_folder)

    # report
    print(datetime.now().strftime(time_format) +
          'Downloading genomes with %s cores' % (num_threads))

    # build one argument list per genome record; the context manager closes
    # the CSV file as soon as it has been read
    list_for_multiple_arguments_download_worker = []
    with open(csv_file) as csv_file_handle:
        for genome_record in csv_file_handle:

            # skip the header line
            if not genome_record.startswith('#Organism Name'):
                genome_record_split = genome_record.strip().split(',')
                list_for_multiple_arguments_download_worker.append([
                    genome_record_split, downloaded_genome_folder, get_fna,
                    get_faa, get_gbff, with_name
                ])

    # download genomes with multiprocessing
    pool = mp.Pool(processes=num_threads)
    pool.map(genome_download_worker,
             list_for_multiple_arguments_download_worker)
    pool.close()
    pool.join()
Пример #5
0
def run_blast_worker(argument_list):
    """Search one query file against the KEGG database and keep the best hit per query.

    The working folder ``<op_dir>/<basename>_KEGG_wd`` is always (re)created
    through ``force_create_folder``.  The search itself only runs when the
    ``run_blast`` item is ``True``: blastp is used when ``run_diamond`` is
    ``False``, diamond otherwise.  The tabular hits are then reduced by
    ``keep_blast_hit_with_highest_bit_score``.

    Args:
        argument_list: Sequence holding, in this order: query file, run_blast
            flag, run_diamond flag, blastp database, diamond database, output
            directory, e-value cutoff and number of threads.
    """
    (query_file, do_search, use_diamond, blastp_db, diamond_db,
     out_dir, evalue, threads) = [argument_list[position] for position in range(8)]

    _, query_stem, _ = sep_path_basename_ext(query_file)

    # everything for this query lives in its own working folder
    work_dir = '%s/%s_KEGG_wd' % (out_dir, query_stem)
    hits_table = '%s/%s_blast.tab' % (work_dir, query_stem)
    best_hits_table = '%s/%s_blast_best_hits.tab' % (work_dir, query_stem)

    force_create_folder(work_dir)

    # annotation results were supplied by the caller: nothing to search
    if do_search is not True:
        return

    if use_diamond is False:
        search_cmd = 'blastp -query %s -db %s -out %s -outfmt 6 -evalue %s -num_alignments 10 -num_threads %s' % (
            query_file, blastp_db, hits_table, evalue, threads)
    else:
        search_cmd = 'diamond blastp -q %s --db %s --out %s --outfmt 6 --evalue %s --block-size 1 --threads %s --quiet' % (
            query_file, diamond_db, hits_table, evalue, threads)
    os.system(search_cmd)

    # only keep the best hit
    keep_blast_hit_with_highest_bit_score(hits_table, best_hits_table)
Пример #6
0
def dbCAN_worker(argument_list):
    """Annotate one sequence file against the dbCAN family HMMs and summarise the hits.

    Steps, all inside ``<output_folder>/<basename>_dbCAN_wd``:

    1. translate the input with ``dna2aa`` when the sequence type is N/n;
    2. run hmmscan, the hmmscan parser script and an awk filter
       (``$5<1e-18&&$10>0.35``) through ``os.system``;
    3. write ``<basename>_dbCAN.txt`` (query, family, activities);
    4. write per-family gene counts and, via ``AnnotateNorm``, their
       normalised version;
    5. when a depth file is given, write per-family summed depth and its
       normalised version.

    Args:
        argument_list: Sequence holding, in this order: input sequence file,
            path of the hmmscan parser script, path of the dbCAN family HMMs,
            sequence type ('N'/'n' or 'P'/'p'), output folder, mapping from
            family id to activities, and a tab-separated depth file
            (gene id, depth) or ``None``.

    Raises:
        KeyError: If a depth file is given and a gene of the input file is
            missing from it.
    """
    pwd_input_file = argument_list[0]
    pwd_hmmscan_parser = argument_list[1]
    pwd_dbCAN_fam_HMMs = argument_list[2]
    sequence_type = argument_list[3]
    output_folder = argument_list[4]
    fam_to_activities_dict = argument_list[5]
    depth_file = argument_list[6]

    input_seq_path, input_seq_no_ext, input_seq_ext = sep_path_basename_ext(
        pwd_input_file)
    current_output_folder = '%s/%s_dbCAN_wd' % (output_folder,
                                                input_seq_no_ext)

    force_create_folder(current_output_folder)

    # common prefix of every file produced for this input
    output_prefix = '%s/%s' % (current_output_folder, input_seq_no_ext)

    if sequence_type in ('N', 'n'):
        input_seq_aa = '%s_aa.fasta' % output_prefix
        dna2aa(pwd_input_file, input_seq_aa)
    elif sequence_type in ('P', 'p'):
        input_seq_aa = pwd_input_file
    else:
        print('Specified input sequence type unrecognizable, program exited!')
        exit()

    hmmscan_cmd = "hmmscan --domtblout %s.out.dm %s %s > %s.out" % (
        output_prefix, pwd_dbCAN_fam_HMMs, input_seq_aa, output_prefix)

    hmmscan_parser_cmd = "sh %s %s.out.dm > %s.out.dm.ps" % (
        pwd_hmmscan_parser, output_prefix, output_prefix)

    final_cat_cmd = "cat %s.out.dm.ps | awk '$5<1e-18&&$10>0.35' > %s.out.dm.ps.stringent" % (
        output_prefix, output_prefix)

    os.system(hmmscan_cmd)
    os.system(hmmscan_parser_cmd)
    os.system(final_cat_cmd)

    ################################### get functional descriptions for query genes ####################################

    pwd_hmmscan_hits_stringent = '%s.out.dm.ps.stringent' % output_prefix
    pwd_annotation_results = '%s_dbCAN.txt' % output_prefix
    pwd_annotation_results_stats_GeneNumber = '%s_dbCAN_stats_GeneNumber.txt' % output_prefix
    pwd_annotation_results_stats_GeneNumber_pct = '%s_dbCAN_stats_GeneNumber_pct.txt' % output_prefix
    pwd_annotation_results_stats_TotalDepth = '%s_dbCAN_stats_TotalDepth.txt' % output_prefix
    pwd_annotation_results_stats_TotalDepth_pct = '%s_dbCAN_stats_TotalDepth_pct.txt' % output_prefix

    # read in depth info
    gene_depth_dict = {}
    if depth_file is not None:
        with open(depth_file) as depth_file_handle:
            for each_depth in depth_file_handle:
                each_depth_split = each_depth.strip().split('\t')
                gene_depth_dict[each_depth_split[0]] = float(each_depth_split[1])

    # get all sequences in input seq file
    query_seq_list = [query_seq.id for query_seq in SeqIO.parse(pwd_input_file, 'fasta')]

    # get total depth of all genes in one file; the total itself is not used
    # further down, but the lookup fails early (KeyError) when a gene of the
    # input file has no entry in the depth file
    total_depth_for_all_query_genes = 0
    if depth_file is not None:
        for gene in query_seq_list:
            total_depth_for_all_query_genes += gene_depth_dict[gene]

    # parse hmmscan results
    hmm_to_gene_member_dict = {}
    with open(pwd_annotation_results, 'w') as pwd_annotation_results_handle:
        pwd_annotation_results_handle.write('Query\tFamily\tActivities\n')
        with open(pwd_hmmscan_hits_stringent) as hmmscan_hits_handle:
            for hmm_hit in hmmscan_hits_handle:
                hmm_hit_split = hmm_hit.strip().split('\t')
                query_id = hmm_hit_split[2]
                matched_hmm = hmm_hit_split[0]
                matched_hmm_id = matched_hmm.split('.hmm')[0]

                # group the query genes by the hmm they matched
                hmm_to_gene_member_dict.setdefault(matched_hmm_id, []).append(query_id)

                # write out
                pwd_annotation_results_handle.write('%s\t%s\t%s\n' % (
                    query_id, matched_hmm,
                    _dbCAN_family_activities(matched_hmm_id, fam_to_activities_dict)))

    #################### get summary of annotation results GeneNumber ####################

    total_GeneNumber_identified = 0
    with open(pwd_annotation_results_stats_GeneNumber, 'w') as stats_GeneNumber_handle:
        stats_GeneNumber_handle.write('Family\tGeneNumber\tActivities\n')
        # keys already had their '.hmm' suffix removed when they were stored
        for each_hmm_id in hmm_to_gene_member_dict:
            each_hmm_GeneNumber = len(hmm_to_gene_member_dict[each_hmm_id])
            stats_GeneNumber_handle.write('%s\t%s\t%s\n' % (
                each_hmm_id, each_hmm_GeneNumber,
                _dbCAN_family_activities(each_hmm_id, fam_to_activities_dict)))
            total_GeneNumber_identified += each_hmm_GeneNumber

    #################### get summary of annotation results GeneNumber pct ####################

    AnnotateNorm(file_in=pwd_annotation_results_stats_GeneNumber,
                 skip_header=True,
                 value_column=2,
                 Divisor_value=total_GeneNumber_identified,
                 file_out=pwd_annotation_results_stats_GeneNumber_pct,
                 file_out_header='Family\tGeneNumber_pct\tActivities\n')

    #################### get summary of annotation results TotalDepth ####################

    if depth_file is not None:
        total_depth_identified = 0
        with open(pwd_annotation_results_stats_TotalDepth, 'w') as stats_TotalDepth_handle:
            stats_TotalDepth_handle.write('Family\tTotalDepth\tActivities\n')
            for each_hmm_id in hmm_to_gene_member_dict:
                # plain left-to-right accumulation, then rounded to 2 decimals
                each_hmm_TotalDepth = 0
                for each_gene in hmm_to_gene_member_dict[each_hmm_id]:
                    each_hmm_TotalDepth += gene_depth_dict[each_gene]
                each_hmm_TotalDepth = float("{0:.2f}".format(each_hmm_TotalDepth))

                stats_TotalDepth_handle.write('%s\t%s\t%s\n' % (
                    each_hmm_id, each_hmm_TotalDepth,
                    _dbCAN_family_activities(each_hmm_id, fam_to_activities_dict)))
                total_depth_identified += each_hmm_TotalDepth

        #################### get summary of annotation results TotalDepth pct ####################

        AnnotateNorm(file_in=pwd_annotation_results_stats_TotalDepth,
                     skip_header=True,
                     value_column=2,
                     Divisor_value=total_depth_identified,
                     file_out=pwd_annotation_results_stats_TotalDepth_pct,
                     file_out_header='Family\tTotalDepth_pct\tActivities\n')


def _dbCAN_family_activities(hmm_id, fam_to_activities_dict):
    """Return the activities stored for the family of hmm_id, or 'NA' if there are none.

    The family is the part of ``hmm_id`` in front of its first underscore
    (the whole id when it has no underscore).
    """
    family_id = hmm_id.split('_')[0]
    if family_id in fam_to_activities_dict:
        return fam_to_activities_dict[family_id]
    return 'NA'
Пример #7
0
def parse_blast_op_worker(argument_list):
    """Assign KEGG KOs to the genes of one input file and write summary statistics.

    All outputs go to ``<op_dir>/<basename>_KEGG_wd/`` and are prefixed with
    the input file basename:

    * ``_KO_assignment_D.txt``: gene id and KO, one line per gene/KO pair
      (only written when ``run_blast`` is ``True``; otherwise the input file
      itself is read as this table);
    * ``_ko_assignment_ABCD.txt``: gene id, the KO path at levels A-D and the
      four descriptions;
    * ``_ko_stats_<level>_GeneNumber.txt`` / ``_TotalDepth.txt`` written by
      ``write_out_stats_GeneNumber`` / ``write_out_stats_TotalDepth``;
    * ``_pct`` and ``_pct_by_all`` versions of the stats files produced by
      ``AnnotateNorm``, dividing by the genes with a KO and by all query
      genes respectively.

    Depth based files are only produced when a depth file is given, and the
    ``_pct_by_all`` files only when ``pct_by_all`` is ``True``.

    Args:
        argument_list: Sequence holding, in this order: input file, run_blast
            flag, the description mappings for levels A, B, C and D, the
            mapping from a level-D KO to its list of '|'-joined A-D paths,
            the mapping from database sequence id to a comma-separated KO
            string, the output directory, a tab-separated depth file
            (gene id, depth) or ``None``, and the pct_by_all flag.

    Raises:
        KeyError: If a depth file is given and a query gene is missing from
            it, or if a KO path refers to an id without a description.
    """
    pwd_input_file = argument_list[0]
    run_blast = argument_list[1]
    As_description_dict = argument_list[2]
    Bs_description_dict = argument_list[3]
    Cs_description_dict = argument_list[4]
    Ds_description_dict = argument_list[5]
    D2ABCD_dict = argument_list[6]
    db_seq_to_KO_dict = argument_list[7]
    op_dir = argument_list[8]
    depth_file = argument_list[9]
    pct_by_all = argument_list[10]

    ko_levels = ('A', 'B', 'C', 'D')
    description_dicts = {'A': As_description_dict,
                         'B': Bs_description_dict,
                         'C': Cs_description_dict,
                         'D': Ds_description_dict}

    ################################################### define file name ###################################################

    input_file_path, in_file_basename, input_file_ext = sep_path_basename_ext(
        pwd_input_file)

    # common prefix of every file written for this input
    wd_prefix = '%s/%s_KEGG_wd/%s' % (op_dir, in_file_basename, in_file_basename)

    blast_results_best_hit = '%s_blast_best_hits.tab' % wd_prefix
    KO_assignment_file_D = '%s_KO_assignment_D.txt' % wd_prefix
    KO_assignment_file_DCBA = '%s_ko_assignment_ABCD.txt' % wd_prefix

    def stats_file(ko_level, stats_type):
        # e.g. stats_file('A', 'GeneNumber_pct') -> <prefix>_ko_stats_A_GeneNumber_pct.txt
        return '%s_ko_stats_%s_%s.txt' % (wd_prefix, ko_level, stats_type)

    ################################################# parse blast results ##################################################

    if run_blast is True:

        # store blast results in dict
        query_to_db_seq_dict = {}
        with open(blast_results_best_hit) as blast_results_handle:
            for each_hit in blast_results_handle:
                each_hit_split = each_hit.strip().split('\t')
                query_to_db_seq_dict[each_hit_split[0]] = each_hit_split[1]

        # get all query sequence id
        query_seq_id_list = [str(each_seq.id)
                             for each_seq in SeqIO.parse(pwd_input_file, 'fasta')]

        # get ko id at level D for all query genes
        with open(KO_assignment_file_D, 'w') as KO_assignment_file_handle:
            for each_query_seq in sorted(query_seq_id_list):
                db_hit_id_ko = None
                if each_query_seq in query_to_db_seq_dict:
                    db_hit_id = query_to_db_seq_dict[each_query_seq]
                    if db_hit_id in db_seq_to_KO_dict:
                        db_hit_id_ko = db_seq_to_KO_dict[db_hit_id]

                if db_hit_id_ko is None:
                    # no hit, or hit without KO: gene id only
                    KO_assignment_file_handle.write('%s\n' % (each_query_seq))
                else:
                    # one line per KO; a string without ',' yields a single line
                    for each_db_hit_id_ko in db_hit_id_ko.split(','):
                        KO_assignment_file_handle.write(
                            '%s\t%s\n' % (each_query_seq, each_db_hit_id_ko))

    else:
        # the caller supplied the gene-to-KO table directly
        KO_assignment_file_D = pwd_input_file

    # get ko id at all levels for all query genes
    query_seq_id_all = set()
    genes_with_ko = set()
    with open(KO_assignment_file_DCBA, 'w') as ko_assign_ABCD_handle:
        ko_assign_ABCD_handle.write(
            'Gene_id\tko_A\tko_B\tko_C\tko_D\tDesc_A\tDesc_B\tDesc_C\tDesc_D\n')
        with open(KO_assignment_file_D) as KO_assignment_file_handle:
            for query_gene in KO_assignment_file_handle:
                query_gene_split = query_gene.strip().split('\t')
                gene_ID = query_gene_split[0]

                if len(query_gene_split) == 1:
                    query_seq_id_all.add(gene_ID)
                    ko_assign_ABCD_handle.write('%s\n' % gene_ID)

                if len(query_gene_split) == 2:
                    query_seq_id_all.add(gene_ID)
                    genes_with_ko.add(gene_ID)
                    KO_ID = query_gene_split[1]

                    # a KO unknown to D2ABCD_dict produces no line at all
                    if KO_ID in D2ABCD_dict:
                        # one output line per A-D path of this KO
                        for each_ABCD in D2ABCD_dict[KO_ID]:
                            each_KO_DCBA_list = each_ABCD.split('|')[::-1]
                            each_KO_DCBA_list_only_id = [
                                i.split('_')[1] for i in each_KO_DCBA_list
                            ]
                            each_desc_A = As_description_dict[each_KO_DCBA_list_only_id[3]]
                            each_desc_B = Bs_description_dict[each_KO_DCBA_list_only_id[2]]
                            each_desc_C = Cs_description_dict[each_KO_DCBA_list_only_id[1]]
                            each_desc_D = Ds_description_dict[each_KO_DCBA_list_only_id[0]]
                            ko_assign_ABCD_handle.write(
                                '%s\t%s\t%s\t%s\t%s\t%s\n' %
                                (gene_ID, '\t'.join(each_KO_DCBA_list[::-1]),
                                 each_desc_A, each_desc_B, each_desc_C, each_desc_D))

    ##################################################### Get summary ######################################################

    # read in depth info
    gene_depth_dict = {}
    if depth_file is not None:
        with open(depth_file) as depth_file_handle:
            for each_depth in depth_file_handle:
                each_depth_split = each_depth.strip().split('\t')
                gene_depth_dict[each_depth_split[0]] = float(each_depth_split[1])

    # get total number and depth of all genes in one file
    total_depth_for_all_query_genes = 0
    genes_with_ko_TotalDepth = 0
    if depth_file is not None:
        for gene in query_seq_id_all:
            total_depth_for_all_query_genes += gene_depth_dict[gene]

        genes_with_ko_TotalDepth = get_gene_list_TotalDepth(
            genes_with_ko, gene_depth_dict)

    # per level: KOs in order of first appearance, and the genes of every KO
    # (unique, in order of first appearance); the sets only speed up the
    # "already recorded?" test
    identified_ko_lists = {ko_level: [] for ko_level in ko_levels}
    ko_to_gene_member_dicts = {ko_level: {} for ko_level in ko_levels}
    ko_to_gene_member_sets = {ko_level: {} for ko_level in ko_levels}
    with open(KO_assignment_file_DCBA) as ko_assign_ABCD_handle:
        # skip the header by position, so that a gene whose id happens to
        # start with 'Gene_id' is not dropped
        next(ko_assign_ABCD_handle, None)
        for each_query in ko_assign_ABCD_handle:
            each_query_split = each_query.strip().split('\t')

            # genes without KO contribute nothing to the stats
            if len(each_query_split) == 1:
                continue

            query_id = each_query_split[0]
            # columns 1-4 hold the KO ids at levels A-D
            query_kos = [each_query_split[column] for column in (1, 2, 3, 4)]

            for ko_level, query_ko in zip(ko_levels, query_kos):
                gene_member_dict = ko_to_gene_member_dicts[ko_level]
                gene_member_sets = ko_to_gene_member_sets[ko_level]
                if query_ko not in gene_member_dict:
                    identified_ko_lists[ko_level].append(query_ko)
                    gene_member_dict[query_ko] = [query_id]
                    gene_member_sets[query_ko] = {query_id}
                elif query_id not in gene_member_sets[query_ko]:
                    gene_member_dict[query_ko].append(query_id)
                    gene_member_sets[query_ko].add(query_id)

    #################### write out GeneNumber and TotalDepth stats ####################

    for ko_level in ko_levels:
        write_out_stats_GeneNumber(identified_ko_lists[ko_level],
                                   ko_to_gene_member_dicts[ko_level],
                                   description_dicts[ko_level],
                                   stats_file(ko_level, 'GeneNumber'))
    if depth_file is not None:
        for ko_level in ko_levels:
            write_out_stats_TotalDepth(identified_ko_lists[ko_level],
                                       ko_to_gene_member_dicts[ko_level],
                                       gene_depth_dict,
                                       description_dicts[ko_level],
                                       stats_file(ko_level, 'TotalDepth'))

    #################### write out GeneNumber and TotalDepth stats (pct) ####################

    for ko_level in ko_levels:
        AnnotateNorm(stats_file(ko_level, 'GeneNumber'), True, 2,
                     len(genes_with_ko),
                     stats_file(ko_level, 'GeneNumber_pct'),
                     'KO\tGeneNumber_pct\tDescription\n')
    if depth_file is not None:
        for ko_level in ko_levels:
            AnnotateNorm(stats_file(ko_level, 'TotalDepth'), True, 2,
                         genes_with_ko_TotalDepth,
                         stats_file(ko_level, 'TotalDepth_pct'),
                         'KO\tTotalDepth_pct\tDescription\n')

    #################### write out GeneNumber and TotalDepth stats (pct_by_all) ####################

    if pct_by_all is True:
        for ko_level in ko_levels:
            AnnotateNorm(stats_file(ko_level, 'GeneNumber'), True, 2,
                         len(query_seq_id_all),
                         stats_file(ko_level, 'GeneNumber_pct_by_all'),
                         'KO\tGeneNumber_pct_by_all\tDescription\n')
        if depth_file is not None:
            for ko_level in ko_levels:
                AnnotateNorm(stats_file(ko_level, 'TotalDepth'), True, 2,
                             total_depth_for_all_query_genes,
                             stats_file(ko_level, 'TotalDepth_pct_by_all'),
                             'KO\tTotalDepth_pct_by_all\tDescription\n')
Пример #8
0
def Annotation_KEGG(args):
    """Run KEGG annotation for a single input file or a folder of files.

    Exactly one of ``args['seq_in']`` (sequences that are searched with
    blastp/diamond first) or ``args['ko_in']`` (annotation results, search
    skipped) must be provided. The value may point to a file or a folder.

    Keys read from ``args``:
        seq_in, ko_in: input file/folder, see above.
        x: file extension used to collect the inputs of a folder.
        depth: optional depth file (single-file input) or folder holding one
            ``<basename>.depth`` file per input file (folder input).
        pct_by_all: forwarded to the parsing worker and get_KEGG_annot_df.
        db_dir: folder holding kegg_db_seq.fasta, kegg_db_seq2ko.txt and
            ko00001.keg.
        diamond: True selects the diamond database, False the blast+ one
            (decides which database index is checked/built here).
        t: number of threads (single file) / worker processes (folder).
        evalue: e-value cutoff forwarded to run_blast_worker.

    Progress is printed to stdout; invalid input ends the program through
    ``exit()``. Returns None.
    """

    input_file_faa = args['seq_in']
    input_file_user_ko = args['ko_in']
    file_extension = args['x']
    depth_file = args['depth']
    pct_by_all = args['pct_by_all']
    KEGG_DB_folder = args['db_dir']
    run_diamond = args['diamond']
    num_threads = args['t']
    evalue_cutoff = args['evalue']

    def report(message):
        # every progress line is prefixed with the current timestamp
        print(datetime.now().strftime(time_format) + message)

    # exactly one of the two inputs decides whether a search has to be run
    if (input_file_faa is not None) and (input_file_user_ko is None):
        run_blast = True
        input_file_folder = input_file_faa
    elif (input_file_faa is None) and (input_file_user_ko is not None):
        run_blast = False
        input_file_folder = input_file_user_ko
    else:
        report('Please provide input file with either "-seq_in" or "-ko_in", do not provide both')
        exit()

    # check whether input file/folder exist
    input_is_file = os.path.isfile(input_file_folder)
    input_is_folder = os.path.isdir(input_file_folder)
    if not (input_is_file or input_is_folder):
        report('input file/folder not found, program exited')
        exit()

    if input_is_folder:
        # A trailing slash ("genomes/") would otherwise produce the output
        # folder "genomes/_KEGG_wd" and an empty folder name for the matrices.
        input_file_folder = input_file_folder.rstrip('/') or input_file_folder

    if run_blast:
        report('Input sequence file detected, will run blastp/diamond first')
    else:
        report('Annotation results provided, blastp/diamond skipped')
    sleep(0.5)

    ################################################# define file name #################################################

    KEGG_DB_seq = '%s/kegg_db_seq.fasta' % KEGG_DB_folder
    KEGG_DB_seq_diamond = '%s/kegg_db_seq.fasta.dmnd' % KEGG_DB_folder
    KEGG_DB_seq2ko = '%s/kegg_db_seq2ko.txt' % KEGG_DB_folder
    KEGG_DB_ko = '%s/ko00001.keg' % KEGG_DB_folder

    ########################################## check whether diamond db exist ##########################################

    if run_blast and (run_diamond is True):
        if os.path.isfile(KEGG_DB_seq_diamond) is False:
            report('DB file not found, making diamond db with %s' % KEGG_DB_seq)

            if os.path.isfile(KEGG_DB_seq) is True:
                diamond_makedb_cmd = 'diamond makedb --in %s --db %s --quiet' % (
                    KEGG_DB_seq, KEGG_DB_seq_diamond)
                os.system(diamond_makedb_cmd)
            else:
                report('%s not found, program exited' % KEGG_DB_seq)
                exit()

    ########################################### check whether blast+ db exist ##########################################

    if run_blast and (run_diamond is False):
        db_index_extensions = ['phr', 'pin', 'pnd', 'pni', 'pog', 'psd', 'psi', 'psq']
        unfound_db_index_file = [
            db_index for db_index in db_index_extensions
            if not os.path.isfile('%s/kegg_db_seq.fasta.%s' % (KEGG_DB_folder, db_index))
        ]
        if len(unfound_db_index_file) > 0:
            report('blast db index not found, runing makeblastdb first')
            makeblastdb_cmd = 'makeblastdb -in %s -dbtype prot -parse_seqids -logfile %s.log' % (
                KEGG_DB_seq, KEGG_DB_seq)
            os.system(makeblastdb_cmd)
            report('makeblastdb finished')

    ######################################### Run blastp with multiprocessing ##########################################

    # single input file: one worker call that gets all threads
    if input_is_file:
        input_file_path, input_file_basename, input_file_ext = sep_path_basename_ext(
            input_file_folder)
        run_blast_worker([
            input_file_folder, run_blast, run_diamond, KEGG_DB_seq,
            KEGG_DB_seq_diamond, input_file_path, evalue_cutoff, num_threads
        ])

    # input folder: one single-threaded worker per file, run in a pool
    if input_is_folder:

        # create output folder
        output_folder = '%s_KEGG_wd' % input_file_folder
        force_create_folder(output_folder)

        # check whether input genome exist
        input_file_re = '%s/*.%s' % (input_file_folder, file_extension)
        input_file_name_list = [
            os.path.basename(file_name)
            for file_name in glob.glob(input_file_re)
        ]

        if len(input_file_name_list) == 0:
            report('input file not found, program exited')
            exit()

        if run_blast:
            report('Running Blast/Diamond for %s input files with %s cores' %
                   (len(input_file_name_list), num_threads))

        list_for_multiple_arguments_blast = [
            [
                '%s/%s' % (input_file_folder, input_file), run_blast,
                run_diamond, KEGG_DB_seq, KEGG_DB_seq_diamond, output_folder,
                evalue_cutoff, 1
            ]
            for input_file in input_file_name_list
        ]

        pool = mp.Pool(processes=num_threads)
        pool.map(run_blast_worker, list_for_multiple_arguments_blast)
        pool.close()
        pool.join()

    ############################################## Read in KEGG DB files ###############################################

    report('Read in KEGG DB files')

    As_description_dict = {}
    Bs_description_dict = {}
    Cs_description_dict = {}
    Ds_description_dict = {}
    D2ABCD_dict = {}
    current_A = ''
    current_B = ''
    current_C = ''
    with open(KEGG_DB_ko) as KEGG_DB_ko_handle:
        for each_line in KEGG_DB_ko_handle:
            level = each_line[0]
            if level not in ('A', 'B', 'C', 'D'):
                continue

            # Splitting on single spaces keeps the indentation as empty
            # fields, which is why the id sits at index 0/2/4/6 for A/B/C/D.
            each_line_split = each_line.strip().split(' ')

            if level == 'A':
                current_A = each_line_split[0]
                As_description_dict[current_A] = ' '.join(each_line_split[1:])

            elif level == 'B':
                # lines holding nothing but "B" carry no id and are ignored
                if len(each_line_split) > 1:
                    current_B = each_line_split[2]
                    Bs_description_dict[current_B] = ' '.join(each_line_split[3:])

            elif level == 'C':
                current_C = each_line_split[4]
                Cs_description_dict[current_C] = ' '.join(each_line_split[5:])

            else:
                current_D_id = each_line_split[6]
                Ds_description_dict[current_D_id] = ' '.join(each_line_split[7:])
                ABCD_value = 'A_%s|B_%s|C_%s|D_%s' % (current_A, current_B,
                                                      current_C, current_D_id)
                # the same D id can occur under several A/B/C paths
                if current_D_id not in D2ABCD_dict:
                    D2ABCD_dict[current_D_id] = [ABCD_value]
                elif ABCD_value not in D2ABCD_dict[current_D_id]:
                    D2ABCD_dict[current_D_id].append(ABCD_value)

    # get db_seq_to_KO_dict
    db_seq_to_KO_dict = {}
    if run_blast:
        with open(KEGG_DB_seq2ko) as KEGG_DB_seq2ko_handle:
            for each_hit in KEGG_DB_seq2ko_handle:
                each_hit_split = each_hit.strip().split('\t')
                # strip() removes a trailing tab, so a sequence without KO
                # yields a single field; such lines are skipped, not fatal
                if (len(each_hit_split) > 1) and (each_hit_split[1] != ''):
                    db_seq_to_KO_dict[each_hit_split[0]] = each_hit_split[1]

    ########################################################################################################################

    # single input file
    if input_is_file:

        # check whether depth file exist
        if (depth_file is not None) and (os.path.isfile(depth_file) is False):
            report('specified depth file not found, program exited!')
            exit()

        report('Running KEGG annotation for 1 file with %s cores' % (num_threads))
        input_file_path, input_file_basename, input_file_ext = sep_path_basename_ext(
            input_file_folder)
        parse_blast_op_worker([
            input_file_folder, run_blast, As_description_dict,
            Bs_description_dict, Cs_description_dict, Ds_description_dict,
            D2ABCD_dict, db_seq_to_KO_dict, input_file_path, depth_file,
            pct_by_all
        ])

    # input folder
    if input_is_folder:

        input_file_re = '%s/*.%s' % (input_file_folder, file_extension)
        input_file_name_list = [
            os.path.basename(file_name)
            for file_name in glob.glob(input_file_re)
        ]

        # check whether depth file exist
        if depth_file is not None:

            if os.path.isfile(depth_file) is True:
                report('please provide the folder containing individual depth files (with extension .depth) for each of your input sequence file.')
                report('single depth file (not folder) detected, program exited!')
                exit()

            if os.path.isdir(depth_file) is False:
                report('specified depth folder not found, program exited!')
                exit()

            # every input file needs a <basename>.depth file in the folder
            undetected_depth_file = []
            for input_seq_file in input_file_name_list:
                input_seq_file_basename = '.'.join(
                    input_seq_file.split('.')[:-1])
                input_seq_file_depth = '%s/%s.depth' % (
                    depth_file, input_seq_file_basename)
                if os.path.isfile(input_seq_file_depth) is False:
                    undetected_depth_file.append(input_seq_file_depth)

            if len(undetected_depth_file) > 0:
                report('the following depth files not found, program exited!')
                print(','.join(undetected_depth_file))
                exit()

        # output folder (created during the search step) and folder name
        output_folder = '%s_KEGG_wd' % input_file_folder
        input_folder_name = input_file_folder
        if '/' in input_file_folder:
            input_folder_name = input_file_folder.split('/')[-1]

        # parse blast results with multiprocessing
        if run_blast:
            report('Parsing Blast/Diamond results for %s input files with %s cores'
                   % (len(input_file_name_list), num_threads))

        list_for_multiple_arguments_parse_blast_op = []
        for input_file in input_file_name_list:

            input_file_basename = '.'.join(input_file.split('.')[:-1])
            pwd_input_file = '%s/%s' % (input_file_folder, input_file)

            # get path to current depth file
            if depth_file is None:
                input_file_depth = None
            else:
                input_file_depth = '%s/%s.depth' % (depth_file,
                                                    input_file_basename)

            list_for_multiple_arguments_parse_blast_op.append([
                pwd_input_file, run_blast, As_description_dict,
                Bs_description_dict, Cs_description_dict, Ds_description_dict,
                D2ABCD_dict, db_seq_to_KO_dict, output_folder,
                input_file_depth, pct_by_all
            ])

        pool = mp.Pool(processes=num_threads)
        pool.map(parse_blast_op_worker,
                 list_for_multiple_arguments_parse_blast_op)
        pool.close()
        pool.join()

        ######################################################### get dataframe #########################################################

        report('Data matrix exported to:')

        for ko_level in ['A', 'B', 'C', 'D']:
            file_prefix = '%s/%s_%s' % (output_folder, input_folder_name, ko_level)
            annotation_df_GeneNumber = '%s_GeneNumber.txt' % file_prefix
            annotation_df_GeneNumber_pct = '%s_GeneNumber_pct.txt' % file_prefix
            annotation_df_GeneNumber_pct_by_all = '%s_GeneNumber_pct_by_all.txt' % file_prefix
            annotation_df_TotalDepth = '%s_TotalDepth.txt' % file_prefix
            annotation_df_TotalDepth_pct = '%s_TotalDepth_pct.txt' % file_prefix
            annotation_df_TotalDepth_pct_by_all = '%s_TotalDepth_pct_by_all.txt' % file_prefix

            #################### get GeneNumber df and report ####################

            get_KEGG_annot_df(output_folder,
                              ko_level,
                              annotation_df_GeneNumber,
                              annotation_df_GeneNumber_pct,
                              annotation_df_GeneNumber_pct_by_all,
                              with_depth=False,
                              pct_by_all=pct_by_all)

            print(annotation_df_GeneNumber.split('/')[-1])
            print(annotation_df_GeneNumber_pct.split('/')[-1])
            if pct_by_all is True:
                print(annotation_df_GeneNumber_pct_by_all.split('/')[-1])

            #################### get TotalDepth df and report ####################

            if depth_file is not None:
                get_KEGG_annot_df(output_folder,
                                  ko_level,
                                  annotation_df_TotalDepth,
                                  annotation_df_TotalDepth_pct,
                                  annotation_df_TotalDepth_pct_by_all,
                                  with_depth=True,
                                  pct_by_all=pct_by_all)

                print(annotation_df_TotalDepth.split('/')[-1])
                print(annotation_df_TotalDepth_pct.split('/')[-1])
                if pct_by_all is True:
                    print(annotation_df_TotalDepth_pct_by_all.split('/')[-1])

    ################################################## Final report ####################################################

    report('Done!')
Пример #9
0
def get_ref_to_read_num_from_sam(input_sam_file, output_stats_file):
    """Count the records per reference sequence in a SAM file.

    Lines starting with '@' (header) and blank lines are ignored. For every
    other line the third tab-separated field is taken as the reference id.

    The result is written to ``output_stats_file`` as one
    "<reference id><TAB><record count>" line per reference. Lines are sorted
    by plain character order, so the output does not depend on the locale.

    Args:
        input_sam_file: path to the SAM file to read.
        output_stats_file: path of the stats file to write (overwritten).

    Raises:
        IndexError: if a non-header, non-blank line has fewer than three
            tab-separated fields.
    """

    # Store reads num in dict
    ref2read_num_dict = {}
    with open(input_sam_file) as sam_handle:
        for each_read in sam_handle:
            if each_read.startswith('@'):
                continue
            each_read = each_read.strip()
            if not each_read:
                continue
            ref_id = each_read.split('\t')[2]
            ref2read_num_dict[ref_id] = ref2read_num_dict.get(ref_id, 0) + 1

    # Sort the finished lines (not just the ids) in-process; this replaces the
    # former "cat | sort" shell pipeline and its temporary file, which broke on
    # paths containing spaces or shell metacharacters.
    stat_lines = sorted('%s\t%s\n' % (each_ref, read_num)
                        for each_ref, read_num in ref2read_num_dict.items())

    with open(output_stats_file, 'w') as stat_file_handle:
        stat_file_handle.writelines(stat_lines)
Пример #10
0
def get_bin_abundance(args):
    """Write the number and percentage of mapped reads per bin or bin cluster.

    Keys read from ``args``:
        sam: SAM file whose records are counted per reference sequence.
        bin: folder holding the bin fasta files.
        x: suffix of the bin files; it is appended directly to "*" in the
            glob pattern, so it is matched as a plain file name suffix.
        o: output file, written with the columns cluster, read_num, read_pct.
        g: optional tab-separated cluster file (cluster id followed by the
            bin file names belonging to it).
        Cdb: optional dRep Cdb file; converted to a temporary cluster file
            with Cdb_2_bin_cluster_file. Must not be combined with ``g``.

    Without ``g`` and ``Cdb`` every bin file is reported on its own, keyed by
    its file name. ``read_pct`` is relative to all reads assigned to a
    reference other than "*" and is reported as 0.0 when there is no such
    read. Progress is printed to stdout; invalid input ends the program
    through ``exit()``. Returns None.
    """

    ################################################# read in arguments ################################################

    sam_file = args['sam']
    bin_folder = args['bin']
    bin_ext = args['x']
    output_file = args['o']
    cluster_info = args['g']
    dRep_Cdb_file = args['Cdb']

    def report(message):
        # every progress line is prefixed with the current timestamp
        print(datetime.now().strftime(time_format) + message)

    ############################################## define bin_cluster file #############################################

    cluster_file_is_tmp = False
    if (cluster_info is not None) and (dRep_Cdb_file is not None):
        report('cluster_info and dRep_Cdb are not compatible, please specify one only, program exited!')
        exit()
    elif cluster_info is not None:
        bin_cluster_file = cluster_info
    elif dRep_Cdb_file is not None:
        Cdb_file_path, Cdb_file_basename, Cdb_file_extension = sep_path_basename_ext(
            dRep_Cdb_file)
        # the timestamp keeps the derived file from clashing with existing ones
        bin_cluster_file = '%s/%s_derived_cluster_file_%s%s' % (
            Cdb_file_path, Cdb_file_basename,
            datetime.now().strftime('%Y-%m-%d_%Hh-%Mm-%Ss_%f'),
            Cdb_file_extension)
        Cdb_2_bin_cluster_file(dRep_Cdb_file, bin_cluster_file)
        cluster_file_is_tmp = True
    else:
        bin_cluster_file = None

    ################################################ get bin to ctg dict ###############################################

    bin_file_re = '%s/*%s' % (bin_folder, bin_ext)
    bin_file_list = [
        os.path.basename(file_name) for file_name in glob.glob(bin_file_re)
    ]

    if len(bin_file_list) == 0:
        report('No bin file found, program exited!')
        exit()

    bin_2_ctg_dict = {}
    for each_bin in bin_file_list:
        pwd_each_bin = '%s/%s' % (bin_folder, each_bin)
        bin_2_ctg_dict[each_bin] = {
            seq.id for seq in SeqIO.parse(pwd_each_bin, 'fasta')
        }

    ############################################ get group to ctg list dict ############################################

    report('Get bin (cluster) to contig correlations')

    if bin_cluster_file is None:
        group_2_ctg_dict = bin_2_ctg_dict
    else:
        # get group to bin dict
        group_2_bin_dict = {}
        with open(bin_cluster_file) as bin_cluster_handle:
            for group in bin_cluster_handle:
                group = group.strip()
                if not group:
                    # a blank line would otherwise create a cluster named ''
                    continue
                group_split = group.split('\t')
                group_2_bin_dict[group_split[0]] = group_split[1:]

        group_2_ctg_dict = {}
        for bin_group, group_member_list in group_2_bin_dict.items():
            group_ctg_set = set()
            for genome_bin in group_member_list:
                group_ctg_set.update(bin_2_ctg_dict[genome_bin])
            group_2_ctg_dict[bin_group] = group_ctg_set

    ########################################### get_ref_to_read_num_from_sam ###########################################

    report('Get the number of reads mapped to each reference sequence in sam file')

    sam_file_path, sam_file_basename, sam_file_extension = sep_path_basename_ext(
        sam_file)
    ref_to_read_num_file = '%s/%s_ref_to_read_num_%s.txt' % (
        sam_file_path, sam_file_basename,
        datetime.now().strftime('%Y-%m-%d_%Hh-%Mm-%Ss_%f'))

    get_ref_to_read_num_from_sam(sam_file, ref_to_read_num_file)

    ########################################### read in ref_to_read_num_file ###########################################

    mapped_reads_num = 0
    ref_to_read_num_dict = {}
    with open(ref_to_read_num_file) as ref_to_read_num_handle:
        for each_ctg in ref_to_read_num_handle:
            each_ctg_split = each_ctg.strip().split('\t')
            ctg_id = each_ctg_split[0]
            read_num = int(each_ctg_split[1])
            ref_to_read_num_dict[ctg_id] = read_num
            # "*" collects the records without a reference
            if ctg_id != '*':
                mapped_reads_num += read_num

    ###################################### get the number of reads in each group #######################################

    report('Get the number of reads mapped to each bin (cluster)')

    group_to_read_num_dict = {}
    for group, group_ctg_list in group_2_ctg_dict.items():
        group_to_read_num_dict[group] = sum(
            ref_to_read_num_dict.get(ctg, 0) for ctg in group_ctg_list)

    with open(output_file, 'w') as output_file_handle:
        output_file_handle.write('cluster\tread_num\tread_pct\n')
        for group, group_read_num in group_to_read_num_dict.items():
            if mapped_reads_num > 0:
                group_read_pct = float("{0:.2f}".format(
                    group_read_num * 100 / mapped_reads_num))
            else:
                # nothing mapped at all: avoid dividing by zero
                group_read_pct = 0.0
            output_file_handle.write('%s\t%s\t%s\n' %
                                     (group, group_read_num, group_read_pct))

    ################################################## final report ####################################################

    # delete tmp files (os.remove also copes with spaces in the path)
    os.remove(ref_to_read_num_file)
    if cluster_file_is_tmp:
        os.remove(bin_cluster_file)

    # final report
    report('Done!')
Пример #11
0
def COG2020(args):

    file_in = args['i']
    file_extension = args['x']
    sequence_type = args['m']
    depth_file = args['depth']
    pct_by_all = args['pct_by_all']
    DB_dir = args['db_dir']
    num_threads = args['t']
    run_diamond = args['diamond']
    evalue_cutoff = args['evalue']

    pwd_cog_20_fa = '%s/cog-20.fa' % DB_dir
    pwd_cog_20_fa_diamond = '%s/cog-20.fa.dmnd' % DB_dir
    pwd_cog_20_cog_csv = '%s/cog-20.cog.csv' % DB_dir
    pwd_cog_20_def_tab = '%s/cog-20.def.tab' % DB_dir
    pwd_fun_20_tab = '%s/fun-20.tab' % DB_dir

    ############################################ check whether db file exist ###########################################

    # check whether db file exist
    unfound_inputs = []
    for each_input in [pwd_cog_20_fa, pwd_cog_20_def_tab, pwd_fun_20_tab]:
        if (not os.path.isfile(each_input)) and (
                not os.path.isdir(each_input)):
            unfound_inputs.append(each_input)
    if len(unfound_inputs) > 0:
        for each_unfound in unfound_inputs:
            print('%s not found' % each_unfound)
        exit()

    if run_diamond is True:
        if os.path.isfile(pwd_cog_20_fa_diamond) is False:
            print(
                datetime.now().strftime(time_format) +
                'DB file for diamond not found, please refers to the help info for diamond db preparation'
            )
            print(datetime.now().strftime(time_format) + 'Program exited!')
            exit()

    ################################################# read db into dict ################################################

    # get protein_to_cog_dict (cog-20.cog.csv)
    protein_to_cog_dict = {}
    for each_line in open(pwd_cog_20_cog_csv):
        each_line_split = each_line.strip().split(',')
        protein_id = each_line_split[2]
        protein_id_no_dot = '_'.join(protein_id.split('.'))
        cog_id = each_line_split[6]
        if protein_id_no_dot not in protein_to_cog_dict:
            protein_to_cog_dict[protein_id_no_dot] = {cog_id}
        else:
            protein_to_cog_dict[protein_id_no_dot].add(cog_id)

    # get cog_id_to_category_dict and cog_id_to_description_dict (cognames2003-2014.tab)
    cog_id_to_category_dict = {}
    cog_id_to_description_dict = {}
    for cog_id_to_cate_des in open(pwd_cog_20_def_tab,
                                   encoding='windows-1252'):
        if not cog_id_to_cate_des.startswith('#'):
            cog_id_to_cate_des_split = cog_id_to_cate_des.strip().split('\t')
            cog_id = cog_id_to_cate_des_split[0]
            cog_cate = cog_id_to_cate_des_split[1]
            cog_des = cog_id_to_cate_des_split[2]
            cog_id_to_category_dict[cog_id] = cog_cate
            cog_id_to_description_dict[cog_id] = cog_des

    # get cog_category_to_description_dict (fun2003-2014.tab)
    cog_category_list = []
    cog_category_to_description_dict = {}
    for cog_category in open(pwd_fun_20_tab):
        if not cog_category.startswith('#'):
            cog_category_split = cog_category.strip().split('\t')
            cog_category_list.append(cog_category_split[0])
            cog_category_to_description_dict[
                cog_category_split[0]] = cog_category_split[1]

    ################################################## if input is file ################################################

    # if input is file
    if os.path.isfile(file_in) is True:

        # check whether depth file exist
        if depth_file is not None:
            if os.path.isfile(depth_file) is False:
                print(datetime.now().strftime(time_format) +
                      'specified depth file not found, program exited!')
                exit()

        print(datetime.now().strftime(time_format) +
              'Running COG annotation for 1 file with %s cores' %
              (num_threads))

        file_in_path, file_in_basename, file_in_ext = sep_path_basename_ext(
            file_in)

        COG2020_worker([
            file_in, pwd_cog_20_fa, protein_to_cog_dict,
            cog_id_to_category_dict, cog_id_to_description_dict,
            cog_category_list, cog_category_to_description_dict, sequence_type,
            file_in_path, num_threads, run_diamond, evalue_cutoff, depth_file,
            pct_by_all
        ])

    ################################################ if input is folder ################################################

    # if input is folder
    else:

        # check whether input folder exist
        if os.path.isdir(file_in) is False:
            print(datetime.now().strftime(time_format) +
                  'input folder not found, program exited')
            exit()

        else:
            # check whether input genome exist
            input_file_re = '%s/*.%s' % (file_in, file_extension)
            input_file_name_list = [
                os.path.basename(file_name)
                for file_name in glob.glob(input_file_re)
            ]

            if len(input_file_name_list) == 0:
                print(datetime.now().strftime(time_format) +
                      'input file not found, program exited')
                exit()

            # check whether depth file exist
            if depth_file is not None:

                if os.path.isfile(depth_file) is True:
                    print(
                        datetime.now().strftime(time_format) +
                        'please provide the folder containing individual depth files (with extension .depth) for each of your input sequence file.'
                    )
                    print(
                        datetime.now().strftime(time_format) +
                        'a single file (not folder) detected, program exited!')
                    exit()

                if os.path.isdir(depth_file) is False:
                    print(datetime.now().strftime(time_format) +
                          'provided depth folder not found, program exited!')
                    exit()

                if os.path.isdir(depth_file) is True:

                    undetected_depth_file = []
                    for input_seq_file in input_file_name_list:
                        input_seq_file_basename = '.'.join(
                            input_seq_file.split('.')[:-1])
                        input_seq_file_depth = '%s/%s.depth' % (
                            depth_file, input_seq_file_basename)
                        if os.path.isfile(input_seq_file_depth) is False:
                            undetected_depth_file.append(input_seq_file_depth)

                    if len(undetected_depth_file) > 0:
                        print(
                            datetime.now().strftime(time_format) +
                            'the following depth files not found, program exited!'
                        )
                        print(','.join(undetected_depth_file))
                        exit()

            ################################################### define file name ###################################################

            if '/' in file_in:
                file_in_folder_name = file_in.split('/')[-1]
            else:
                file_in_folder_name = file_in

            output_folder = '%s_COG2020_wd' % file_in_folder_name

            # create output folder
            force_create_folder(output_folder)

            ######################################################### main #########################################################

            print(datetime.now().strftime(time_format) +
                  'Running COG annotation for %s files with %s cores' %
                  (len(input_file_name_list), num_threads))

            list_for_multiple_arguments_COG = []
            for input_file in input_file_name_list:

                input_file_basename = '.'.join(input_file.split('.')[:-1])
                pwd_input_file = '%s/%s' % (file_in, input_file)

                # get path to current depth file
                if depth_file is None:
                    input_file_depth = None
                else:
                    input_file_depth = '%s/%s.depth' % (depth_file,
                                                        input_file_basename)

                list_for_multiple_arguments_COG.append([
                    pwd_input_file, pwd_cog_20_fa, protein_to_cog_dict,
                    cog_id_to_category_dict, cog_id_to_description_dict,
                    cog_category_list, cog_category_to_description_dict,
                    sequence_type, output_folder, 1, run_diamond,
                    evalue_cutoff, input_file_depth, pct_by_all
                ])

            # run COG annotaion files with multiprocessing
            pool = mp.Pool(processes=num_threads)
            pool.map(COG2020_worker, list_for_multiple_arguments_COG)
            pool.close()
            pool.join()

            ######################################################### get dataframe #########################################################

            annotation_df_cog_cate_GeneNumber = '%s/%s_COG2020_cate_GeneNumber.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_cate_GeneNumber_pct = '%s/%s_COG2020_cate_GeneNumber_pct.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_cate_GeneNumber_pct_by_all = '%s/%s_COG2020_cate_GeneNumber_pct_by_all.txt' % (
                output_folder, file_in_folder_name)

            annotation_df_cog_cate_TotalDepth = '%s/%s_COG2020_cate_TotalDepth.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_cate_TotalDepth_pct = '%s/%s_COG2020_cate_TotalDepth_pct.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_cate_TotalDepth_pct_by_all = '%s/%s_COG2020_cate_TotalDepth_pct_by_all.txt' % (
                output_folder, file_in_folder_name)

            annotation_df_cog_id_GeneNumber = '%s/%s_COG2020_id_GeneNumber.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_id_GeneNumber_pct = '%s/%s_COG2020_id_GeneNumber_pct.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_id_GeneNumber_pct_by_all = '%s/%s_COG2020_id_GeneNumber_pct_by_all.txt' % (
                output_folder, file_in_folder_name)

            annotation_df_cog_id_TotalDepth = '%s/%s_COG2020_id_TotalDepth.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_id_TotalDepth_pct = '%s/%s_COG2020_id_TotalDepth_pct.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_id_TotalDepth_pct_by_all = '%s/%s_COG2020_id_TotalDepth_pct_by_all.txt' % (
                output_folder, file_in_folder_name)

            print(datetime.now().strftime(time_format) +
                  'Data matrix exported to:')

            # get df
            get_COG_annot_df(output_folder,
                             'cog_cate',
                             annotation_df_cog_cate_GeneNumber,
                             annotation_df_cog_cate_GeneNumber_pct,
                             annotation_df_cog_cate_GeneNumber_pct_by_all,
                             with_depth=False,
                             pct_by_all=False)
            get_COG_annot_df(output_folder,
                             'cog_id',
                             annotation_df_cog_id_GeneNumber,
                             annotation_df_cog_id_GeneNumber_pct,
                             annotation_df_cog_id_GeneNumber_pct_by_all,
                             with_depth=False,
                             pct_by_all=False)
            if pct_by_all is True:
                get_COG_annot_df(output_folder,
                                 'cog_cate',
                                 annotation_df_cog_cate_GeneNumber,
                                 annotation_df_cog_cate_GeneNumber_pct,
                                 annotation_df_cog_cate_GeneNumber_pct_by_all,
                                 with_depth=False,
                                 pct_by_all=True)
                get_COG_annot_df(output_folder,
                                 'cog_id',
                                 annotation_df_cog_id_GeneNumber,
                                 annotation_df_cog_id_GeneNumber_pct,
                                 annotation_df_cog_id_GeneNumber_pct_by_all,
                                 with_depth=False,
                                 pct_by_all=True)

            # report
            if pct_by_all is False:
                print(datetime.now().strftime(time_format) + '%s and %s' %
                      (annotation_df_cog_id_GeneNumber.split('/')[-1],
                       annotation_df_cog_id_GeneNumber_pct.split('/')[-1]))
                print(datetime.now().strftime(time_format) + '%s and %s' %
                      (annotation_df_cog_cate_GeneNumber.split('/')[-1],
                       annotation_df_cog_cate_GeneNumber_pct.split('/')[-1]))
            else:
                print(
                    datetime.now().strftime(time_format) + '%s, %s and %s' %
                    (annotation_df_cog_id_GeneNumber.split('/')[-1],
                     annotation_df_cog_id_GeneNumber_pct.split('/')[-1],
                     annotation_df_cog_id_GeneNumber_pct_by_all.split('/')[-1])
                )
                print(datetime.now().strftime(time_format) + '%s, %s and %s' %
                      (annotation_df_cog_cate_GeneNumber.split('/')[-1],
                       annotation_df_cog_cate_GeneNumber_pct.split('/')[-1],
                       annotation_df_cog_cate_GeneNumber_pct_by_all.split('/')
                       [-1]))

            if depth_file is not None:
                get_COG_annot_df(output_folder,
                                 'cog_cate',
                                 annotation_df_cog_cate_TotalDepth,
                                 annotation_df_cog_cate_TotalDepth_pct,
                                 annotation_df_cog_cate_TotalDepth_pct_by_all,
                                 with_depth=True,
                                 pct_by_all=False)
                get_COG_annot_df(output_folder,
                                 'cog_id',
                                 annotation_df_cog_id_TotalDepth,
                                 annotation_df_cog_id_TotalDepth_pct,
                                 annotation_df_cog_id_TotalDepth_pct_by_all,
                                 with_depth=True,
                                 pct_by_all=False)
                if pct_by_all is True:
                    get_COG_annot_df(
                        output_folder,
                        'cog_cate',
                        annotation_df_cog_cate_TotalDepth,
                        annotation_df_cog_cate_TotalDepth_pct,
                        annotation_df_cog_cate_TotalDepth_pct_by_all,
                        with_depth=True,
                        pct_by_all=True)
                    get_COG_annot_df(
                        output_folder,
                        'cog_id',
                        annotation_df_cog_id_TotalDepth,
                        annotation_df_cog_id_TotalDepth_pct,
                        annotation_df_cog_id_TotalDepth_pct_by_all,
                        with_depth=True,
                        pct_by_all=True)

                # report
                if pct_by_all is False:
                    print(datetime.now().strftime(time_format) + '%s and %s' %
                          (annotation_df_cog_id_TotalDepth.split('/')[-1],
                           annotation_df_cog_id_TotalDepth_pct.split('/')[-1]))
                    print(
                        datetime.now().strftime(time_format) + '%s and %s' %
                        (annotation_df_cog_cate_TotalDepth.split('/')[-1],
                         annotation_df_cog_cate_TotalDepth_pct.split('/')[-1]))
                else:
                    print(datetime.now().strftime(time_format) +
                          '%s, %s and %s' %
                          (annotation_df_cog_id_TotalDepth.split('/')[-1],
                           annotation_df_cog_id_TotalDepth_pct.split('/')[-1],
                           annotation_df_cog_id_TotalDepth_pct_by_all.split(
                               '/')[-1]))
                    print(
                        datetime.now().strftime(time_format) +
                        '%s, %s and %s' %
                        (annotation_df_cog_cate_TotalDepth.split('/')[-1],
                         annotation_df_cog_cate_TotalDepth_pct.split('/')[-1],
                         annotation_df_cog_cate_TotalDepth_pct_by_all.split(
                             '/')[-1]))

    ################################################## Final report ####################################################

    print(datetime.now().strftime(time_format) + 'Done!')
Пример #12
0
def SILVA_for_BLCA(args):
    """Reformat a SILVA SSU FASTA file into the file pair used by BLCA.

    For every record whose taxonomy does not start with 'Eukaryota', the
    taxonomy string (everything after the first space of the FASTA
    description) is split on ';', normalised to exactly seven levels and
    written lowest rank first as 'rank:name;' pairs.

    Two files are written next to the input file, named from the path and
    basename returned by sep_path_basename_ext:
        <basename>_BLCAparsed.fasta     sequences, header reduced to the id
        <basename>_BLCAparsed.taxonomy  record id, a tab, the rank:name pairs

    Args:
        args: dict holding the path to the SILVA FASTA file under the key
            'SILVA_ssu'.
    """

    SILVA_db_file = args['SILVA_ssu']

    SILVA_db_file_path, SILVA_db_file_basename, _ = sep_path_basename_ext(
        SILVA_db_file)
    file_out_sequence = '%s/%s_BLCAparsed.fasta' % (SILVA_db_file_path,
                                                    SILVA_db_file_basename)
    file_out_taxonomy = '%s/%s_BLCAparsed.taxonomy' % (SILVA_db_file_path,
                                                       SILVA_db_file_basename)

    # ranks are listed lowest first, i.e. in the reverse of the lineage order
    rank_list = [
        'species', 'genus', 'family', 'order', 'class', 'phylum',
        'superkingdom'
    ]
    rank_num = len(rank_list)

    # the context manager closes both outputs even if parsing fails half way
    with open(file_out_sequence, 'w') as file_out_sequence_handle, \
            open(file_out_taxonomy, 'w') as file_out_taxonomy_handle:

        for SILVA_seq in SeqIO.parse(SILVA_db_file, 'fasta'):

            SILVA_seq_taxon = ' '.join(SILVA_seq.description.split(' ')[1:])

            if SILVA_seq_taxon.startswith('Eukaryota'):
                continue

            SILVA_seq_taxon_split = SILVA_seq_taxon.split(';')
            level_num = len(SILVA_seq_taxon_split)

            if level_num < rank_num:
                # pad short lineages (including those with a single level,
                # which previously raised IndexError) up to seven levels
                SILVA_seq_taxon_split.extend(
                    ['unidentified'] * (rank_num - level_num))
            elif level_num > rank_num:
                # fold all surplus levels into the last (species) field
                SILVA_seq_taxon_split = SILVA_seq_taxon_split[:rank_num - 1] + [
                    ' '.join(SILVA_seq_taxon_split[rank_num - 1:])
                ]

            SILVA_seq_taxon_str = ''.join(
                '%s:%s;' % (taxon_rank, taxon_name)
                for taxon_rank, taxon_name in zip(
                    rank_list, reversed(SILVA_seq_taxon_split)))

            # write out to taxonomy file
            file_out_taxonomy_handle.write('%s\t%s\n' %
                                           (SILVA_seq.id, SILVA_seq_taxon_str))

            # write out to sequence file
            file_out_sequence_handle.write('>%s\n' % SILVA_seq.id)
            file_out_sequence_handle.write('%s\n' % SILVA_seq.seq)
Пример #13
0
def get_gene_depth(args):
    """Write a gene-to-depth table by giving each gene its contig's depth.

    Contig depths are read from a tab-separated file. Gene ids are taken
    either from the 'locus_tag' qualifier of GenBank features or from the
    first ';'-separated attribute of GFF lines (with any 'ID=' prefix
    removed). When both annotation files are given, the GFF file is used.

    The result, one 'gene_id<TAB>depth' line per gene, goes to
    '<annotation path>/<annotation basename>.depth', or to
    '<...>.depth.txt' if the former already exists.

    Args:
        args: dict with the keys
            'gbk'          path to a GenBank file, or None
            'gff'          path to a GFF file, or None
            'ctg_depth'    path to the tab-separated contig depth file
            'id_column'    1-based column holding the contig id
            'depth_column' 1-based column holding the depth
            'skip_header'  True to ignore the first line of the depth file

    Raises:
        KeyError: if an annotated contig is missing from the depth file, or a
            GenBank feature (other than 'source'/'assembly_gap') has no
            'locus_tag' qualifier.
    """

    gbk_file = args['gbk']
    gff_file = args['gff']
    ctg_depth_file = args['ctg_depth']
    id_column = args['id_column']
    depth_column = args['depth_column']
    skip_depth_file_header = args['skip_header']

    ################################################# check input file #################################################

    if (gbk_file is None) and (gff_file is None):
        print(
            datetime.now().strftime(time_format) +
            'Please provide either a gbk file or a gff file, program exited!')
        exit()

    if gff_file is not None:
        # gff takes precedence whenever it is available
        if gbk_file is not None:
            print(
                datetime.now().strftime(time_format) +
                'Both gbk and gff file were provided, will parse annotation results from gff file'
            )
        annotation_file = gff_file
    else:
        annotation_file = gbk_file

    # define output gene depth file name
    annotation_file_path, annotation_file_basename, _ = sep_path_basename_ext(
        annotation_file)
    pwd_gene_depth_file = '%s/%s.depth' % (annotation_file_path,
                                           annotation_file_basename)
    if os.path.isfile(pwd_gene_depth_file) is True:
        # do not overwrite an existing .depth file, use a different name
        pwd_gene_depth_file = '%s/%s.depth.txt' % (annotation_file_path,
                                                   annotation_file_basename)

    ################################################ read in ctg depth #################################################

    ctg_depth_dict = {}
    with open(ctg_depth_file) as ctg_depth_file_handle:
        for line_index, ctg in enumerate(ctg_depth_file_handle):

            if (skip_depth_file_header is True) and (line_index == 0):
                continue

            ctg = ctg.strip()
            if ctg == '':
                # blank (e.g. trailing) lines carry no data and used to crash
                continue

            ctg_split = ctg.split('\t')
            ctg_depth_dict[ctg_split[id_column - 1]] = float(
                ctg_split[depth_column - 1])

    ########################################### get gene depth with gbk file ###########################################

    if annotation_file == gbk_file:
        with open(pwd_gene_depth_file, 'w') as gene_depth_file_handle:
            for seq_record in SeqIO.parse(gbk_file, 'genbank'):
                seq_depth = ctg_depth_dict[seq_record.id]
                for feature in seq_record.features:
                    if feature.type not in ('source', 'assembly_gap'):
                        gene_id = feature.qualifiers['locus_tag'][0]
                        gene_depth_file_handle.write(
                            '%s\t%s\n' % (gene_id, seq_depth))

    ########################################### get gene depth with gff file ###########################################

    if annotation_file == gff_file:
        with open(pwd_gene_depth_file, 'w') as gene_depth_file_handle, \
                open(gff_file) as gff_file_handle:
            for each_line in gff_file_handle:
                if each_line.startswith('#'):
                    continue
                each_line_split = each_line.strip().split('\t')
                # lines without any tab are not feature lines
                if len(each_line_split) > 1:
                    seq_depth = ctg_depth_dict[each_line_split[0]]
                    gene_id = each_line_split[8].split(';')[0].split('ID=')[-1]
                    gene_depth_file_handle.write(
                        '%s\t%s\n' % (gene_id, seq_depth))

    ###################################################### report ######################################################

    print(datetime.now().strftime(time_format) +
          'Gene depth exported to %s' % pwd_gene_depth_file)
    print(datetime.now().strftime(time_format) + 'Done!')
Пример #14
0
def NetEnzymes(args, config_dict):
    """Build a reaction network for a list of enzymes and write it as GraphML.

    EC numbers are read from the enzymes file, optionally restricted to those
    belonging to one KO category and/or to those without a '-' in the EC
    number. Every line of the reaction database whose first tab-separated
    field is one of the retained EC numbers is handed to
    parse_biological_raction, which populates the directed graph. The graph
    is written to '<enzymes path>/<enzymes basename>[_ko<id>][_NoHyphen].graphml'
    and, if requested, drawn to a PNG with the same stem.

    Args:
        args: dict with the keys
            'enzymes'   file with one EC number per line
            'compounds' file with one compound per line, or None (the string
                        'all' is then passed on instead of a set)
            'ko'        KO category id of interest, or None
            'NoHyphen'  True to drop EC numbers containing '-'
            'to_skip'   file with one entry per line forwarded as the skip
                        list, or None
            'plot'      True to also draw the network
            'lfs'       label font size for the plot
            'ns'        node size for the plot
        config_dict: dict providing 'ko00001_keg' (KEGG db file) and
            'MetaCyc_rxns_with_ec' (reaction db file).
    """

    enzymes_list_file = args['enzymes']
    compound_list_file = args['compounds']
    interested_ko_id = args['ko']
    ignore_ec_with_hyphen = args['NoHyphen']
    to_skip_file = args['to_skip']
    plot_network = args['plot']
    label_font_size = args['lfs']
    node_size = args['ns']
    ko00001_keg = config_dict['ko00001_keg']
    db_file_with_ec = config_dict['MetaCyc_rxns_with_ec']

    ########################################################################################################################

    node_color_dict = {
        'enzyme': 'lightgreen',
        'substrate': 'grey',
        'product': 'grey'
    }

    skip_list = set()
    if to_skip_file is not None:
        with open(to_skip_file) as to_skip_file_handle:
            skip_list = {each.strip() for each in to_skip_file_handle}

    compounds_to_include_list = 'all'
    if compound_list_file is not None:
        with open(compound_list_file) as compound_list_file_handle:
            compounds_to_include_list = {
                each.strip() for each in compound_list_file_handle
            }

    # define output file name: optional suffixes are appended in a fixed order
    ec_file_path, ec_file_basename, _ = sep_path_basename_ext(
        enzymes_list_file)

    output_suffix = ''
    if interested_ko_id is not None:
        output_suffix += '_ko%s' % interested_ko_id
    if ignore_ec_with_hyphen is True:
        output_suffix += '_NoHyphen'

    output_graphml = '%s/%s%s.graphml' % (ec_file_path, ec_file_basename,
                                          output_suffix)
    output_plot = '%s/%s%s.png' % (ec_file_path, ec_file_basename,
                                   output_suffix)

    ########################################################################################################################

    interested_ec_list = []
    if interested_ko_id is not None:

        print(datetime.now().strftime(time_format) +
              'get ECs from interested KO')

        # read in KEGG db file
        KO_description_A_dict, KO_description_B_dict, KO_description_C_dict, KO_description_D_dict, D2ABCD_dict, ko2level_dict = get_ko2description_dict(
            ko00001_keg)

        # get ec list from interested KO category
        interested_ec_list = get_ec_of_interested_ko(
            D2ABCD_dict, KO_description_D_dict,
            ko2level_dict[interested_ko_id], interested_ko_id)

    # get identified_ec_list
    print(datetime.now().strftime(time_format) + 'read in provided ECs')

    identified_ec_list = set()
    with open(enzymes_list_file) as enzymes_list_file_handle:
        for ec in enzymes_list_file_handle:
            ec = ec.strip()

            # with a KO of interest, only ECs from that category are kept
            if (interested_ko_id is not None) and (
                    ec not in interested_ec_list):
                continue

            if (ignore_ec_with_hyphen is False) or ('-' not in ec):
                identified_ec_list.add(ec)

    # initialize a graph
    G = nx.DiGraph()

    print(datetime.now().strftime(time_format) +
          'add nodes and edges to network')

    # add node and edge
    with open(db_file_with_ec) as db_file_with_ec_handle:
        for reaction in db_file_with_ec_handle:
            ec_id = reaction.strip().split('\t')[0]

            if ec_id in identified_ec_list:
                parse_biological_raction(G, reaction, skip_list,
                                         compounds_to_include_list,
                                         node_color_dict)

    print(datetime.now().strftime(time_format) +
          'write out network to graphml file')

    # write out graphml
    nx.write_graphml(G, output_graphml)

    if plot_network is True:

        print(datetime.now().strftime(time_format) + 'plot network')

        # alternatives: planar_layout, fruchterman_reingold_layout
        graph_layout = nx.layout.kamada_kawai_layout(G)

        print(datetime.now().strftime(time_format) + 'plot nodes')

        # nodes are drawn one at a time so that each can use its own
        # 'color_map' and 'shape' attributes
        # NOTE(review): these attributes are presumably set by
        # parse_biological_raction - confirm
        for node, node_attributes in G.nodes(data=True):
            nx.draw_networkx_nodes(G,
                                   graph_layout,
                                   nodelist=[node],
                                   node_size=node_size,
                                   node_color=node_attributes['color_map'],
                                   node_shape=node_attributes['shape'])

        # label all nodes together; draw_networkx_labels has no 'nodelist'
        # parameter (passing one raises TypeError in recent networkx), and
        # labels every node of G by default
        nx.draw_networkx_labels(G,
                                graph_layout,
                                font_size=label_font_size,
                                font_color='black')

        print(datetime.now().strftime(time_format) + 'plot edges')

        # plot edges
        nx.draw_networkx_edges(G,
                               graph_layout,
                               width=0.5,
                               arrows=True,
                               arrowsize=6)

        # save plot
        plt.savefig(output_plot, dpi=300)
        plt.close()

    ########################################################################################################################

    print(datetime.now().strftime(time_format) + 'Done!')
Пример #15
0
def dbCAN(args):
    """Run dbCAN annotation for one sequence file or a folder of files.

    If args['i'] is a file, dbCAN_worker is called once and its results go
    to the folder containing that file. Otherwise args['i'] must be a folder:
    every '*.<x>' file in it is annotated in a multiprocessing pool, results
    go to '<folder name>_dbCAN_wd' (relative to the working directory), and
    summary tables are produced with get_dbCAN_annot_df.

    Args:
        args: dict with the keys
            'i'      input sequence file or folder
            'x'      file extension of the inputs (folder mode only)
            'm'      sequence type, forwarded unchanged to dbCAN_worker
            'depth'  None, or a depth file (file mode), or a folder holding
                     one '<input basename>.depth' per input (folder mode)
            'db_dir' folder with hmmscan-parser.sh, dbCAN-fam-HMMs.txt and
                     CAZyDB.fam-activities.txt
            't'      number of worker processes

    The program exits (via exit()) after printing a message when a database
    file, the input, or a required depth file cannot be found.
    """

    file_in = args['i']
    file_extension = args['x']
    sequence_type = args['m']
    depth_file = args['depth']
    DB_dir = args['db_dir']
    num_threads = args['t']

    pwd_hmmscan_parser = '%s/hmmscan-parser.sh' % DB_dir
    pwd_dbCAN_fam_HMMs = '%s/dbCAN-fam-HMMs.txt' % DB_dir
    CAZyDB_fam_activities = '%s/CAZyDB.fam-activities.txt' % DB_dir

    # accept the dated file name and rename it to the generic one; os.rename
    # (unlike a shell 'mv' command string) is safe for paths with spaces
    CAZyDB_fam_activities_07312019 = '%s/CAZyDB.07312019.fam-activities.txt' % DB_dir
    if os.path.isfile(CAZyDB_fam_activities_07312019) and (
            not os.path.isfile(CAZyDB_fam_activities)):
        os.rename(CAZyDB_fam_activities_07312019, CAZyDB_fam_activities)

    ############################################ check whether db file exist ###########################################

    # check whether db file exist (the activities table is read right below,
    # so it is checked here too instead of failing with a traceback)
    unfound_inputs = []
    for each_input in [
            pwd_hmmscan_parser, pwd_dbCAN_fam_HMMs, CAZyDB_fam_activities
    ]:
        if (not os.path.isfile(each_input)) and (
                not os.path.isdir(each_input)):
            unfound_inputs.append(each_input)
    if len(unfound_inputs) > 0:
        for each_unfound in unfound_inputs:
            print('%s not found' % each_unfound)
        exit()

    # store CAZyDB.fam-activities.txt in dict; family id and activities are
    # separated by a tab followed by two spaces, other lines are ignored
    fam_to_activities_dict = {}
    with open(CAZyDB_fam_activities) as CAZyDB_fam_activities_handle:
        for each_fam in CAZyDB_fam_activities_handle:
            each_fam_split = each_fam.strip().split('\t  ')
            if len(each_fam_split) == 2:
                fam_to_activities_dict[each_fam_split[0]] = each_fam_split[1]

    ################################################## if input is file ################################################

    # if input is file
    if os.path.isfile(file_in) is True:

        # check whether depth file exist
        if (depth_file is not None) and (not os.path.isfile(depth_file)):
            print(datetime.now().strftime(time_format) +
                  'specified depth file not found, program exited!')
            exit()

        print(datetime.now().strftime(time_format) +
              'Running dbCAN for 1 file with %s cores' % (num_threads))

        file_in_path, file_in_basename, file_in_ext = sep_path_basename_ext(
            file_in)
        dbCAN_worker([
            file_in, pwd_hmmscan_parser, pwd_dbCAN_fam_HMMs, sequence_type,
            file_in_path, fam_to_activities_dict, depth_file
        ])

    ################################################ if input is folder ################################################

    # if input is folder
    else:

        # check whether input folder exist
        if os.path.isdir(file_in) is False:
            print(datetime.now().strftime(time_format) +
                  'input folder not found, program exited!')
            exit()

        # check whether input genome exist
        input_file_re = '%s/*.%s' % (file_in, file_extension)
        input_file_name_list = [
            os.path.basename(file_name)
            for file_name in glob.glob(input_file_re)
        ]

        if len(input_file_name_list) == 0:
            print(datetime.now().strftime(time_format) +
                  'input file not found, program exited!')
            exit()

        # check whether depth file exist
        if depth_file is not None:

            if os.path.isfile(depth_file) is True:
                print(
                    datetime.now().strftime(time_format) +
                    'please provide the folder containing individual depth files (with extension .depth) for each of your input sequence file.'
                )
                print(datetime.now().strftime(time_format) +
                      'a single file (not folder) detected, program exited!')
                exit()

            if os.path.isdir(depth_file) is False:
                print(datetime.now().strftime(time_format) +
                      'provided depth folder not found, program exited!')
                exit()

            # every input file needs its own <basename>.depth file
            undetected_depth_file = []
            for input_seq_file in input_file_name_list:
                input_seq_file_basename = '.'.join(
                    input_seq_file.split('.')[:-1])
                input_seq_file_depth = '%s/%s.depth' % (
                    depth_file, input_seq_file_basename)
                if os.path.isfile(input_seq_file_depth) is False:
                    undetected_depth_file.append(input_seq_file_depth)

            if len(undetected_depth_file) > 0:
                print(datetime.now().strftime(time_format) +
                      'the following depth files not found, program exited!')
                print(','.join(undetected_depth_file))
                exit()

        ################################################### define file name ###################################################

        # trailing slashes are dropped first, otherwise 'folder/' would give
        # an empty folder name and the output would land in '_dbCAN_wd'
        file_in_folder_name = file_in.rstrip('/').split('/')[-1]

        output_folder = '%s_dbCAN_wd' % file_in_folder_name
        annotation_df_GeneNumber = '%s/%s_GeneNumber.txt' % (
            output_folder, file_in_folder_name)
        annotation_df_GeneNumber_pct = '%s/%s_GeneNumber_pct.txt' % (
            output_folder, file_in_folder_name)
        annotation_df_TotalDepth = '%s/%s_TotalDepth.txt' % (
            output_folder, file_in_folder_name)
        annotation_df_TotalDepth_pct = '%s/%s_TotalDepth_pct.txt' % (
            output_folder, file_in_folder_name)

        # create output folder
        force_create_folder(output_folder)

        ######################################################### main #########################################################

        print(datetime.now().strftime(time_format) +
              'Running dbCAN for %s input files with %s cores' %
              (len(input_file_name_list), num_threads))

        list_for_multiple_arguments_dbCAN = []
        for input_file in input_file_name_list:

            input_file_basename = '.'.join(input_file.split('.')[:-1])
            pwd_input_file = '%s/%s' % (file_in, input_file)

            # get path to current depth file
            if depth_file is None:
                input_file_depth = None
            else:
                input_file_depth = '%s/%s.depth' % (depth_file,
                                                    input_file_basename)

            list_for_multiple_arguments_dbCAN.append([
                pwd_input_file, pwd_hmmscan_parser, pwd_dbCAN_fam_HMMs,
                sequence_type, output_folder, fam_to_activities_dict,
                input_file_depth
            ])

        # run dbCAN annotation of all input files with multiprocessing
        pool = mp.Pool(processes=num_threads)
        pool.map(dbCAN_worker, list_for_multiple_arguments_dbCAN)
        pool.close()
        pool.join()

        ######################################################### get dataframe #########################################################

        get_dbCAN_annot_df(output_folder,
                           annotation_df_GeneNumber,
                           annotation_df_GeneNumber_pct,
                           with_depth=False)
        if depth_file is not None:
            get_dbCAN_annot_df(output_folder,
                               annotation_df_TotalDepth,
                               annotation_df_TotalDepth_pct,
                               with_depth=True)

        # report (file names only, without the output folder)
        print(datetime.now().strftime(time_format) +
              'Data matrix exported to:')
        print(datetime.now().strftime(time_format) +
              annotation_df_GeneNumber.split('/')[-1])
        print(datetime.now().strftime(time_format) +
              annotation_df_GeneNumber_pct.split('/')[-1])
        if depth_file is not None:
            print(datetime.now().strftime(time_format) +
                  annotation_df_TotalDepth.split('/')[-1])
            print(datetime.now().strftime(time_format) +
                  annotation_df_TotalDepth_pct.split('/')[-1])

    print(datetime.now().strftime(time_format) + 'Done!')
Пример #16
0
def COG2014(args):
    """Run COG (2003-2014 release) annotation for one sequence file or a folder of files.

    ``args`` is a dict providing the following keys:
        i           path to a sequence file, or to a folder of sequence files
        x           extension of the sequence files (only used in folder mode)
        m           sequence type, forwarded unchanged to ``COG2014_worker``
        depth       None, a depth file (file mode) or a folder holding one
                    ``<basename>.depth`` file per input file (folder mode)
        pct_by_all  True to also produce/report the ``*_pct_by_all`` matrices
        db_dir      folder holding the COG database files
        t           number of cores
        diamond     True to search with diamond, False to search with blastp
        evalue      e-value cutoff, forwarded unchanged to ``COG2014_worker``

    Progress is printed to stdout. The function calls ``exit()`` when a
    required input cannot be found and returns None otherwise.
    """

    file_in = args['i']
    file_extension = args['x']
    sequence_type = args['m']
    depth_file = args['depth']
    pct_by_all = args['pct_by_all']
    DB_dir = args['db_dir']
    num_threads = args['t']
    run_diamond = args['diamond']
    evalue_cutoff = args['evalue']

    pwd_prot2003_2014 = '%s/prot2003-2014.fa' % DB_dir
    pwd_prot2003_2014_diamond = '%s/prot2003-2014.fa.dmnd' % DB_dir
    pwd_prot2003_2014_tab = '%s/prot2003-2014.tab' % DB_dir
    pwd_cog2003_2014 = '%s/cog2003-2014.csv' % DB_dir
    pwd_cognames2003_2014 = '%s/cognames2003-2014.tab' % DB_dir
    pwd_fun2003_2014 = '%s/fun2003-2014.tab' % DB_dir

    ############################################ check whether db file exist ###########################################

    # Every db file that is read further down is checked here, so a missing
    # file is reported before any index is built or any search is started.
    required_db_files = [
        pwd_prot2003_2014, pwd_prot2003_2014_tab, pwd_cog2003_2014,
        pwd_cognames2003_2014, pwd_fun2003_2014
    ]
    unfound_inputs = [
        each_input for each_input in required_db_files
        if not (os.path.isfile(each_input) or os.path.isdir(each_input))
    ]
    if unfound_inputs:
        for each_unfound in unfound_inputs:
            print('%s not found' % each_unfound)
        exit()

    # check whether the blast db index exists, build it if any part is missing
    if run_diamond is False:
        unfound_db_index_file = []
        for db_index in ['phr', 'pin', 'pnd', 'pni', 'pog', 'psd', 'psi', 'psq']:
            pwd_db_index = '%s/prot2003-2014.fa.%s' % (DB_dir, db_index)
            if not os.path.isfile(pwd_db_index):
                unfound_db_index_file.append(db_index)
        if unfound_db_index_file:
            print(datetime.now().strftime(time_format) +
                  'DB index files not found, runing makeblastdb first')
            makeblastdb_cmd = 'makeblastdb -in %s -dbtype prot -parse_seqids -logfile %s.log' % (
                pwd_prot2003_2014, pwd_prot2003_2014)
            os.system(makeblastdb_cmd)
            print(datetime.now().strftime(time_format) +
                  'makeblastdb finished')

    # build the diamond db if it does not exist yet
    if run_diamond is True:
        if os.path.isfile(pwd_prot2003_2014_diamond) is False:
            print(datetime.now().strftime(time_format) +
                  'DB file not found, making diamond db')
            diamond_makedb_cmd = 'diamond makedb --in %s --db %s --quiet' % (
                pwd_prot2003_2014, pwd_prot2003_2014_diamond)
            os.system(diamond_makedb_cmd)

    ################################################# read db into dict ################################################

    # prot2003-2014.tab: second column -> first column
    refseq_acc_to_protein_id_dict = {}
    with open(pwd_prot2003_2014_tab) as prot_tab_handle:
        for refseq_acc in prot_tab_handle:
            refseq_acc_split = refseq_acc.strip().split('\t')
            refseq_acc_to_protein_id_dict[
                refseq_acc_split[1]] = refseq_acc_split[0]

    # cog2003-2014.csv: third column (protein id) -> seventh column (COG id)
    protein_id_to_cog_id_dict = {}
    with open(pwd_cog2003_2014) as cog_csv_handle:
        for protein_to_cog in cog_csv_handle:
            protein_to_cog_split = protein_to_cog.strip().split(',')
            protein_id_to_cog_id_dict[
                protein_to_cog_split[2]] = protein_to_cog_split[6]

    # cognames2003-2014.tab: COG id -> category and COG id -> description
    cog_id_to_category_dict = {}
    cog_id_to_description_dict = {}
    with open(pwd_cognames2003_2014,
              encoding='windows-1252') as cognames_handle:
        for cog_id_to_cate_des in cognames_handle:
            if not cog_id_to_cate_des.startswith('#'):
                cog_id_to_cate_des_split = cog_id_to_cate_des.strip().split('\t')
                cog_id = cog_id_to_cate_des_split[0]
                cog_id_to_category_dict[cog_id] = cog_id_to_cate_des_split[1]
                cog_id_to_description_dict[cog_id] = cog_id_to_cate_des_split[2]

    # fun2003-2014.tab: ordered category list and category -> description
    cog_category_list = []
    cog_category_to_description_dict = {}
    with open(pwd_fun2003_2014) as fun_handle:
        for cog_category in fun_handle:
            if not cog_category.startswith('#'):
                cog_category_split = cog_category.strip().split('\t')
                cog_category_list.append(cog_category_split[0])
                cog_category_to_description_dict[
                    cog_category_split[0]] = cog_category_split[1]

    ################################################## if input is file ################################################

    if os.path.isfile(file_in) is True:

        # in file mode the depth argument has to be a single file
        if depth_file is not None:
            if os.path.isfile(depth_file) is False:
                print(datetime.now().strftime(time_format) +
                      'specified depth file not found, program exited!')
                exit()

        print(datetime.now().strftime(time_format) +
              'Running COG annotation for 1 file with %s cores' %
              (num_threads))

        file_in_path, file_in_basename, file_in_ext = sep_path_basename_ext(
            file_in)

        # results are written next to the input file; all cores go to the search
        COG2014_worker([
            file_in, pwd_prot2003_2014, refseq_acc_to_protein_id_dict,
            protein_id_to_cog_id_dict, cog_id_to_category_dict,
            cog_id_to_description_dict, cog_category_list,
            cog_category_to_description_dict, sequence_type, file_in_path,
            num_threads, run_diamond, evalue_cutoff, depth_file, pct_by_all
        ])

    ################################################ if input is folder ################################################

    else:

        # check whether input folder exist
        if os.path.isdir(file_in) is False:
            print(datetime.now().strftime(time_format) +
                  'input folder not found, program exited')
            exit()

        # check whether input genome exist
        input_file_re = '%s/*.%s' % (file_in, file_extension)
        input_file_name_list = [
            os.path.basename(file_name)
            for file_name in glob.glob(input_file_re)
        ]
        if len(input_file_name_list) == 0:
            print(datetime.now().strftime(time_format) +
                  'input file not found, program exited')
            exit()

        # in folder mode the depth argument has to be a folder with one
        # <basename>.depth file for every input sequence file
        if depth_file is not None:

            if os.path.isfile(depth_file) is True:
                print(
                    datetime.now().strftime(time_format) +
                    'please provide the folder containing individual depth files (with extension .depth) for each of your input sequence file.'
                )
                print(
                    datetime.now().strftime(time_format) +
                    'a single file (not folder) detected, program exited!')
                exit()

            if os.path.isdir(depth_file) is False:
                print(datetime.now().strftime(time_format) +
                      'provided depth folder not found, program exited!')
                exit()

            undetected_depth_file = []
            for input_seq_file in input_file_name_list:
                input_seq_file_basename = '.'.join(
                    input_seq_file.split('.')[:-1])
                input_seq_file_depth = '%s/%s.depth' % (
                    depth_file, input_seq_file_basename)
                if os.path.isfile(input_seq_file_depth) is False:
                    undetected_depth_file.append(input_seq_file_depth)

            if len(undetected_depth_file) > 0:
                print(
                    datetime.now().strftime(time_format) +
                    'the following depth files not found, program exited!'
                )
                print(','.join(undetected_depth_file))
                exit()

        ################################################### define file name ###################################################

        # Trailing slashes are dropped first: without that, 'folder/' would
        # give an empty folder name and the output folder '_COG2014_wd'.
        file_in_folder_name = file_in.rstrip('/').split('/')[-1]

        output_folder = '%s_COG2014_wd' % file_in_folder_name

        # create output folder
        force_create_folder(output_folder)

        ######################################################### main #########################################################

        print(datetime.now().strftime(time_format) +
              'Running COG annotation for %s files with %s cores' %
              (len(input_file_name_list), num_threads))

        list_for_multiple_arguments_COG = []
        for input_file in input_file_name_list:

            input_file_basename = '.'.join(input_file.split('.')[:-1])
            pwd_input_file = '%s/%s' % (file_in, input_file)

            # get path to current depth file
            if depth_file is None:
                input_file_depth = None
            else:
                input_file_depth = '%s/%s.depth' % (depth_file,
                                                    input_file_basename)

            # each worker gets a single core, parallelism comes from the pool
            list_for_multiple_arguments_COG.append([
                pwd_input_file, pwd_prot2003_2014,
                refseq_acc_to_protein_id_dict, protein_id_to_cog_id_dict,
                cog_id_to_category_dict, cog_id_to_description_dict,
                cog_category_list, cog_category_to_description_dict,
                sequence_type, output_folder, 1, run_diamond,
                evalue_cutoff, input_file_depth, pct_by_all
            ])

        # run COG annotation for all files with multiprocessing
        pool = mp.Pool(processes=num_threads)
        pool.map(COG2014_worker, list_for_multiple_arguments_COG)
        pool.close()
        pool.join()

        ######################################################### get dataframe #########################################################

        print(datetime.now().strftime(time_format) +
              'Data matrix exported to:')

        _COG2014_export_matrices(output_folder, file_in_folder_name,
                                 'GeneNumber', False, pct_by_all)
        if depth_file is not None:
            _COG2014_export_matrices(output_folder, file_in_folder_name,
                                     'TotalDepth', True, pct_by_all)

    ################################################## Final report ####################################################

    print(datetime.now().strftime(time_format) + 'Done!')


def _COG2014_export_matrices(output_folder, folder_name, measure, with_depth,
                             pct_by_all):
    """Build the cog_cate and cog_id matrices of one measure and print their file names.

    ``measure`` is 'GeneNumber' or 'TotalDepth' and is only used to build the
    matrix file names; ``with_depth`` and ``pct_by_all`` are forwarded to
    ``get_COG_annot_df``.
    """

    # stats level -> [absolute, pct, pct_by_all] matrix paths
    matrix_paths = {}
    for stats_level, level_tag in [('cog_cate', 'cate'), ('cog_id', 'id')]:
        matrix_paths[stats_level] = [
            '%s/%s_COG2014_%s_%s%s.txt' %
            (output_folder, folder_name, level_tag, measure, suffix)
            for suffix in ['', '_pct', '_pct_by_all']
        ]

    # the pct_by_all=False pass always runs, the pct_by_all=True pass is
    # only added when it was requested
    pct_by_all_passes = [False]
    if pct_by_all is True:
        pct_by_all_passes.append(True)

    for current_pct_by_all in pct_by_all_passes:
        for stats_level in ['cog_cate', 'cog_id']:
            df_abs, df_pct, df_pct_by_all = matrix_paths[stats_level]
            get_COG_annot_df(output_folder,
                             stats_level,
                             df_abs,
                             df_pct,
                             df_pct_by_all,
                             with_depth=with_depth,
                             pct_by_all=current_pct_by_all)

    # report (file names only, without the folder part)
    for stats_level in ['cog_id', 'cog_cate']:
        file_names = [
            each_path.split('/')[-1] for each_path in matrix_paths[stats_level]
        ]
        if pct_by_all is False:
            print(datetime.now().strftime(time_format) + '%s and %s' %
                  (file_names[0], file_names[1]))
        else:
            print(datetime.now().strftime(time_format) + '%s, %s and %s' %
                  (file_names[0], file_names[1], file_names[2]))
Пример #17
0
def rename_seq(args):
    """Rename the sequences of one file, or of every matching file in a folder.

    ``args`` is a dict providing the following keys:
        in       path to a sequence file or to a folder of sequence files
        x        extension of the sequence files (only used in folder mode)
        sep_in   forwarded unchanged to ``rename_seq_worker``
        sep_out  forwarded unchanged to ``rename_seq_worker``
        n        forwarded unchanged to ``rename_seq_worker``
        prefix   file mode: forwarded unchanged to ``rename_seq_worker``;
                 folder mode: None, or path to a tab-separated file whose
                 first column is a sequence file name and whose second
                 column is the prefix handed to the worker for that file
        oneline  forwarded unchanged to ``rename_seq_worker``
        t        number of processes used in folder mode

    File mode writes ``<basename>_renamed<ext>`` next to the input file,
    folder mode writes into ``<folder name>_renamed`` in the current working
    directory. Existing outputs are never overwritten: the function prints a
    message and calls ``exit()`` instead. Returns None.
    """

    seq_file_in = args['in']
    file_extension = args['x']
    sep_in = args['sep_in']
    sep_out = args['sep_out']
    column_to_keep = args['n']
    add_prefix = args['prefix']
    one_line = args['oneline']
    num_threads = args['t']

    if os.path.isfile(seq_file_in) is True:

        ctg_file_path, ctg_file_basename, ctg_file_ext = sep_path_basename_ext(
            seq_file_in)
        seq_file_out = '%s/%s_renamed%s' % (ctg_file_path, ctg_file_basename,
                                            ctg_file_ext)

        # The guard has to look at the OUTPUT file. Testing the input file
        # here is always true in this branch, so nothing was ever renamed.
        if os.path.isfile(seq_file_out) is True:
            print('Output file detected, program exited: %s' % seq_file_out)
            exit()

        rename_seq_worker([
            seq_file_in, seq_file_out, sep_in, sep_out, column_to_keep,
            add_prefix, one_line
        ])

    elif os.path.isdir(seq_file_in) is True:

        if seq_file_in[-1] == '/':
            seq_file_in = seq_file_in[:-1]

        seq_in_folder_no_path = seq_file_in.split('/')[-1]
        seq_out_folder = '%s_renamed' % seq_in_folder_no_path

        seq_in_re = '%s/*.%s' % (seq_file_in, file_extension)
        seq_in_list = [
            os.path.basename(file_name) for file_name in glob.glob(seq_in_re)
        ]
        if len(seq_in_list) == 0:
            print('No sequence file detected, program exited!')
            exit()

        if os.path.isdir(seq_out_folder) is True:
            print('Output folder detected, program exited: %s' %
                  seq_out_folder)
            exit()
        os.mkdir(seq_out_folder)

        prefix_dict = {}
        if add_prefix is not None:
            if os.path.isfile(add_prefix) is False:
                print('Prefix file not detected, program exited!')
                exit()

            # read in prefix (file name -> prefix), ignoring empty lines
            with open(add_prefix) as prefix_handle:
                for each_genome in prefix_handle:
                    if each_genome.strip() == '':
                        continue
                    each_genome_split = each_genome.strip().split('\t')
                    prefix_dict[each_genome_split[0]] = each_genome_split[1]

            # every input file needs a prefix; sorted so the message is stable
            genome_without_prefix = sorted(
                each_seq_file for each_seq_file in seq_in_list
                if each_seq_file not in prefix_dict)
            if len(genome_without_prefix) > 0:
                print(
                    'Prefix for the following files not found, , program exited!'
                )
                print(','.join(genome_without_prefix))
                exit()

        argument_lol_for_rename_seq_worker = []
        for each_seq_file in seq_in_list:
            pwd_seq_in = '%s/%s' % (seq_file_in, each_seq_file)
            pwd_seq_out = '%s/%s' % (seq_out_folder, each_seq_file)
            argument_lol_for_rename_seq_worker.append([
                pwd_seq_in, pwd_seq_out, sep_in, sep_out, column_to_keep,
                prefix_dict.get(each_seq_file, None), one_line
            ])

        # rename sequence files with multiprocessing
        pool = mp.Pool(processes=num_threads)
        pool.map(rename_seq_worker, argument_lol_for_rename_seq_worker)
        pool.close()
        pool.join()

    else:
        # neither a file nor a folder: report it instead of printing 'Done!'
        print('Input file/folder not found, program exited: %s' % seq_file_in)
        exit()

    print('Done!')
Пример #18
0
def COG2020_worker(argument_list):
    """Annotate one sequence file with COGs and write its statistics files.

    ``argument_list`` is a list read by position:
        0   path to the input sequence file
        1   path to the reference protein db (for diamond, '.dmnd' is appended)
        2   dict: reference protein id -> iterable of COG ids
        3   dict: COG id -> category string (every character is counted as
            one category)
        4   dict: COG id -> description
        5   list of categories reported in the func_stats files
        6   dict: category -> description
        7   sequence type: 'N'/'n' (translated with ``dna2aa`` first) or 'P'/'p'
        8   output folder; results go to ``<output folder>/<basename>_COG2020_wd``
        9   number of threads given to blastp/diamond
        10  True to search with diamond, False to search with blastp
        11  e-value cutoff
        12  None, or a tab-separated depth file (gene id, depth)
        13  True to also write the ``*_pct_by_all`` files

    All results are written to files; the function returns None and calls
    ``exit()`` when the sequence type is not recognised.
    """

    pwd_input_file = argument_list[0]
    pwd_prot2003_2014 = argument_list[1]
    protein_id_to_cog_id_dict = argument_list[2]
    cog_id_to_category_dict = argument_list[3]
    cog_id_to_description_dict = argument_list[4]
    cog_category_list = argument_list[5]
    cog_category_to_description_dict = argument_list[6]
    sequence_type = argument_list[7]
    output_folder = argument_list[8]
    thread_num = argument_list[9]
    run_diamond = argument_list[10]
    evalue_cutoff = argument_list[11]
    depth_file = argument_list[12]
    pct_by_all = argument_list[13]

    _, input_seq_no_ext, _ = sep_path_basename_ext(pwd_input_file)
    current_output_folder = '%s/%s_COG2020_wd' % (output_folder,
                                                  input_seq_no_ext)

    def pwd_output(file_name_suffix):
        # every output file is <working folder>/<input basename><suffix>
        return '%s/%s%s' % (current_output_folder, input_seq_no_ext,
                            file_name_suffix)

    pwd_blastp_output = pwd_output('_blastp.tab')
    pwd_blastp_output_besthits = pwd_output('_blastp_besthits.tab')
    pwd_query_to_cog_txt = pwd_output('_query_to_cog.txt')

    pwd_cog_stats_GeneNumber = pwd_output('_cog_stats_GeneNumber.txt')
    pwd_cog_stats_TotalDepth = pwd_output('_cog_stats_TotalDepth.txt')
    pwd_func_stats_GeneNumber = pwd_output('_func_stats_GeneNumber.txt')
    pwd_func_stats_TotalDepth = pwd_output('_func_stats_TotalDepth.txt')

    pwd_cog_stats_GeneNumber_pct = pwd_output('_cog_stats_GeneNumber_pct.txt')
    pwd_cog_stats_TotalDepth_pct = pwd_output('_cog_stats_TotalDepth_pct.txt')
    pwd_func_stats_GeneNumber_pct = pwd_output('_func_stats_GeneNumber_pct.txt')
    pwd_func_stats_TotalDepth_pct = pwd_output('_func_stats_TotalDepth_pct.txt')

    pwd_cog_stats_GeneNumber_pct_by_all = pwd_output('_cog_stats_GeneNumber_pct_by_all.txt')
    pwd_cog_stats_TotalDepth_pct_by_all = pwd_output('_cog_stats_TotalDepth_pct_by_all.txt')
    pwd_func_stats_GeneNumber_pct_by_all = pwd_output('_func_stats_GeneNumber_pct_by_all.txt')
    pwd_func_stats_TotalDepth_pct_by_all = pwd_output('_func_stats_TotalDepth_pct_by_all.txt')

    force_create_folder(current_output_folder)

    # get the protein sequences to search with
    if sequence_type in ['N', 'n']:
        # the translated file name carries no folder, so it is written
        # relative to the current working directory
        input_seq_aa = '%s_aa.fasta' % input_seq_no_ext
        dna2aa(pwd_input_file, input_seq_aa)
    elif sequence_type in ['P', 'p']:
        input_seq_aa = pwd_input_file
    else:
        print('Specified input sequence type unrecognizable, program exited!')
        exit()

    # run blastp
    if run_diamond is False:
        os.system(
            'blastp -query %s -db %s -out %s -evalue %s -outfmt 6 -show_gis -num_threads %s'
            % (input_seq_aa, pwd_prot2003_2014, pwd_blastp_output,
               evalue_cutoff, thread_num))
    else:
        os.system(
            'diamond blastp -q %s --db %s.dmnd --out %s --evalue %s --outfmt 6 --threads %s --quiet'
            % (input_seq_aa, pwd_prot2003_2014, pwd_blastp_output,
               evalue_cutoff, thread_num))

    # keep only best hits
    best_hit({'i': pwd_blastp_output, 'o': pwd_blastp_output_besthits})

    # get query_to_ref_protein_dict, dots in the subject id become underscores
    query_to_ref_protein_dict = {}
    with open(pwd_blastp_output_besthits) as besthits_handle:
        for each_hit in besthits_handle:
            each_hit_split = each_hit.strip().split('\t')
            query_to_ref_protein_dict[each_hit_split[0]] = '_'.join(
                each_hit_split[1].split('.'))

    # get query sequences list
    query_seq_list = [
        query_seq.id for query_seq in SeqIO.parse(pwd_input_file, 'fasta')
    ]

    # export annotation and collect the gene members of every COG / category
    # (a gene is listed once per assignment, so len(members) is the GeneNumber)
    cog_id_to_gene_member_dict = {}
    cog_cate_to_gene_member_dict = {}
    genes_with_cog = set()
    with open(pwd_query_to_cog_txt, 'w') as query_to_cog_handle:
        query_to_cog_handle.write('Query\tCOG\tCategory\tDescription\n')
        for query_gene in sorted(query_seq_list):

            # genes without hit, or whose hit has no COG, get an id-only line
            if (query_gene not in query_to_ref_protein_dict) or (
                    query_to_ref_protein_dict[query_gene]
                    not in protein_id_to_cog_id_dict):
                query_to_cog_handle.write('%s\n' % (query_gene))
                continue

            db_protein_id = query_to_ref_protein_dict[query_gene]
            for cog_id in protein_id_to_cog_id_dict[db_protein_id]:
                cog_cate = cog_id_to_category_dict[cog_id]
                cog_des = cog_id_to_description_dict[cog_id]
                query_to_cog_handle.write(
                    '%s\t%s\t%s\t%s\n' %
                    (query_gene, cog_id, cog_cate, cog_des))
                genes_with_cog.add(query_gene)

                cog_id_to_gene_member_dict.setdefault(cog_id, []).append(
                    query_gene)

                # a COG with several category letters counts for each of them
                for each_cog_cate in cog_cate:
                    cog_cate_to_gene_member_dict.setdefault(
                        each_cog_cate, []).append(query_gene)

    # read in depth info and get the TotalDepth of all query genes and of the
    # genes with cog assignment
    gene_depth_dict = {}
    if depth_file is not None:
        with open(depth_file) as depth_handle:
            for each_depth in depth_handle:
                each_depth_split = each_depth.strip().split('\t')
                gene_depth_dict[each_depth_split[0]] = float(
                    each_depth_split[1])

        genes_with_cog_TotalDepth = get_gene_list_TotalDepth(
            genes_with_cog, gene_depth_dict)
        total_depth_for_all_query_genes = get_gene_list_TotalDepth(
            query_seq_list, gene_depth_dict)

    #################### export cog_stats_GeneNumber ####################

    with open(pwd_cog_stats_GeneNumber, 'w') as stats_handle:
        stats_handle.write('COG\tGeneNumber\tDescription\n')
        for each_cog_id in cog_id_to_gene_member_dict:
            stats_handle.write(
                '%s\t%s\t%s\n' %
                (each_cog_id, len(cog_id_to_gene_member_dict[each_cog_id]),
                 cog_id_to_description_dict[each_cog_id]))

    #################### export cog_stats_TotalDepth ####################

    if depth_file is not None:
        with open(pwd_cog_stats_TotalDepth, 'w') as stats_handle:
            stats_handle.write('COG\tTotalDepth\tDescription\n')
            for each_cog_id in cog_id_to_gene_member_dict:
                each_cog_id_TotalDepth = 0
                for each_gene in cog_id_to_gene_member_dict[each_cog_id]:
                    each_cog_id_TotalDepth += gene_depth_dict[each_gene]
                # keep two decimals
                each_cog_id_TotalDepth = float(
                    "{0:.2f}".format(each_cog_id_TotalDepth))
                stats_handle.write(
                    '%s\t%s\t%s\n' % (each_cog_id, each_cog_id_TotalDepth,
                                      cog_id_to_description_dict[each_cog_id]))

    #################### export func_stats_GeneNumber ####################

    # every category of cog_category_list is reported, 0 if it has no gene
    with open(pwd_func_stats_GeneNumber, 'w') as stats_handle:
        stats_handle.write('Category\tGeneNumber\tDescription\n')
        for each_cog_cate in cog_category_list:
            each_cog_cate_GeneNumber = len(
                cog_cate_to_gene_member_dict.get(each_cog_cate, []))
            stats_handle.write(
                '%s\t%s\t%s\n' %
                (each_cog_cate, each_cog_cate_GeneNumber,
                 cog_category_to_description_dict[each_cog_cate]))

    #################### export func_stats_TotalDepth ####################

    if depth_file is not None:
        with open(pwd_func_stats_TotalDepth, 'w') as stats_handle:
            stats_handle.write('Category\tTotalDepth\tDescription\n')
            for each_cog_cate in cog_category_list:
                each_cog_cate_TotalDepth = 0
                for each_gene in cog_cate_to_gene_member_dict.get(
                        each_cog_cate, []):
                    each_cog_cate_TotalDepth += gene_depth_dict[each_gene]
                # keep two decimals
                each_cog_cate_TotalDepth = float(
                    "{0:.2f}".format(each_cog_cate_TotalDepth))
                stats_handle.write(
                    '%s\t%s\t%s\n' %
                    (each_cog_cate, each_cog_cate_TotalDepth,
                     cog_category_to_description_dict[each_cog_cate]))

    #################### get pct files ####################

    # NOTE(review): the per-COG pct files get a 'Category' header although
    # their input file is headed 'COG'. This looks like a copy-paste slip, but
    # the text is kept as it is because the readers of these files are not
    # visible from here - confirm before changing it.
    header_GeneNumber_pct = 'Category\tGeneNumber_pct\tDescription\n'
    header_TotalDepth_pct = 'Category\tTotalDepth_pct\tDescription\n'
    header_GeneNumber_pct_by_all = 'Category\tGeneNumber_pct_by_all\tDescription\n'
    header_TotalDepth_pct_by_all = 'Category\tTotalDepth_pct_by_all\tDescription\n'

    # normalised by the genes that got a COG assignment
    AnnotateNorm(file_in=pwd_cog_stats_GeneNumber,
                 skip_header=True,
                 value_column=2,
                 Divisor_value=len(genes_with_cog),
                 file_out=pwd_cog_stats_GeneNumber_pct,
                 file_out_header=header_GeneNumber_pct)
    AnnotateNorm(file_in=pwd_func_stats_GeneNumber,
                 skip_header=True,
                 value_column=2,
                 Divisor_value=len(genes_with_cog),
                 file_out=pwd_func_stats_GeneNumber_pct,
                 file_out_header=header_GeneNumber_pct)
    if depth_file is not None:
        AnnotateNorm(file_in=pwd_cog_stats_TotalDepth,
                     skip_header=True,
                     value_column=2,
                     Divisor_value=genes_with_cog_TotalDepth,
                     file_out=pwd_cog_stats_TotalDepth_pct,
                     file_out_header=header_TotalDepth_pct)
        AnnotateNorm(file_in=pwd_func_stats_TotalDepth,
                     skip_header=True,
                     value_column=2,
                     Divisor_value=genes_with_cog_TotalDepth,
                     file_out=pwd_func_stats_TotalDepth_pct,
                     file_out_header=header_TotalDepth_pct)

    # normalised by all query genes
    if pct_by_all is True:
        AnnotateNorm(file_in=pwd_cog_stats_GeneNumber,
                     skip_header=True,
                     value_column=2,
                     Divisor_value=len(query_seq_list),
                     file_out=pwd_cog_stats_GeneNumber_pct_by_all,
                     file_out_header=header_GeneNumber_pct_by_all)
        AnnotateNorm(file_in=pwd_func_stats_GeneNumber,
                     skip_header=True,
                     value_column=2,
                     Divisor_value=len(query_seq_list),
                     file_out=pwd_func_stats_GeneNumber_pct_by_all,
                     file_out_header=header_GeneNumber_pct_by_all)
        if depth_file is not None:
            AnnotateNorm(file_in=pwd_cog_stats_TotalDepth,
                         skip_header=True,
                         value_column=2,
                         Divisor_value=total_depth_for_all_query_genes,
                         file_out=pwd_cog_stats_TotalDepth_pct_by_all,
                         file_out_header=header_TotalDepth_pct_by_all)
            AnnotateNorm(file_in=pwd_func_stats_TotalDepth,
                         skip_header=True,
                         value_column=2,
                         Divisor_value=total_depth_for_all_query_genes,
                         file_out=pwd_func_stats_TotalDepth_pct_by_all,
                         file_out_header=header_TotalDepth_pct_by_all)
Пример #19
0
def BLCA_op_parser(args):
    """Reformat a BLCA classification file into two simpler tables.

    The input file (``args['in']``) is read line by line; each non-blank line
    must hold a sequence ID and a BLCA taxonomy string separated by a tab.
    Two tab-separated files are written next to the input file:

    * ``<basename>_reformatted_1.txt``: taxon names each followed by their
      confidence value in parentheses (value truncated to 5 characters).
    * ``<basename>_reformatted_2.txt``: taxon names each prefixed with a
      one-letter rank code (``d__``, ``p__``, ... ``s__``).

    In both files the taxa are joined with ``;`` and spaces are replaced by
    underscores. Records whose taxonomy is exactly ``Unclassified`` are
    written out unchanged.

    Args:
        args: dict-like object; only the ``'in'`` key (path to the BLCA
            output file) is read.

    Raises:
        IndexError: if a non-blank line has no tab-separated second column,
            or a taxonomy chunk has no ``;``-separated confidence value.
    """
    blca_output = args['in']

    # NOTE(review): sep_path_basename_ext is defined elsewhere in this file;
    # presumably it returns (directory, basename, extension) -- confirm.
    file_in_path, file_in_basename, _file_in_ext = sep_path_basename_ext(
        blca_output)
    output_file_1 = '%s/%s_reformatted_1.txt' % (file_in_path,
                                                 file_in_basename)
    output_file_2 = '%s/%s_reformatted_2.txt' % (file_in_path,
                                                 file_in_basename)

    # Read the input file. A dict is used so that a repeated sequence ID keeps
    # only its last taxonomy string (at the position of its first occurrence).
    s16_taxon_blca_dict = {}
    with open(blca_output) as blca_output_handle:
        for each_16s_taxon in blca_output_handle:
            each_16s_taxon_stripped = each_16s_taxon.strip()
            if not each_16s_taxon_stripped:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            each_16s_taxon_split = each_16s_taxon_stripped.split('\t')
            s16_taxon_blca_dict[each_16s_taxon_split[0]] = each_16s_taxon_split[1]

    # Format every record before touching the output files, so that a
    # malformed record aborts the run without leaving partial output behind.
    formatted_records = []
    for each_16s, taxon_blca_raw in s16_taxon_blca_dict.items():
        taxon_with_num, taxon_no_num = _format_blca_taxon(taxon_blca_raw)
        formatted_records.append((each_16s,
                                  taxon_with_num.replace(' ', '_'),
                                  taxon_no_num.replace(' ', '_')))

    with open(output_file_1, 'w') as output_file_1_handle, \
            open(output_file_2, 'w') as output_file_2_handle:
        for each_seq, taxon_with_num, taxon_no_num in formatted_records:
            output_file_1_handle.write('%s\t%s\n' % (each_seq, taxon_with_num))
            output_file_2_handle.write('%s\t%s\n' % (each_seq, taxon_no_num))


# Maps the LAST ';'-separated token of a taxonomy chunk to the rank prefix of
# that chunk. After splitting the raw string on ':', each chunk ends with the
# label that preceded the next ':', so the prefix is one rank above the label;
# an empty trailing token maps to 's'. Unknown labels yield an empty prefix.
# NOTE(review): presumably the raw format is
# 'superkingdom:NAME;CONF;phylum:NAME;CONF;...;species:NAME;CONF;' -- confirm.
_BLCA_TRAILING_LABEL_TO_RANK = {
    'phylum': 'd',
    'class': 'p',
    'order': 'c',
    'family': 'o',
    'genus': 'f',
    'species': 'g',
    '': 's',
}


def _format_blca_taxon(taxon_blca_raw):
    """Return (taxa_with_confidence, taxa_with_rank_prefix) for one record."""
    if taxon_blca_raw == 'Unclassified':
        return 'Unclassified', 'Unclassified'

    formatted_taxon_list_with_num = []
    formatted_taxon_list_no_num = []
    # The text before the first ':' is dropped; every later chunk is split
    # into 'name;confidence;trailing_label'.
    for each_str in taxon_blca_raw.strip().split(':')[1:]:
        each_str_split = each_str.split(';')
        current_rank = _BLCA_TRAILING_LABEL_TO_RANK.get(each_str_split[-1], '')

        formatted_taxon_list_with_num.append(
            '%s(%s)' % (each_str_split[0], each_str_split[1][:5]))
        formatted_taxon_list_no_num.append(
            '%s__%s' % (current_rank, each_str_split[0]))

    return (';'.join(formatted_taxon_list_with_num),
            ';'.join(formatted_taxon_list_no_num))