Example #1
def run_vsearch_on_reads(se_fastq_files, pe_fastq_files, vsearch,
                         dir_vsearch_results_fa_trim, fpatt, ss, seqtk):

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        if vsearch is None:
            Log.err('vsearch is not available. Cannot continue. Exiting.')
            exit(0)
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)

    # FixMe: Expose in configuration files?
    ident = 0.85

    for se in se_fastq_files:
        dir_results = opj(dir_vsearch_results_fa_trim, se)
        min_acc_len = se_fastq_files[se]['min_acc_len']
        blast_results_fa_path = \
            se_fastq_files[se]['blast_results_path__' + ss]
        fq_path = se_fastq_files[se]['filter_path_fq']
        out_f = opj(dir_results, se + '__' + ss + '.txt')
        out_f_fastq = out_f.replace('.txt', '.fastq')
        se_fastq_files[se]['vsearch_results_path__' + ss] = out_f_fastq

        if ope(out_f_fastq):
            Log.msg('Vsearch results already exist:', se)
        else:
            make_dirs(dir_results)
            Log.msg('Running vsearch on: ' + basename(fq_path), ss)
            run_vsearch(vsearch,
                        ident=ident,
                        q_file=blast_results_fa_path,
                        db_file=fq_path,
                        out_file=out_f,
                        minlen=min_acc_len)

            Log.msg('Extracting unique vsearch hits using Seqtk:', ss)
            keep_unique_lines_in_file(out_f)
            seqtk_extract_reads(seqtk, fq_path, out_f_fastq, out_f)
            osremove(out_f)

    for pe in pe_fastq_files:
        dir_results = opj(dir_vsearch_results_fa_trim, pe)
        min_acc_len = pe_fastq_files[pe]['min_acc_len']
        blast_results_fa_path = \
            pe_fastq_files[pe]['blast_results_path__' + ss]
        fq_paths = pe_fastq_files[pe]['filter_path_fq']
        # Fill in @D@ (results directory), @N@ (sample name) and
        # @Q@ (search strategy) in the output path patterns.
        out_fs = [x.replace('@D@', dir_results) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x.replace('@Q@', ss) for x in out_fs]
        out_fs_fastq = [x.replace('.txt', '.fastq') for x in out_fs]
        pe_fastq_files[pe]['vsearch_results_path__' + ss] = out_fs_fastq

        # Skip only if all four outputs (paired R1/R2 plus the two
        # unpaired files) already exist.
        if ope(out_fs_fastq[0]) and ope(out_fs_fastq[1]) and \
           ope(out_fs_fastq[2]) and ope(out_fs_fastq[3]):
            Log.msg('Vsearch results already exist:', pe)
        else:
            make_dirs(dir_results)
            pe_trim_files = zip(fq_paths, out_fs, out_fs_fastq)
            for x in pe_trim_files:
                Log.msg('Running vsearch on: ' + basename(x[0]), ss)
                run_vsearch(vsearch,
                            ident=ident,
                            q_file=blast_results_fa_path,
                            db_file=x[0],
                            out_file=x[1],
                            minlen=min_acc_len)

            Log.msg(
                'Extracting unique vsearch hits from paired files '
                'using Seqtk:', ss)

            p1txt = out_fs[0]
            p2txt = out_fs[1]

            p1fq = fq_paths[0]
            p2fq = fq_paths[1]

            p1fq_out = out_fs_fastq[0]
            p2fq_out = out_fs_fastq[1]

            p12txt_temp = opj(dir_results, pe + '__' + ss + '_paired.txt')

            combine_text_files([p1txt, p2txt], p12txt_temp)
            keep_unique_lines_in_file(p12txt_temp)

            seqtk_extract_reads(seqtk, p1fq, p1fq_out, p12txt_temp)
            seqtk_extract_reads(seqtk, p2fq, p2fq_out, p12txt_temp)

            osremove(p1txt)
            osremove(p2txt)
            osremove(p12txt_temp)

            Log.msg(
                'Extracting unique vsearch hits from unpaired files '
                'using Seqtk:', ss)

            u1txt = out_fs[2]
            u2txt = out_fs[3]

            u1fq = fq_paths[2]
            u2fq = fq_paths[3]

            u1fq_out = out_fs_fastq[2]
            u2fq_out = out_fs_fastq[3]

            keep_unique_lines_in_file(u1txt)
            keep_unique_lines_in_file(u2txt)

            seqtk_extract_reads(seqtk, u1fq, u1fq_out, u1txt)
            seqtk_extract_reads(seqtk, u2fq, u2fq_out, u2txt)

            osremove(u1txt)
            osremove(u2txt)
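
# The paired-end branch above derives its four output paths by substituting
# placeholders in the `fpatt` patterns. Below is a minimal, self-contained
# sketch of that expansion; the pattern strings, directory, sample name, and
# search-strategy values are hypothetical stand-ins, not taken from the
# pipeline.


def expand_patterns(fpatt, dir_results, sample_name, ss):
    """Expand @D@ (results dir), @N@ (sample name), @Q@ (search strategy)."""
    out_fs = [p.replace('@D@', dir_results) for p in fpatt]
    out_fs = [p.replace('@N@', sample_name) for p in out_fs]
    return [p.replace('@Q@', ss) for p in out_fs]


# Hypothetical four-file pattern: paired R1/R2 plus the two unpaired mates.
fpatt_example = ['@D@/@N@__@Q@_paired_1.txt',
                 '@D@/@N@__@Q@_paired_2.txt',
                 '@D@/@N@__@Q@_unpaired_1.txt',
                 '@D@/@N@__@Q@_unpaired_2.txt']

print(expand_patterns(fpatt_example, '/tmp/results', 'sample_a', 'ss1'))
# ['/tmp/results/sample_a__ss1_paired_1.txt', ...]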
Example #2
def run_tblastn_on_reads(se_fastq_files, pe_fastq_files, aa_queries_file,
                         tblastn, blast_1_evalue, blast_1_max_hsps,
                         blast_1_qcov_hsp_perc, blast_1_best_hit_overhang,
                         blast_1_best_hit_score_edge, blast_1_max_target_seqs,
                         dir_blast_results_fa_trim, fpatt, ss, threads, seqtk,
                         vsearch, dir_cache_prj):

    changed_blast_1 = False

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Running BLAST on reads:', ss)
        if tblastn is None:
            Log.err('tblastn is not available. Cannot continue. Exiting.')
            exit(0)

        if vsearch is None:
            Log.err('vsearch is not available. Cannot continue. Exiting.')
            exit(0)

        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)

    cache_file = opj(dir_cache_prj, 'blast_1_settings_cache__' + ss)

    pickled = dict()
    settings = {
        'blast_1_evalue': blast_1_evalue,
        'blast_1_max_hsps': blast_1_max_hsps,
        'blast_1_qcov_hsp_perc': blast_1_qcov_hsp_perc,
        'blast_1_best_hit_overhang': blast_1_best_hit_overhang,
        'blast_1_best_hit_score_edge': blast_1_best_hit_score_edge,
        'blast_1_max_target_seqs': blast_1_max_target_seqs,
        'queries': seq_records_to_dict(read_fasta(aa_queries_file,
                                                  SEQ_TYPE_AA))
    }

    Log.msg('evalue:', str(blast_1_evalue))
    Log.msg('max_hsps:', str(blast_1_max_hsps))
    Log.msg('qcov_hsp_perc:', str(blast_1_qcov_hsp_perc))
    Log.msg('best_hit_overhang:', str(blast_1_best_hit_overhang))
    Log.msg('best_hit_score_edge:', str(blast_1_best_hit_score_edge))
    Log.msg('max_target_seqs:', str(blast_1_max_target_seqs))
    print()

    # FixMe: Expose in configuration files?
    ident = 0.85

    for se in se_fastq_files:
        dir_results = opj(dir_blast_results_fa_trim, se)
        blast_db_path = se_fastq_files[se]['blast_db_path']
        fq_path = se_fastq_files[se]['filter_path_fq']
        out_f = opj(dir_results, se + '__' + ss + '.txt')
        out_f_fastq = out_f.replace('.txt', '.fastq')
        out_f_fasta = out_f.replace('.txt', '.fasta')
        se_fastq_files[se]['blast_results_path__' + ss] = out_f_fasta
        genetic_code = se_fastq_files[se]['gc_id']

        if ope(out_f_fasta) and ope(cache_file):
            with open(cache_file, 'rb') as f:
                pickled = pickle.load(f)

        if ope(out_f_fasta) and pickled == settings:
            # Log.msg('The provided BLAST settings and query sequences did '
            #         'not change since the previous run.')
            Log.msg('BLAST results already exist:', se)

        else:
            changed_blast_1 = True
            make_dirs(dir_results)
            Log.msg('Running tblastn on: ' + basename(blast_db_path), ss)
            run_blast(exec_file=tblastn,
                      task='tblastn',
                      threads=threads,
                      db_path=blast_db_path,
                      queries_file=aa_queries_file,
                      out_file=out_f,
                      evalue=blast_1_evalue,
                      max_hsps=blast_1_max_hsps,
                      qcov_hsp_perc=blast_1_qcov_hsp_perc,
                      best_hit_overhang=blast_1_best_hit_overhang,
                      best_hit_score_edge=blast_1_best_hit_score_edge,
                      max_target_seqs=blast_1_max_target_seqs,
                      db_genetic_code=genetic_code,
                      out_cols=BLST_RES_COLS_1)

            Log.inf('Extracting unique BLAST hits using Seqtk:', ss)

            keep_unique_lines_in_file(out_f)

            seqtk_extract_reads(seqtk, fq_path, out_f_fastq, out_f)
            seqtk_fq_to_fa(seqtk, out_f_fastq, out_f_fasta)

            osremove(out_f)
            osremove(out_f_fastq)

            # Collapse near-identical hits by clustering at the fixed
            # identity threshold before keeping the final FASTA.
            out_f_fasta_temp = out_f_fasta + '_temp'
            copyfile(out_f_fasta, out_f_fasta_temp)
            run_cluster_fast(vsearch, ident, out_f_fasta_temp, out_f_fasta)
            osremove(out_f_fasta_temp)

    for pe in pe_fastq_files:
        dir_results = opj(dir_blast_results_fa_trim, pe)
        blast_db_paths = pe_fastq_files[pe]['blast_db_path']
        fq_paths = pe_fastq_files[pe]['filter_path_fq']
        # Fill in @D@ (results directory), @N@ (sample name) and
        # @Q@ (search strategy) in the output path patterns.
        out_fs = [x.replace('@D@', dir_results) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x.replace('@Q@', ss) for x in out_fs]
        out_fs_fastq = [x.replace('.txt', '.fastq') for x in out_fs]
        out_fs_fasta = [x.replace('.txt', '.fasta') for x in out_fs]
        out_f_fasta = opj(dir_results, pe + '__' + ss + '.fasta')
        pe_fastq_files[pe]['blast_results_path__' + ss] = out_f_fasta
        genetic_code = pe_fastq_files[pe]['gc_id']

        if ope(out_f_fasta) and ope(cache_file):
            with open(cache_file, 'rb') as f:
                pickled = pickle.load(f)

        if ope(out_f_fasta) and pickled == settings:
            # Log.msg('The provided BLAST settings and query sequences did '
            #         'not change since the previous run.')
            Log.msg('BLAST results already exist:', pe)

        else:
            changed_blast_1 = True
            make_dirs(dir_results)
            pe_trim_files = zip(blast_db_paths, out_fs, fq_paths, out_fs_fastq,
                                out_fs_fasta)
            for x in pe_trim_files:
                Log.msg('Running tblastn on: ' + basename(x[0]), ss)
                run_blast(exec_file=tblastn,
                          task='tblastn',
                          threads=threads,
                          db_path=x[0],
                          queries_file=aa_queries_file,
                          out_file=x[1],
                          evalue=blast_1_evalue,
                          max_hsps=blast_1_max_hsps,
                          qcov_hsp_perc=blast_1_qcov_hsp_perc,
                          best_hit_overhang=blast_1_best_hit_overhang,
                          best_hit_score_edge=blast_1_best_hit_score_edge,
                          max_target_seqs=blast_1_max_target_seqs,
                          db_genetic_code=genetic_code,
                          out_cols=BLST_RES_COLS_1)

                Log.msg('Extracting unique BLAST hits using Seqtk:', ss)

                keep_unique_lines_in_file(x[1])

                seqtk_extract_reads(seqtk, x[2], x[3], x[1])
                seqtk_fq_to_fa(seqtk, x[3], x[4])

                osremove(x[1])
                osremove(x[3])

            combine_text_files(out_fs_fasta, out_f_fasta)

            out_f_fasta_temp = out_f_fasta + '_temp'
            copyfile(out_f_fasta, out_f_fasta_temp)
            run_cluster_fast(vsearch, ident, out_f_fasta_temp, out_f_fasta)
            osremove(out_f_fasta_temp)

            for x in out_fs_fasta:
                osremove(x)

    with open(cache_file, 'wb') as f:
        pickle.dump(settings, f, protocol=PICKLE_PROTOCOL)

    return changed_blast_1
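
# run_tblastn_on_reads skips BLAST when the output FASTA exists and the
# pickled settings from the previous run compare equal to the current
# `settings` dict (which folds in the query sequences themselves). Below is a
# stripped-down sketch of that cache pattern; PICKLE_PROTOCOL is an assumed
# placeholder, since the real constant is defined elsewhere in the codebase.

import pickle
from os.path import exists as ope

PICKLE_PROTOCOL = 2  # assumed placeholder value


def results_are_current(out_file, cache_file, settings):
    """True if out_file exists and settings match the previous run."""
    if not (ope(out_file) and ope(cache_file)):
        return False
    with open(cache_file, 'rb') as f:
        return pickle.load(f) == settings


def save_settings(cache_file, settings):
    """Record the settings that produced the current outputs."""
    with open(cache_file, 'wb') as f:
        pickle.dump(settings, f, protocol=PICKLE_PROTOCOL)

# Because the query FASTA is part of `settings`, editing a single query
# sequence invalidates the cache just like changing an E-value threshold.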
Example #3
def find_orfs_translate(ss, assemblies, dir_prj_transcripts, seqtk, dir_temp,
                        prepend_assmbl, min_target_orf_len, max_target_orf_len,
                        allow_non_aug, allow_no_strt_cod, allow_no_stop_cod,
                        tax, tax_group, tax_ids_user, min_overlap, organelle):

    if len(assemblies) > 0:
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)

    for a in assemblies:

        if ('blast_hits_aa__' + ss) not in a:
            continue

        assmbl_name = a['name']
        tax_id = a['tax_id']

        parsed_hits = a['blast_hits_aa__' + ss]

        a_path = a['path']

        # Pick the translation table: nuclear by default, organellar when an
        # organelle analysis was requested for a eukaryote.
        gc_tt = a['gc_tt']
        if tax.is_eukaryote(tax_id) is True:
            if organelle == 'mitochondrion':
                gc_tt = a['gc_tt_mito']
            elif organelle == 'plastid' and \
                    tax.contains_plastid(tax_id) is True:
                gc_tt = a['gc_tt_plastid']

        transcripts_nt_fasta_file = opj(
            dir_prj_transcripts,
            assmbl_name + '_transcripts_nt__' + ss + '.fasta')

        transcripts_nt_orf_fasta_file = opj(
            dir_prj_transcripts,
            assmbl_name + '_transcripts_nt_orf__' + ss + '.fasta')

        transcripts_aa_orf_fasta_file = opj(
            dir_prj_transcripts,
            assmbl_name + '_transcripts_aa_orf__' + ss + '.fasta')

        transcripts_nt = {}
        transcripts_nt_orf = {}
        transcripts_aa_orf = {}

        transcripts_with_acceptable_orfs = set()

        ann_key = 'annotations__'

        a[ann_key + ss] = {}

        collated = collate_blast_results(parsed_hits)

        ######################################################################
        # Use seqtk to sample the assembly FASTA file for sequences with
        # BLAST hits. This increases the speed substantially when the assembly
        # file is large.
        temp_a_file = opj(dir_temp, 'temp__' + ss + '.fasta')
        temp_s_file = opj(dir_temp, 'temp__' + ss + '.txt')
        sseqids_subsample = []
        for hit in collated:
            target_name = hit['sseqid']
            sseqids_subsample.append(target_name)
        sseqids_subsample_text = '\n'.join(sseqids_subsample)
        with open(temp_s_file, 'w') as f:
            f.write(sseqids_subsample_text)
        seqtk_extract_reads(seqtk,
                            in_file=a_path,
                            out_file=temp_a_file,
                            ids_file=temp_s_file)

        with open(temp_a_file, 'r') as f:
            subsampled_fasta_text = f.read()

        # Nothing to analyze if no assembly sequences had BLAST hits.
        if subsampled_fasta_text.strip() == '':
            continue

        print()
        Log.inf('Analyzing BLAST hits', '=' * 113 + '\n')
        Log.msg('Assembly:', assmbl_name, False)
        Log.msg('Search Strategy:', ss + '\n\n' + '-' * 134 + '\n', False)

        parsed_fasta = trim_desc_to_first_space_in_fasta_text(
            subsampled_fasta_text, SEQ_TYPE_DNA)
        parsed_fasta = seq_records_to_dict(parsed_fasta)
        ######################################################################

        all_kakapo_results = {}
        json_dump_file_path = opj(dir_prj_transcripts,
                                  assmbl_name + '_ann_kakapo__' + ss + '.json')

        for hit in collated:

            target_name = hit['sseqid']
            target_seq = parsed_fasta[target_name]
            query_name = hit['qseqid']
            hit_evalue = hit['evalue']

            # Prepend assembly name to the sequence name:
            if prepend_assmbl is True:
                target_name = assmbl_name + '__' + target_name
                # Also prepend taxonomic info to the sequence name:
                if tax_id is not None:
                    fm = tax.higher_rank_for_taxid(tax_id, rank='family')
                    if fm is not None:
                        target_name = fm + '__' + target_name

            hit_start = hit['start']
            hit_end = hit['end']
            hit_frame = hit['frame']

            if allow_non_aug is True:
                start_codons = gc_tt.start_codons_ambiguous
            else:
                start_codons = ['ATG']

            stop_codons = gc_tt.stop_codons_ambiguous

            ##################################################################
            if tax_id is not None:
                tax_ids_for_orf = (tax_id, )
            else:
                tax_ids_for_orf = tax_ids_user

            # Context tables are keyed as '<taxid>_L' / '<taxid>_R'; collect
            # the distinct tax IDs for which ATG contexts are available.
            cntx_txids_avail = tuple(
                sorted(
                    set(
                        map(lambda x: int(x.split('_')[0]),
                            atg_contexts.keys()))))

            # Of the available context tax IDs, use the one with the shortest
            # taxonomic path to this sample's tax ID.
            cntx_taxid = set()
            for txid in tax_ids_for_orf:
                tax_path = partial(tax.path_between_taxids, txid)
                path_len = tuple(
                    map(len, tuple(map(tax_path, cntx_txids_avail))))
                cntx_taxid.add(cntx_txids_avail[path_len.index(min(path_len))])
            cntx_taxid = tuple(cntx_taxid)[0]

            cntx_l_key = str(cntx_taxid) + '_L'
            cntx_r_key = str(cntx_taxid) + '_R'

            cntx_l = atg_contexts[cntx_l_key]
            cntx_r = atg_contexts[cntx_r_key]
            ##################################################################

            orf_log_str = ('grade'.rjust(5) + 'ovrlp'.rjust(7) +
                           'cntx'.rjust(6) + 'length'.center(9) +
                           'cntx_l'.rjust(7) + 'cntx_r'.rjust(15) + '\n')

            orf = find_orf_for_blast_hit(seq=target_seq,
                                         frame=hit_frame,
                                         hit_start=hit_start,
                                         hit_end=hit_end,
                                         stop_codons=stop_codons,
                                         start_codons=start_codons,
                                         context_l=cntx_l,
                                         context_r=cntx_r,
                                         min_overlap=min_overlap,
                                         min_len=min_target_orf_len,
                                         max_len=max_target_orf_len,
                                         allow_no_strt_cod=allow_no_strt_cod,
                                         allow_no_stop_cod=allow_no_stop_cod)

            orf_log_str += orf[2]

            rev_comp_def_str = ''
            if hit_frame > 0:
                ann_hit_b = hit_start
                ann_hit_e = hit_end
            else:
                target_seq = reverse_complement(target_seq)
                ann_hit_b = len(target_seq) - hit_start
                ann_hit_e = len(target_seq) - hit_end
                rev_comp_def_str = '; RevComp'

            target_def = target_name + ' ' + query_name + rev_comp_def_str

            a[ann_key + ss][target_name] = {}

            good_orfs = orf[0]
            bad_orfs = orf[1]

            if len(good_orfs) > 0:
                a[ann_key + ss][target_name]['orfs_good'] = dict()
                orfs_good_dict = a[ann_key + ss][target_name]['orfs_good']
                orf_log_str += '\n' + 'VALID ' + '-' * 128 + '\n'

                for i, good_orf in enumerate(good_orfs):

                    good_orf_frame = good_orf[2]

                    if good_orf_frame > 0:
                        ann_orf_b = good_orf[0]
                        ann_orf_e = good_orf[1] + 3
                    else:
                        # Mirror coordinates onto the reverse-complemented
                        # sequence for minus-strand ORFs.
                        ann_orf_b = len(target_seq) - good_orf[1]
                        ann_orf_e = len(target_seq) - good_orf[0] + 3
                    orf_seq = target_seq[ann_orf_b:ann_orf_e]

                    orf_good_dict = dict()
                    orf_good_dict['orf_begin'] = ann_orf_b
                    orf_good_dict['orf_end'] = ann_orf_e
                    orf_good_dict['orf_frame'] = abs(good_orf_frame)
                    orf_good_dict['orf_grade'] = good_orf[3]
                    orf_good_dict['orf_tt_id'] = str(gc_tt.gc_id)
                    orf_good_dict['orf_tt_name'] = gc_tt.gc_name

                    orfs_good_dict['ORF{:03d}'.format(i + 1)] = orf_good_dict

                    target_def_orf = (target_name +
                                      '__ORF{:03d}'.format(i + 1) + ' ' +
                                      query_name + rev_comp_def_str)

                    transcripts_nt_orf[target_def_orf] = orf_seq

                    transcripts_with_acceptable_orfs.add(target_name)

                    transl_seq = translate(orf_seq, gc_tt.table_ambiguous,
                                           start_codons)

                    transcripts_aa_orf[target_def_orf] = transl_seq[:-1]

            else:
                orf_log_str += '\n' + 'NOT VALID ' + '-' * 124 + '\n'

            Log.msg('Transcript:', target_name, False)
            Log.msg('     Query:', query_name + '\n\n' + orf_log_str, False)

            if len(bad_orfs) > 0:
                a[ann_key + ss][target_name]['orfs_bad'] = dict()
                orfs_bad_dict = a[ann_key + ss][target_name]['orfs_bad']

                for i, bad_orf in enumerate(bad_orfs):

                    bad_orf_frame = bad_orf[2]

                    if bad_orf_frame > 0:
                        ann_orf_b = bad_orf[0]
                        ann_orf_e = bad_orf[1] + 3
                    else:
                        # Same minus-strand coordinate mapping as above.
                        ann_orf_b = len(target_seq) - bad_orf[1]
                        ann_orf_e = len(target_seq) - bad_orf[0] + 3
                    orf_seq = target_seq[ann_orf_b:ann_orf_e]

                    orf_bad_dict = dict()
                    orf_bad_dict['orf_begin'] = ann_orf_b
                    orf_bad_dict['orf_end'] = ann_orf_e
                    orf_bad_dict['orf_frame'] = abs(bad_orf_frame)
                    orf_bad_dict['orf_grade'] = bad_orf[3]
                    orf_bad_dict['orf_tt_id'] = str(gc_tt.gc_id)
                    orf_bad_dict['orf_tt_name'] = gc_tt.gc_name

                    orfs_bad_dict['ORF{:03d}'.format(i + 1)] = orf_bad_dict

            transcripts_nt[target_def] = target_seq

            a[ann_key + ss][target_name]['blast_hit'] = dict()
            blast_hit_dict = a[ann_key + ss][target_name]['blast_hit']
            blast_hit_dict['query_name'] = query_name
            blast_hit_dict['query_id'] = ss
            blast_hit_dict['evalue'] = hit_evalue
            blast_hit_dict['frame'] = abs(hit_frame)
            blast_hit_dict['blast_hit_begin'] = ann_hit_b
            blast_hit_dict['blast_hit_end'] = ann_hit_e

            # Collect ORF and BLAST hit annotations for downstream use. ######
            kakapo_json = [{}]
            kakapo_json[0]['kakapo_annotations__' + ss] = \
                a[ann_key + ss][target_name]
            all_kakapo_results[target_name] = kakapo_json
            ##################################################################

        # --------------------------------------------------------------------

        Log.msg('Assembly:', assmbl_name, False)
        Log.msg('Search Strategy:', ss, False)
        Log.msg('Transcripts:', str(len(transcripts_nt)), False)
        Log.msg('Transcripts with acceptable ORFs:',
                str(len(transcripts_with_acceptable_orfs)) + '\n' + '=' * 134,
                False)

        if len(transcripts_nt) > 0:
            write_fasta(transcripts_nt, transcripts_nt_fasta_file)
            a['transcripts_nt_fasta_file__' + ss] = transcripts_nt_fasta_file
        else:
            a['transcripts_nt_fasta_file__' + ss] = None

        if len(transcripts_nt_orf) > 0:
            write_fasta(transcripts_nt_orf, transcripts_nt_orf_fasta_file)
            a['transcripts_nt_orf_fasta_file__' + ss] = \
                transcripts_nt_orf_fasta_file
        else:
            a['transcripts_nt_orf_fasta_file__' + ss] = None

        if len(transcripts_aa_orf) > 0:
            write_fasta(transcripts_aa_orf, transcripts_aa_orf_fasta_file)
            a['transcripts_aa_orf_fasta_file__' + ss] = \
                transcripts_aa_orf_fasta_file
        else:
            a['transcripts_aa_orf_fasta_file__' + ss] = None

        # Save ORF and BLAST hit annotations for downstream use. -------------
        with open(json_dump_file_path, 'w') as f:
            json.dump(all_kakapo_results, f, sort_keys=True, indent=4)
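
# The minus-strand coordinate bookkeeping in find_orfs_translate is the
# subtlest part of the function. This standalone sketch reproduces just that
# mapping; the function and variable names here are illustrative, not from
# the codebase.


def orf_coords_on_reported_strand(orf_begin, orf_end, orf_frame, seq_len):
    """Map ORF coordinates to the sequence as written to the FASTA output.

    orf_begin/orf_end exclude the stop codon, hence the +3 on the end
    coordinate. For minus-strand ORFs the written sequence is the reverse
    complement, so both coordinates are mirrored around the sequence length.
    """
    if orf_frame > 0:
        return orf_begin, orf_end + 3
    return seq_len - orf_end, seq_len - orf_begin + 3


# On a 100 nt transcript, an ORF at 10..40 in frame -1 maps to 60..93.
assert orf_coords_on_reported_strand(10, 40, -1, 100) == (60, 93)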