Exemplo n.º 1
0
def mcl_predict(blast_results_file, min_ident, min_cov, evalue, min_length,
                tmp_dir):
    """Filter the hits in a BLAST results file and cluster them with MCL.

    Rows are kept when the alignment ``length`` is at least ``min_length``,
    ``qlen`` lies between ``min_length`` and 400000 (inclusive) and ``qcovs``
    is at least ``min_cov``. For every remaining row the value in the second
    column is split on ``'|'`` and its second part is stored in ``sseqid``.
    The filtered table is written, tab separated and without a header, to
    ``<tmp_dir>/filtered_mcl_blast.txt`` and that file is handed to ``mcl``.

    Args:
        blast_results_file: Path of the BLAST results file to read.
        min_ident: Accepted for interface compatibility; not used here.
        min_cov: Minimum ``qcovs`` value a row needs to be kept.
        evalue: Accepted for interface compatibility; not used here.
        min_length: Minimum alignment ``length`` and minimum ``qlen``.
        tmp_dir: Directory that receives the filtered table (and is passed
            on to ``mcl``).

    Returns:
        An empty dict when ``blast_results_file`` is empty, otherwise the
        result of ``mcl(...).getclusters()``.

    Raises:
        ValueError: If a second-column value does not split into exactly
            two ``'|'``-separated parts.
    """
    import inspect

    if os.path.getsize(blast_results_file) == 0:
        return dict()

    blast_df = BlastReader(blast_results_file).df
    keep = ((blast_df['length'] >= min_length)
            & (blast_df['qlen'] <= 400000)
            & (blast_df['qlen'] >= min_length)
            & (blast_df['qcovs'] >= min_cov))
    blast_df = blast_df.loc[keep].reset_index(drop=True)

    # The identifier is read by position (second column) exactly as before;
    # presumably that column is 'sseqid' itself -- TODO confirm against
    # BlastReader's column order.
    cluster_ids = []
    for subject_id in blast_df.iloc[:, 1]:
        # Two-name unpacking keeps the ValueError on malformed identifiers.
        _, clust_id = subject_id.split('|')
        cluster_ids.append(clust_id)
    blast_df['sseqid'] = cluster_ids

    filtered_blast = os.path.join(tmp_dir, 'filtered_mcl_blast.txt')
    # pandas renamed 'line_terminator' to 'lineterminator' (1.5) and removed
    # the old spelling (2.0); use whichever the installed version accepts.
    to_csv_params = inspect.signature(blast_df.to_csv).parameters
    if 'lineterminator' in to_csv_params:
        newline_kwarg = 'lineterminator'
    else:
        newline_kwarg = 'line_terminator'
    blast_df.to_csv(filtered_blast,
                    sep='\t',
                    header=False,
                    index=False,
                    **{newline_kwarg: '\n'})

    return mcl(filtered_blast, tmp_dir).getclusters()
Exemplo n.º 2
0
def repetitive_blast(input_fasta,
                     ref_db,
                     min_ident,
                     min_cov,
                     evalue,
                     min_length,
                     tmp_dir,
                     blast_results_file,
                     num_threads=1):
    """Run a megablast search and keep the best hit for every query.

    ``input_fasta`` is searched against ``ref_db`` (passed straight to
    ``run_blast`` as ``db_path``; no database is built here) and the results
    are written to ``blast_results_file``. Hits are then filtered on
    ``length >= min_length``, ``pident >= min_ident``, ``qcovs >= min_cov``
    and ``qcovhsp >= 25``, passed through ``fixStart`` and sorted by
    ``sseqid``, ``sstart``, ``send`` (ascending) and ``bitscore``
    (descending).

    Args:
        input_fasta: Query FASTA path.
        ref_db: BLAST database path.
        min_ident: Minimum percent identity, used for the search and filter.
        min_cov: Minimum query coverage, used for the search and filter.
        evalue: E-value passed to the search.
        min_length: Minimum alignment length for a hit to be kept.
        tmp_dir: Working directory handed to ``BlastRunner``.
        blast_results_file: Path the BLAST output is written to and read from.
        num_threads: Number of threads passed to the search.

    Returns:
        An empty dict when the search produced an empty file, otherwise a
        dict mapping each ``qseqid`` to
        ``{'id', 'score', 'contig_start', 'contig_end'}`` taken from that
        query's highest-bitscore hit (``sseqid``, ``bitscore``, ``sstart``,
        ``send``). On equal bitscores the first hit in sort order wins.
    """
    blast_runner = BlastRunner(input_fasta, tmp_dir)
    blast_runner.run_blast(query_fasta_path=input_fasta,
                           blast_task='megablast',
                           db_path=ref_db,
                           db_type='nucl',
                           min_cov=min_cov,
                           min_ident=min_ident,
                           evalue=evalue,
                           blast_outfile=blast_results_file,
                           num_threads=num_threads)
    if os.path.getsize(blast_results_file) == 0:
        return dict()

    blast_df = BlastReader(blast_results_file).df
    keep = ((blast_df['length'] >= min_length)
            & (blast_df['pident'] >= min_ident)
            & (blast_df['qcovs'] >= min_cov)
            & (blast_df['qcovhsp'] >= 25))
    blast_df = fixStart(blast_df.loc[keep])
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                    ascending=[True, True, True, False])
    blast_df = blast_df.reset_index(drop=True)

    contig_list = dict()
    for _, row in blast_df.iterrows():
        query_id = row['qseqid']
        # A higher bitscore is a better alignment, so an existing entry is
        # only replaced by a strictly better hit. (The comparison used to be
        # inverted, which left the *worst* hit in the result.)
        if (query_id not in contig_list
                or row['bitscore'] > contig_list[query_id]['score']):
            contig_list[query_id] = {
                'id': row['sseqid'],
                'score': row['bitscore'],
                'contig_start': row['sstart'],
                'contig_end': row['send']
            }

    return contig_list
Exemplo n.º 3
0
    def overhangDetection(self, blast_results_file, min_length=25):
        """Label contigs whose self-alignments indicate a circular sequence.

        Only hits where query and subject ids are equal, the query starts at
        position 1, the alignment is at least ``min_length`` long and the
        query and subject coordinates are not identical are considered.

        Args:
            blast_results_file: Path of the BLAST results file to read.
            min_length: Minimum alignment ``length`` for a hit to count.

        Returns:
            A dict mapping contig id to either
            ``'Circular: Complete concatemer'`` or
            ``'Circular: Overlap <length> bp'``; empty when the results file
            is empty or nothing qualifies. A later qualifying hit for the
            same contig overwrites an earlier label.
        """
        if os.path.getsize(blast_results_file) == 0:
            return dict()

        hits = BlastReader(blast_results_file).df
        circular_contigs = {}

        for _, hit in hits.iterrows():
            query_id, subject_id = hit['qseqid'], hit['sseqid']
            s_start, s_end = hit['sstart'], hit['send']
            q_start, q_end = hit['qstart'], hit['qend']
            q_len = hit['qlen']
            half = int(q_len / 2)
            aln_len = hit['length']

            # Skip hits against other sequences, hits that do not begin at
            # the first base of the query, and alignments that are too short.
            if query_id != subject_id or q_start != 1 or aln_len < min_length:
                continue

            # Skip alignments whose query and subject coordinates coincide.
            if q_start == s_start and q_end == s_end:
                continue

            covers_first_half = q_start == 1 and q_end == half
            # Given the q_start guard above, this can only hold when
            # half + 1 == 1; kept so the outcome stays exactly the same.
            covers_second_half = q_start == half + 1 and q_end == q_len

            if covers_first_half or covers_second_half:
                label = 'Circular: Complete concatemer'
            elif q_start == 1 and s_end == q_len:
                label = 'Circular: Overlap {} bp'.format(aln_len)
            else:
                continue
            circular_contigs[query_id] = label

        return circular_contigs
Exemplo n.º 4
0
    def overhangDetection(self, blast_results_file, logging, min_length=25):
        """Label contigs whose start aligns to their own end.

        Hits are read sorted by ``qseqid``, ``qstart``, ``qend`` (ascending)
        and ``bitscore`` (descending). A hit counts when the subject id is
        the query id itself or ``'ref|<query id>|'``, the query starts at
        position 1, the alignment is at least ``min_length`` long, the query
        and subject coordinates are not identical, and the subject end
        equals the query length.

        Args:
            blast_results_file: Path of the BLAST results file to read.
            logging: Passed through to ``BlastReader``.
            min_length: Minimum alignment ``length`` for a hit to count.

        Returns:
            A dict mapping contig id to ``'Circular: Overlap <length> bp'``;
            empty when the results file is empty or nothing qualifies.
        """
        if os.path.getsize(blast_results_file) == 0:
            return dict()

        hits = BlastReader(blast_results_file, logging).df
        hits = hits.sort_values(['qseqid', 'qstart', 'qend', 'bitscore'],
                                ascending=[True, True, True, False])

        circular_contigs = {}

        for _, hit in hits.iterrows():
            query_id = hit['qseqid']
            subject_id = hit['sseqid']
            s_start = int(hit['sstart'])
            s_end = int(hit['send'])
            q_start = int(hit['qstart'])
            q_end = int(hit['qend'])
            q_len = int(hit['qlen'])
            aln_len = int(hit['length'])

            # The subject may carry the query id bare or wrapped as 'ref|id|'.
            is_self_hit = (query_id == subject_id
                           or subject_id == "ref|{}|".format(query_id))
            if not is_self_hit:
                continue

            starts_at_first_base = q_start == 1
            if not starts_at_first_base or aln_len < min_length:
                continue

            # Skip alignments whose query and subject coordinates coincide.
            if q_start == s_start and q_end == s_end:
                continue

            # q_start is known to be 1 here, so only the subject end decides.
            if s_end == q_len:
                circular_contigs[query_id] = \
                    'Circular: Overlap {} bp'.format(aln_len)

        return circular_contigs
Exemplo n.º 5
0
def contig_blast_group(blast_results_file, overlap_threshold):
    """Assign every contig in a BLAST results file to a single cluster.

    Hits are sorted by ``sseqid``, ``sstart``, ``send`` (ascending) and
    ``bitscore`` (descending) and passed through
    ``filter_overlaping_records`` repeatedly until a pass no longer changes
    the number of rows. Each ``sseqid`` must have the form
    ``'<id>|<cluster id>'``. Clusters are ranked by the highest bitscore of
    any of their hits, and each contig (``qseqid``) is assigned to the
    best-ranked cluster among those it hit.

    Args:
        blast_results_file: Path of the BLAST results file to read.
        overlap_threshold: Forwarded to ``filter_overlaping_records``.

    Returns:
        An empty dict when the results file is empty, otherwise a dict
        mapping contig id to ``{cluster id: score}``, where ``score`` is the
        contig's highest bitscore against that cluster (never below 0).

    Raises:
        ValueError: If an ``sseqid`` does not split into exactly two
            ``'|'``-separated parts.
    """
    if os.path.getsize(blast_results_file) == 0:
        return dict()

    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                    ascending=[True, True, True, False])

    # One unconditional pass, then keep filtering until a pass leaves the
    # row count unchanged (same number of calls as the previous str/int
    # sentinel loop, without the mixed-type comparison).
    blast_df = filter_overlaping_records(blast_df, overlap_threshold, 'sseqid',
                                         'sstart', 'send', 'bitscore')
    while True:
        size_before = len(blast_df)
        blast_df = filter_overlaping_records(blast_df, overlap_threshold,
                                             'sseqid', 'sstart', 'send',
                                             'bitscore')
        if len(blast_df) == size_before:
            break

    cluster_scores = dict()  # cluster id -> highest bitscore of any hit
    contigs = dict()         # contig id -> {cluster id: best bitscore}
    for _, row in blast_df.iterrows():
        contig_id = row['qseqid']
        # Two-name unpacking keeps the ValueError on malformed identifiers.
        _, clust_id = row['sseqid'].split('|')
        score = row['bitscore']

        if clust_id not in cluster_scores or score > cluster_scores[clust_id]:
            cluster_scores[clust_id] = score

        contig_scores = contigs.setdefault(contig_id, dict())
        # Starts from 0, so non-positive scores are recorded as 0.
        contig_scores[clust_id] = max(contig_scores.get(clust_id, 0), score)

    # Rank clusters from best to worst; the sort is stable, so clusters with
    # equal scores keep their first-seen order.
    ranked = sorted(cluster_scores.items(),
                    key=lambda item: item[1],
                    reverse=True)
    rank = {clust_id: position for position, (clust_id, _) in enumerate(ranked)}

    assigned = dict()
    for contig_id, contig_scores in contigs.items():
        best_cluster = min(contig_scores, key=rank.__getitem__)
        assigned[contig_id] = {best_cluster: contig_scores[best_cluster]}

    return assigned