Exemplo n.º 1
0
def mcl_predict(blast_results_file, min_ident, min_cov, evalue, min_length,
                tmp_dir):
    if os.path.getsize(blast_results_file) == 0:
        return dict()

    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.loc[blast_df['length'] >= min_length]
    blast_df = blast_df.loc[blast_df['qlen'] <= 400000]
    blast_df = blast_df.loc[blast_df['qlen'] >= min_length]
    blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov]
    blast_df = blast_df.loc[blast_df['qlen'] >= min_length]
    blast_df = blast_df.reset_index(drop=True)
    for index, row in blast_df.iterrows():
        (seqid, clust_id) = row[1].split('|')
        blast_df.iloc[index, blast_df.columns.get_loc('sseqid')] = clust_id

    filtered_blast = os.path.join(tmp_dir, 'filtered_mcl_blast.txt')
    blast_df.to_csv(filtered_blast,
                    sep='\t',
                    header=False,
                    line_terminator='\n',
                    index=False)
    mcl_clusters = mcl(filtered_blast, tmp_dir).getclusters()

    return mcl_clusters
Exemplo n.º 2
0
def tblastn(input_fasta, blastdb, min_ident, min_covs, evalue, out_dir, blast_results_file, logging, num_threads=1,
            min_covhsp=25, seq_id_file=None):
    blast_runner = BlastRunner(input_fasta, out_dir)

    blast_runner.run_tblastn(query_fasta_path=input_fasta, blast_task='megablast', db_path=blastdb,
                             db_type='protein', min_cov=min_covs, min_ident=min_ident, evalue=evalue,
                             blast_outfile=blast_results_file,
                             num_threads=num_threads, seq_id_file=seq_id_file, logging=logging)

    if os.path.getsize(blast_results_file) == 0:
        os.remove(blast_results_file)
        return False

    blast_df = BlastReader(blast_results_file, logging).df

    blast_df = blast_df.loc[blast_df['pident'] >= min_ident]
    blast_df = blast_df.loc[blast_df['qcovs'] >= min_covs]
    blast_df = blast_df.loc[blast_df['qcovhsp'] >= min_covhsp]
    blast_df = blast_df.loc[blast_df['evalue'] <= evalue]
    blast_df = fixStart(blast_df)
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'], ascending=[True, True, True, False])
    blast_df = blast_df.reset_index(drop=True)
    blast_df.to_csv(blast_results_file, sep='\t', header=True, line_terminator='\n', index=False)

    return True
Exemplo n.º 3
0
def blastn(input_fasta, blastdb, min_ident, min_cov, evalue, min_length, out_dir, blast_results_file, logging,
           seq_filterfile=None, num_threads=1, max_length=400000, min_hsp_cov=1):
    blast_runner = BlastRunner(input_fasta, out_dir)
    blast_runner.run_blast(query_fasta_path=input_fasta, blast_task='megablast', db_path=blastdb,
                           db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue,
                           blast_outfile=blast_results_file, logging=logging, num_threads=num_threads, word_size=11,
                           seq_id_file=seq_filterfile)

    if os.path.getsize(blast_results_file) == 0:
        os.remove(blast_results_file)
        return False

    blast_df = BlastReader(blast_results_file, logging).df

    blast_df = blast_df.loc[blast_df['length'] >= min_length]
    blast_df = blast_df.loc[blast_df['qlen'] <= max_length]
    blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov]
    blast_df = blast_df.loc[blast_df['qcovhsp'] >= min_hsp_cov]
    blast_df = blast_df.loc[blast_df['evalue'] <= evalue]
    blast_df = blast_df.loc[blast_df['pident'] >= min_ident]

    blast_df = blast_df.reset_index(drop=True)
    blast_df = fixStart(blast_df)
    blast_df.to_csv(blast_results_file, sep='\t', header=True, line_terminator='\n', index=False)

    return True
Exemplo n.º 4
0
def contig_blast(input_fasta, plasmid_db, min_ident, min_cov, evalue, min_length, tmp_dir, blast_results_file,
                 num_threads=1):
    blast_runner = None
    filtered_blast = os.path.join(tmp_dir, 'filtered_blast.txt')
    blast_runner = BlastRunner(input_fasta, tmp_dir)
    blast_runner.run_blast(query_fasta_path=input_fasta, blast_task='megablast', db_path=plasmid_db,
                           db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue,
                           blast_outfile=blast_results_file, num_threads=num_threads, word_size=11)
    if os.path.getsize(blast_results_file) == 0:
        fh = open(filtered_blast, 'w', encoding="utf-8")
        fh.write('')
        fh.close()
        return dict()
    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.loc[blast_df['length'] >= min_length]
    blast_df = blast_df.loc[blast_df['qlen'] <= 400000]
    blast_df = blast_df.loc[blast_df['qlen'] >= min_length]
    blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov]
    blast_df = blast_df.reset_index(drop=True)
    blast_df.to_csv(filtered_blast, sep='\t', header=False, line_terminator='\n', index=False)
Exemplo n.º 5
0
    def run_blast(self,
                  input_fasta,
                  output_path,
                  blast_results_file,
                  logging,
                  min_cov=1,
                  min_ident=1,
                  evalue=1,
                  num_threads=1,
                  min_length=25):
        blast_runner = BlastRunner(input_fasta, output_path)
        blast_runner.makeblastdb(input_fasta, 'nucl', logging)
        blast_runner.run_blast(query_fasta_path=input_fasta,
                               blast_task='megablast',
                               db_path=input_fasta,
                               db_type='nucl',
                               min_cov=min_cov,
                               min_ident=min_ident,
                               evalue=evalue,
                               blast_outfile=blast_results_file,
                               num_threads=num_threads,
                               word_size=11,
                               logging=logging)

        if os.path.getsize(blast_results_file) == 0:
            fh = open(blast_results_file, 'w', encoding="utf-8")
            fh.write('')
            fh.close()
            return dict()

        blast_df = BlastReader(blast_results_file, logging).df
        blast_df = blast_df.loc[blast_df['length'] >= min_length]
        blast_df = blast_df.reset_index(drop=True)
        blast_df.to_csv(blast_results_file,
                        sep='\t',
                        header=False,
                        line_terminator='\n',
                        index=False)