示例#1
0
def _run_all(out_dir, query, blastdb, num_iterations, num_threads, template,
             flann_x, flann_y, flann_index, num_neighbors, score_out_name,
             score_matrix, open_penalty, extend_penalty, alignment_out_name):
    """Build PSSMs for query and template, predict scores, then align.

    Steps, in order:

    1. Run PSI-BLAST on ``query`` and then on ``template``; each ASCII PSSM
       is written to ``<out_dir>/<input stem>.mtx``.
    2. Call ``machina.predict.predict_scores`` on the two PSSM files.
    3. Call ``machina.generate_alignment.alignment_local_and_save`` with the
       score matrix ``<out_dir>/<score_matrix>`` and the two PSSM files.

    ``open_penalty`` and ``extend_penalty`` are negated before being handed
    to the alignment step.  ``out_dir`` is created if it does not exist.
    """
    work_dir = Path(out_dir).expanduser().absolute()
    work_dir.mkdir(exist_ok=True, parents=True)

    # Query first, template second -- same order as the positional
    # arguments expected by the two machina calls below.
    pssm_files = []
    for fasta in (query, template):
        fasta_path = Path(fasta).expanduser().absolute()
        pssm_file = work_dir / (fasta_path.stem + '.mtx')
        build_pssm = NcbipsiblastCommandline(
            query=fasta_path,
            db=blastdb,
            num_iterations=num_iterations,
            num_threads=num_threads,
            out_ascii_pssm=pssm_file,
            save_pssm_after_last_round=True)
        build_pssm()
        pssm_files.append(pssm_file)
    query_pssm, template_pssm = pssm_files

    machina.predict.predict_scores(query_pssm,
                                   template_pssm,
                                   flann_x=Path(flann_x),
                                   flann_y=Path(flann_y),
                                   flann_index=Path(flann_index),
                                   num_neighbors=num_neighbors,
                                   out_dir=work_dir,
                                   out_name=Path(score_out_name))

    machina.generate_alignment.alignment_local_and_save(
        work_dir / score_matrix, query_pssm, template_pssm,
        -open_penalty, -extend_penalty, work_dir, Path(alignment_out_name))
示例#2
0
 def generate_pairwise_alignment(self, query_id: str, target_id: str,
                                 out_dir: str, pssm_dir: str):
     """Align ``query_id`` against ``target_id`` with PSI-BLAST or DELTA-BLAST.

     Both sequences are obtained through ``_pp`` from
     ``<pssm_dir>/<id[2:4]>/<id>.mtx`` (presumably the sequence stored in the
     PSSM file -- confirm against ``_pp``) and written to the temporary files
     ``query.fasta`` / ``subject.fasta`` in the current working directory.
     The BLAST XML result goes to ``<out_dir>/<query_id>/<target_id>.xml``.

     With ``self.algo == 'psiblast'`` a checkpoint
     ``<pssm_dir>/<query_id[2:4]>/<query_id>.pssm`` is built first (3
     iterations against ``<self.blast_db_dir>/uniref90``) unless it already
     exists, and is then searched against the subject.  With
     ``self.algo == 'deltablast'`` the query is searched directly using
     ``<self.blast_db_dir>/cdd_delta``.  Any other value runs no search.

     The temporary FASTA files are removed even when BLAST fails.
     """
     query_fasta = Path('query.fasta')
     subject_fasta = Path('subject.fasta')
     query_prefix = f'{pssm_dir}/{query_id[2:4]}/{query_id}'
     target_prefix = f'{pssm_dir}/{target_id[2:4]}/{target_id}'
     result_xml = f'{out_dir}/{query_id}/{target_id}.xml'

     Path(f'{out_dir}/{query_id}').mkdir(parents=True, exist_ok=True)
     try:
         SeqIO.write(SeqRecord(_pp(f'{query_prefix}.mtx'), id=query_id),
                     'query.fasta', 'fasta')
         SeqIO.write(SeqRecord(_pp(f'{target_prefix}.mtx'), id=target_id),
                     'subject.fasta', 'fasta')
         if self.algo == 'psiblast':
             checkpoint = f'{query_prefix}.pssm'
             if not Path(checkpoint).exists():
                 NcbipsiblastCommandline(
                     db=f'{self.blast_db_dir}/uniref90',
                     num_iterations=3,
                     out_pssm=checkpoint,
                     query='query.fasta',
                     save_pssm_after_last_round=True,
                     num_threads=os.cpu_count())()
             NcbipsiblastCommandline(
                 in_pssm=checkpoint,
                 evalue=99999,
                 subject='subject.fasta',
                 outfmt=5,
                 out=result_xml)()
         elif self.algo == 'deltablast':
             NcbideltablastCommandline(
                 subject='subject.fasta',
                 rpsdb=f'{self.blast_db_dir}/cdd_delta',
                 evalue=99999,
                 outfmt=5,
                 out=result_xml,
                 query='query.fasta')()
     finally:
         # Clean up even if BLAST raised; a file may be missing when the
         # failure happened before it was written.
         for tmp_file in (query_fasta, subject_fasta):
             if tmp_file.exists():
                 tmp_file.unlink()
示例#3
0
def create_profile(query, template, blastdb, num_iterations, cmd, out_dir):
    """Write an ASCII PSSM for ``query`` and for ``template``.

    PSI-BLAST is run once per input against ``blastdb`` for
    ``num_iterations`` rounds; each profile is stored as
    ``<out_dir>/<input stem>.mtx``.  ``out_dir`` is created when missing.
    ``cmd`` is accepted but not used by this function.
    """
    dest_dir = Path(out_dir).expanduser().absolute()
    dest_dir.mkdir(exist_ok=True, parents=True)

    for fasta in (query, template):
        fasta_path = Path(fasta).expanduser().absolute()
        profile_cmd = NcbipsiblastCommandline(
            query=fasta_path,
            db=blastdb,
            num_iterations=num_iterations,
            out_ascii_pssm=dest_dir / (fasta_path.stem + '.mtx'),
            save_pssm_after_last_round=True)
        profile_cmd()
示例#4
0
def do_psiblast(dirpath, rec):
    """
    Run a PSIBLAST query on the given sequence.

    The record is written to ``<dirpath>/<rec.id>.fasta`` and searched
    against the ``cpdb2_db`` database (3 iterations, 6 threads, E-value
    0.001).  Hits are saved as BLAST XML (outfmt 5) in
    ``<dirpath>/<rec.id>_blast.xml``.

    Args:
        dirpath: Directory receiving the FASTA query and the XML result.
        rec: Sequence record to search with; ``rec.id`` names the files.

    Returns:
        1 on success, -1 if building or running the command raised.
    """
    # save the query to a fasta file
    query_basepath = Path(dirpath, rec.id)
    query = str(query_basepath) + ".fasta"
    SeqIO.write(rec, query, "fasta")

    try:
        psib_cline = NcbipsiblastCommandline(
            query=query,
            db="cpdb2_db",
            evalue=0.001,
            outfmt=5,
            out=str(query_basepath) + "_blast.xml",
            num_threads=6,
            num_iterations=3)
        psib_cline()
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still
        # interrupt a long batch instead of being reported as a failed record.
        print("Failed to run PSIBLAST on record %s" % rec.id)
        return -1

    return 1
示例#5
0
def process_input_user(seq_file, dir):
    """Generate one ASCII PSSM per sequence of a FASTA file.

    For the record at zero-based position ``index`` the profile is written
    to ``dir + str(index) + "_pssm.txt"``; records whose profile file already
    exists are skipped.  Each query is staged in ``dir + 'tempseq.fasta'``
    and searched against ``./db/swissprot/swissprot`` (3 iterations,
    E-value 0.001, 4 threads).

    NOTE(review): paths are built by plain string concatenation, so ``dir``
    needs a trailing separator for the files to land inside the directory
    created here -- confirm with callers before switching to os.path.join.

    Args:
        seq_file: FASTA file (path or handle) with the input sequences.
        dir: Output location prefix; created with ``os.mkdir`` if missing.
    """
    if not os.path.exists(dir):
        os.mkdir(dir)
    inputfile = dir + 'tempseq.fasta'

    # Parse everything up front, before any temporary file is written.
    for index, seq_record in enumerate(list(SeqIO.parse(seq_file, "fasta"))):
        print("in loop, processing" + str(index + 1) + "\n")
        pssmfile = dir + str(index) + "_pssm.txt"
        if os.path.exists(pssmfile):
            continue

        if os.path.exists(inputfile):
            os.remove(inputfile)
        SeqIO.write(seq_record, inputfile, 'fasta')
        try:
            psiblast_cline = NcbipsiblastCommandline(
                query=inputfile,
                db='./db/swissprot/swissprot',
                num_iterations=3,
                evalue=0.001,
                out_ascii_pssm=pssmfile,
                num_threads=4)
            psiblast_cline()
        except Exception:
            # The original concatenated the record object itself to the
            # message, which raised TypeError inside this handler.
            print("invalid protein: " + str(seq_record.id))
def run_psiblast(path, database, in_pssm, out_xml):
    """Search ``<path>/<database>.db`` with an existing PSSM checkpoint.

    The checkpoint is read from ``<path>/<in_pssm>`` and the hits (at most
    10000 target sequences, BLAST XML / outfmt 5) are written to
    ``<path>/<out_xml>``.
    """
    options = {
        'db': f'{path}/{database}.db',
        'in_pssm': f'{path}/{in_pssm}',
        'out': f'{path}/{out_xml}',
        'max_target_seqs': 10000,
        'outfmt': 5,
    }
    search = NcbipsiblastCommandline(**options)
    search()
示例#7
0
def psi_blaster(in_file, out_file):
    """Run a 3-iteration PSI-BLAST against the local ``refseq_protein.00``
    database and write the hits as BLAST XML (outfmt 5).

    Args:
        in_file [str]: Path to the input file for BLAST search.
        out_file [str]: Path of the XML file that receives the results.

    Returns:
        subprocess.CompletedProcess: Result of the finished psiblast run.

    Raises:
        subprocess.CalledProcessError: If psiblast exits with a non-zero
            status (``check=True``).

    """
    import os
    import shlex

    in_file = str(in_file)
    print("Initiating PSI-BLAST...")

    cline = NcbipsiblastCommandline(query=in_file,
                                    outfmt=5,
                                    db="refseq_protein.00",
                                    num_iterations=3,
                                    out=out_file)

    rendered = str(cline)
    if os.name == "nt":
        # POSIX-mode shlex would treat the backslashes of Windows paths as
        # escapes, so keep the plain whitespace split there.
        cmd = rendered.split(" ")
    else:
        # The rendered command quotes arguments that contain spaces; a plain
        # split(" ") would tear such a path into several arguments.
        cmd = shlex.split(rendered)

    return subprocess.run(cmd, check=True)
示例#8
0
def gen_pssm(fname, pssm_dir):
    """Create ``<pssm_dir>/<record id>_pssm.txt`` for every FASTA record.

    Records whose PSSM file already exists are skipped.  Each remaining
    record is staged in ``input.fasta`` (current working directory) and
    searched against ``Swissprot`` (E-value 0.001, 3 iterations).  A progress
    line is printed after each PSI-BLAST run; skipped records are not
    counted in it.
    """
    records = list(SeqIO.parse(fname, 'fasta'))
    total = len(records)
    query_fasta = 'input.fasta'
    completed = 0

    for record in records:
        if not os.path.exists(pssm_dir):
            os.mkdir(pssm_dir)

        pssm_path = pssm_dir + "/" + record.id + "_pssm.txt"
        if os.path.exists(pssm_path):
            # Profile already computed on an earlier run.
            continue

        # Refresh the single-sequence query file.
        if os.path.exists(query_fasta):
            os.remove(query_fasta)
        SeqIO.write(record, query_fasta, 'fasta')

        run_blast = NcbipsiblastCommandline(query=query_fasta,
                                            db='Swissprot',
                                            evalue=0.001,
                                            num_iterations=3,
                                            out_ascii_pssm=pssm_path)
        run_blast()

        completed += 1
        print("{:.2f}% ====> {}/{} finished".format(
            completed / total * 100, completed, total))
示例#9
0
def psiblast(query, db, out_dir, out_file):
    """Run PSI-BLAST and return the path of the ASCII PSSM it was told to write.

    The profile path is ``<out_dir>/<out_file>.pssm``; the search uses
    E-value 0.001 and 3 iterations and requests the PSSM of the last round.
    """
    pssm_path = os.path.join(out_dir, "{}.pssm".format(out_file))
    options = dict(db=db,
                   query=query,
                   evalue=0.001,
                   num_iterations=3,
                   save_pssm_after_last_round=True,
                   out_ascii_pssm=pssm_path)
    run_search = NcbipsiblastCommandline('psiblast', **options)
    run_search()
    return pssm_path
示例#10
0
def pssm_gen_ncbi(database, input_fasta, out_pssm, num_iter, num_thr):
    """Run PSI-BLAST on ``input_fasta`` and write its ASCII PSSM.

    Args:
        database: BLAST database to search.
        input_fasta: FASTA file holding the query sequence.
        out_pssm: Destination of the ASCII PSSM.
        num_iter: Number of PSI-BLAST iterations.
        num_thr: Number of threads handed to psiblast.

    Returns:
        None.  No ``out`` file is configured, so the XML report (outfmt 5)
        is not saved by this function.
    """
    from Bio.Blast.Applications import NcbipsiblastCommandline

    # The original passed the undefined name `inp_fasta` here, so every call
    # failed with NameError before psiblast was started.
    psi_cline = NcbipsiblastCommandline('psiblast',
                                        db=database,
                                        query=input_fasta,
                                        num_threads=num_thr,
                                        num_iterations=num_iter,
                                        outfmt=5,
                                        out_ascii_pssm=out_pssm)
    psi_cline()
示例#11
0
def getPSSMFiles(fastafile, outfileprefix='', dbName='swissprot'):
    """Write ``<outfileprefix><record id>.txt`` for every FASTA record.

    Each record is staged in ``input.fasta`` and searched with PSI-BLAST
    (E-value 0.001, 3 iterations) against ``dbName``; any existing output
    file is deleted first.  When PSI-BLAST leaves no PSSM file behind, a
    substitute is written: two header lines, the module-level ``alphabet``,
    then one line per residue with the ``blosumMatrix`` row selected by
    ``alphabet.find(residue)``.
    """
    query_fasta = 'input.fasta'

    for record in SeqIO.parse(fastafile, 'fasta'):
        print('\r{} '.format(record.id), end="")

        # Stage the single-sequence query.
        if os.path.exists(query_fasta):
            os.remove(query_fasta)
        SeqIO.write(record, query_fasta, 'fasta')

        # Start from a clean output so its existence proves success below.
        pssm_path = "".join((outfileprefix, record.id, '.txt'))
        if os.path.exists(pssm_path):
            os.remove(pssm_path)

        run_blast = NcbipsiblastCommandline(query=query_fasta,
                                            db=dbName,
                                            evalue=0.001,
                                            num_iterations=3,
                                            out_ascii_pssm=pssm_path)
        run_blast()

        if os.path.exists(pssm_path):
            continue

        # No PSSM was produced: fall back to rows of the substitution matrix.
        print('\r{} does not have pssm'.format(record.id))
        with open(pssm_path, 'w') as handle:
            handle.write("  \n")
            handle.write(
                "last position-specific scoring matrix computed, weighted \n")
            handle.write(alphabet + '\n')
            for position, residue in enumerate(record.seq, start=1):
                row = alphabet.find(residue)
                scores = "".join(
                    str(blosumMatrix[row][col]) + ' '
                    for col in range(len(alphabet)))
                handle.write(
                    str(position) + ' ' + residue + ' ' + scores + '\n')
示例#12
0
def upSampling_pssm(num_upsamp, upsamp_file, fpos, vrate, pssmdir):
    """Write ``num_upsamp`` synthetic sequences derived from PSSMs.

    Records of ``fpos`` are drawn at random (with replacement).  For each
    draw the record's PSSM is taken from ``<pssmdir>/<id>_pssm.txt``,
    running PSI-BLAST against ``Swissprot`` when the file is missing, and
    ``genSeq(sequence, pssm, vrate)`` produces the new sequence.  The
    results, with ids ``fake0``, ``fake1``, ..., are written to
    ``upsamp_file`` in FASTA format.

    Args:
        num_upsamp: Number of synthetic records to generate; values <= 0
            produce an empty output file.
        upsamp_file: Destination FASTA file.
        fpos: FASTA file with the source sequences.
        vrate: Passed through to ``genSeq``.
        pssmdir: PSSM cache directory, created if missing.

    Raises:
        ValueError: If records are requested but ``fpos`` contains none.
        RuntimeError: If PSI-BLAST produced no PSSM for any source record,
            which previously made the loop spin forever.
    """
    seq_records = list(SeqIO.parse(fpos, 'fasta'))
    total = len(seq_records)
    if num_upsamp > 0 and total == 0:
        raise ValueError("no sequences found in {!r}".format(fpos))

    proteins = []
    without_pssm = set()  # indices PSI-BLAST could not build a PSSM for
    inputfile = 'input.fasta'
    if not os.path.exists(pssmdir):
        os.mkdir(pssmdir)

    # Bounded by the number of results, so num_upsamp <= 0 terminates.
    while len(proteins) < num_upsamp:
        k = np.random.randint(0, total)
        if k in without_pssm:
            continue
        seq_record = seq_records[k]

        pssmfile = os.path.join(pssmdir, seq_record.id + "_pssm.txt")
        if not os.path.exists(pssmfile):
            # psi-blast input file
            if os.path.exists(inputfile):
                os.remove(inputfile)
            SeqIO.write(seq_record, inputfile, 'fasta')
            psiblast_cline = NcbipsiblastCommandline(query=inputfile,
                                                     db='Swissprot',
                                                     evalue=0.001,
                                                     num_iterations=3,
                                                     out_ascii_pssm=pssmfile)
            psiblast_cline()

            # PSI-BLAST did not construct a PSSM: draw another record.
            if not os.path.exists(pssmfile):
                without_pssm.add(k)
                if len(without_pssm) == total:
                    raise RuntimeError(
                        "PSI-BLAST produced no PSSM for any sequence in "
                        "{!r}".format(fpos))
                continue

        pssm = readPSSM(pssmfile)
        prot = genSeq(str(seq_record.seq), pssm, vrate)
        fake_seq = Seq(prot, IUPAC.ExtendedIUPACProtein)
        proteins.append(SeqRecord(fake_seq, id='fake' + str(len(proteins))))

        done = len(proteins)
        print("{:.2f}% ====> {}/{} finished".format(done / num_upsamp * 100,
                                                    done, num_upsamp))
    SeqIO.write(proteins, upsamp_file, 'fasta')
示例#13
0
def _psi_blast_pssm(input):
    """Run PSI-BLAST for one FASTA file and return its parsed PSSM.

    ``input`` is an indexable sequence whose first eight items are:
    file name, base path, BLAST database, E-value, number of iterations,
    sigmoid flag, FASTA sub-path and PSSM sub-path.  The query is
    ``path + tmp_fastas_path + file``; the ASCII PSSM is written to
    ``path + tmp_pssms_path + <file name up to its first '.'>``.

    Returns the result of ``_parse_pssm_ascii`` (passed through ``_sigmoid``
    when the flag is truthy), or ``None`` when no PSSM file was produced.
    """
    (file, path, blast_db_path, evalue, num_iterations, sigmoid,
     tmp_fastas_path, tmp_pssms_path) = (input[pos] for pos in range(8))

    query_path = path + tmp_fastas_path + file
    pssm_path = path + tmp_pssms_path + file.split('.')[0]

    # create and call command
    run_blast = NcbipsiblastCommandline(cmd='psiblast',
                                        query=query_path,
                                        db=blast_db_path,
                                        evalue=evalue,
                                        num_iterations=num_iterations,
                                        outfmt=5,
                                        num_threads=3,
                                        out_ascii_pssm=pssm_path)
    run_blast()

    if not os.path.isfile(pssm_path):
        # TODO: what to do in this case?! e.g. no sequences returned by psiblast due to too low evalue
        return None

    pssm = _parse_pssm_ascii(pssm_path)
    return _sigmoid(pssm) if sigmoid else pssm
示例#14
0
def generate_sequence_profiles(structural_alignment_path,
                               pssm_dir,
                               blastdb='uniref90'):
    """Create a PSI-BLAST ASCII PSSM for every domain named in the alignment.

    Record ids of the FASTA file are expected to look like
    ``<domain>&<domain>...``; both leading fields are collected.  For each
    distinct domain whose ``<pssm_dir>/<domain[2:4]>/<domain>.mtx`` does not
    exist yet, the sequence of the first record whose id starts with
    ``<domain>&`` is piped to psiblast (3 iterations, all CPUs) and the
    profile is written to that path.

    Args:
        structural_alignment_path: FASTA file indexed with ``SeqIO.index``.
        pssm_dir: Root directory of the PSSM files.
        blastdb: BLAST database name.

    Raises:
        IndexError: If a record id contains no ``&`` or a domain never
            appears as the first field of any record id.
    """
    seq_index = SeqIO.index(structural_alignment_path, 'fasta')

    # One pass over the index: collect the domains and remember, for each
    # leading domain, the first record carrying it (instead of rescanning
    # the whole index once per domain).
    domains = set()
    first_record_id = {}
    for record_id in seq_index:
        fields = record_id.split('&')
        domains.add(fields[0])
        domains.add(fields[1])
        first_record_id.setdefault(fields[0], record_id)

    for domain in tqdm(domains):
        pssm_path = Path(f'{pssm_dir}/{domain[2:4]}/{domain}.mtx')
        if pssm_path.exists():
            continue
        Path(f'{pssm_dir}/{domain[2:4]}').mkdir(parents=True, exist_ok=True)
        if domain not in first_record_id:
            raise IndexError(
                f'no record id starts with {domain}& in '
                f'{structural_alignment_path}')
        seq_record = seq_index[first_record_id[domain]]
        NcbipsiblastCommandline(
            db=blastdb,
            num_threads=os.cpu_count(),
            num_iterations=3,
            out_ascii_pssm=pssm_path.as_posix(),
            save_pssm_after_last_round=True)(stdin=str(seq_record.seq))
示例#15
0
def _gzip_file(path):
    """Compress ``path`` to ``path + '.gz'`` and delete the original, if present."""
    if not os.path.exists(path):
        return
    with open(path, "rb") as raw_in, gzip.open(path + ".gz", "wb") as gz_out:
        gz_out.writelines(raw_in)
    os.remove(path)


def _gunzip_file(path):
    """Expand ``path + '.gz'`` to ``path`` and delete the archive, if present."""
    gz_path = path + ".gz"
    if not os.path.exists(gz_path):
        return
    with gzip.open(gz_path, "rb") as gz_in, open(path, "wb") as raw_out:
        raw_out.write(gz_in.read())
    os.remove(gz_path)


def _remove_if_exists(path):
    """Delete ``path`` when it exists."""
    if os.path.exists(path):
        os.remove(path)


def BLAST_submission(task):
    """Run a single PSI-BLAST search

    Parameters
    ----------
    task : array
        List/array which contains the query data and the parameters for the
        BLAST search, in this order: protein_name, query, output_hit_file,
        output_pssm_file, evalue, hitsize, overwrite_results, database,
        remote_tag, compress_results.

    Notes
    -----
    With ``overwrite_results`` false, an existing non-empty hit file (raw or
    ``.gz``) means the search is skipped; the stored hit and PSSM files are
    only converted to the representation requested by ``compress_results``.
    With ``overwrite_results`` true, all four result files are deleted first.
    If psiblast writes anything to stdout or stderr, it is logged as a
    warning and the results are left uncompressed.

    Saved Files and Figures
    -----------------------
    PROTEIN_NAME.blast_result.xml : xml file containing PSI-BLAST hits
        (e.g. A2A2V5.blast_result.xml)
    PROTEIN_NAME.blast_result.pssm : pssm file containing PSI-BLAST similarity scoring matrix
        (e.g. A2A2V5.blast_result.pssm)
    """
    (protein_name, query, output_hit_file, output_pssm_file, evalue, hitsize,
     overwrite_results, database, remote_tag, compress_results) = task
    result_files = (output_hit_file, output_pssm_file)

    if not overwrite_results:
        # Compressed results present: skip the search, decompressing first
        # when raw files were requested.
        compressed_hits = output_hit_file + ".gz"
        if (os.path.exists(compressed_hits)
                and os.path.getsize(compressed_hits) > 0):
            if not compress_results:
                for result_file in result_files:
                    _gunzip_file(result_file)
            return
        # Raw results present: skip the search, compressing first when
        # compressed files were requested.
        if (os.path.exists(output_hit_file)
                and os.path.getsize(output_hit_file) > 0):
            if compress_results:
                for result_file in result_files:
                    _gzip_file(result_file)
            return
    else:
        # Remove raw and compressed leftovers.  The original referenced the
        # undefined name `output_file` here and raised NameError.
        for result_file in result_files:
            _remove_if_exists(result_file)
            _remove_if_exists(result_file + ".gz")

    # Run PSI-BLAST search; the query is supplied on stdin.
    psiblast_cline = NcbipsiblastCommandline('psiblast',
                                             db=database,
                                             evalue=evalue,
                                             max_target_seqs=hitsize,
                                             outfmt=5,
                                             out=output_hit_file,
                                             out_ascii_pssm=output_pssm_file,
                                             remote=remote_tag,
                                             inclusion_ethresh=evalue,
                                             num_iterations=3,
                                             use_sw_tback=True,
                                             seg="no")
    out, err = psiblast_cline(stdin=query)

    # Any console output is treated as a problem report.
    if out or err:
        logging.warning(out)
        logging.warning(err)
        return

    if compress_results:
        for result_file in result_files:
            _gzip_file(result_file)
示例#16
0
    selected.append(splited[0])
# Build a lookup from the first to the second tab-separated column of `inf`
# (`inf` is opened outside this excerpt).
AC_BIN_dict = {}
for line in inf:
    line = line.strip('\n')
    splited = line.split('\t')
    AC_BIN_dict[splited[0]] = splited[1]
outf = open(outf_name, "w")
# The per-item FASTA and log file names below are relative, so they are
# resolved against `sequences_dir` from here on.
os.chdir(sequences_dir)
print "Current directory changed into " + sequences_dir + "!"
percent_count = 0
for item in selected:
    outf.write(item + "\t")
    # Earlier variant, kept for reference:
    # cline = NcbipsiblastCommandline(query = item + ".fasta", db = "swissprot", max_hsps_per_subject = 1, num_threads = threads_count, outfmt = 0, out = "tmp.log")
    cline = NcbipsiblastCommandline(query=item + ".fasta",
                                    db="swissprot",
                                    num_threads=threads_count,
                                    outfmt=0,
                                    out=item + ".log")
    # Progress report roughly every 1% of the items.
    # NOTE(review): with Python 2 integer division (the print statements
    # imply Python 2) `len(selected) / 100` is 0 for fewer than 100 items and
    # the modulo raises ZeroDivisionError -- unless true division is enabled
    # elsewhere in the file; confirm.
    if percent_count % (len(selected) / 100) == 0:
        print str(percent_count / (len(selected) / 100)) + "% finished!"
        if percent_count == 0:
            print "Time elasped: Unknown"
            print "Time remaining: Unknown"
        else:
            # Elapsed wall-clock time split into hours, minutes and seconds.
            m, s = divmod(time.time() - start_time, 60)
            h, m = divmod(m, 60)
            print "Time elasped: %d:%02d:%02d" % (h, m, s)
            del h, m, s
            # Remaining time: elapsed time scaled by total/processed items,
            # minus the elapsed time itself.
            m, s = divmod(
                (float(len(selected)) / percent_count) *
                (time.time() - start_time) - (time.time() - start_time), 60)
示例#17
0
    def get_pssm(self,
                 fasta_dir='fasta/',
                 outdir='pssm_raw/',
                 num_iterations=3,
                 run=True):
        """Compute the PSSM files

        Every file in ``<self.caseID>/<fasta_dir>`` is used as a PSI-BLAST
        query (executable ``self.blast``, database ``self.db``); the final
        ASCII PSSM is stored as ``<self.caseID>/<outdir>/<name>.pssm`` and
        all intermediate files are removed afterwards.

        Args:
            fasta_dir (str, optional): directory where the fasta queries are stored
            outdir (str, optional): output directory where to store the pssm files
            num_iterations (int, optional): number of iterations for the blast calculations
            run (bool, optional): if False, the command lines are only built
                and validated, psiblast is not executed
        """

        fasta_dir = os.path.join(self.caseID, fasta_dir)
        outdir = os.path.join(self.caseID, outdir)
        if not os.path.isdir(outdir):
            os.mkdir(outdir)

        for q in os.listdir(fasta_dir):

            # get the fasta query
            query = os.path.join(fasta_dir, q)
            name = os.path.splitext(os.path.basename(query))[0]

            # set up the output names
            out_ascii_pssm = os.path.join(outdir, name + '.pssm')
            out_pssm = os.path.join(outdir, name + '.cptpssm')
            out_xml = os.path.join(outdir, name + '.xml')

            # get the parameters
            blast_param = self._get_psiblast_parameters(query)

            # set up the psiblast calculation
            psi_cline = NcbipsiblastCommandline(
                self.blast,
                db=self.db,
                query=query,
                evalue=0.0001,
                word_size=blast_param['wordSize'],
                gapopen=blast_param['gapOpen'],
                gapextend=blast_param['gapExtend'],
                matrix=blast_param['scoringMatrix'],
                outfmt=7,
                comp_based_stats='T',
                max_target_seqs=2000,
                save_each_pssm=True,
                num_iterations=num_iterations,
                save_pssm_after_last_round=True,
                out_ascii_pssm=out_ascii_pssm,
                out_pssm=out_pssm,
                out=out_xml)

            # check that it's correct
            psi_cline._validate()

            if not run:
                continue

            # run the blast query
            psi_cline()

            # Copy the PSSM of the last round to its final name.  The search
            # can converge before `num_iterations` rounds, in which case the
            # expected file is missing; fall back to the highest round that
            # was actually written.
            final_round = out_ascii_pssm + '.%d' % num_iterations
            if not os.path.exists(final_round):
                written = {}
                for filename in glob.glob(out_ascii_pssm + '.*'):
                    suffix = filename.rsplit('.', 1)[1]
                    if suffix.isdigit():
                        written[int(suffix)] = filename
                if written:
                    final_round = written[max(written)]
            shutil.copy2(final_round, out_ascii_pssm)

            # remove all the other files
            for filename in glob.glob(out_pssm + '.*'):
                os.remove(filename)
            for filename in glob.glob(out_ascii_pssm + '.*'):
                os.remove(filename)
            os.remove(out_xml)
示例#18
0
from Bio.Blast.Applications import NcbipsiblastCommandline
import re
# Script settings.
feature_num = 20  # upper bound compared against `count` in the parsing loop
threads_count = 4  # passed to psiblast as num_threads
input_filename = "input.fasta"
output_filename = "test_features.npy"
inf_name = "../AC_BIN.dat"
# NOTE(review): `inf` is not closed in the visible part of the script.
inf = open(inf_name, "r")
# Lookup from the first to the second tab-separated column of AC_BIN.dat.
AC_BIN_dict = {}
for line in inf:
    line = line.strip('\n')
    splited = line.split('\t')
    AC_BIN_dict[splited[0]] = splited[1]
# Search swissprot with the input sequence; the report (outfmt 0) is
# written to temp.log and parsed line by line below.
cline = NcbipsiblastCommandline(query=input_filename,
                                db="swissprot",
                                num_threads=threads_count,
                                outfmt=0,
                                out="temp.log")
stdout, stderr = cline()
blastf = open("temp.log", "r")
outf = open(output_filename, "w")
AC = ""
expect = ""
count = 0
for line in blastf:
    line = line.strip("\n")
    # Match report lines that start with "sp|<accession>.<x>|<name>" followed
    # by "RecName:", a description and two trailing numeric columns.
    # Group 1 is the accession, group 3 the last column.
    # NOTE(review): the '.' in `[0-9]+.[0-9]+` is unescaped and matches any
    # character -- confirm whether a literal decimal point was intended.
    matched = re.match(
        r'^sp\|(\w+)\.\w+\|\w+?\s+RecName:\s+.+?\s+[0-9]+(\.[0-9])?\s+([0-9]+.[0-9]+|[0-9]+e-[0-9]+)\s*$',
        line)
    if matched:
        if count < feature_num:
示例#19
0
    def searchPSIBLAST(self, id, psiblast):
        """Psi-Blast over a local database or over the internet.

        The query is read from ``<self.dirname><id>.fa`` and the BLAST XML
        result is written to ``<self.dirname><id>.xml``.  Afterwards the
        query ends up as ``<id>.fasta``: the ``.fa`` file is renamed, or
        simply deleted when a ``.fasta`` file already exists.

        Args:
            id: Base name (without extension) of the query inside
                ``self.dirname``.
            psiblast: ``"local"`` runs the psiblast executable against
                ``refseq_protein``; any other value submits the query to
                NCBI through ``NCBIWWW.qblast``.

        Raises:
            ValueError: In remote mode, if the query file holds no sequence.
        """
        in_sequence = self.dirname + id + ".fa"
        fasta_copy = self.dirname + id + ".fasta"
        output = self.dirname + id + ".xml"
        reference_protein = "refseq_protein"
        run_locally = psiblast == "local"
        result_handle = None

        if run_locally:
            threads = LP(self.parameterfile, "psiblast_threading")
            evalue = LP(self.parameterfile, "psiblast_evalue")
            # The configured value is an E-value cut-off, so it belongs in
            # `evalue`; psiblast's `threshold` option is the word-score
            # threshold of the lookup table, not an expectation value.
            options = dict(query=in_sequence,
                           db=reference_protein,
                           outfmt=5,
                           evalue=evalue,
                           out=output)

            num_threads = None
            if not threads == False:  # presumably LP yields False when unset
                try:
                    num_threads = int(threads)
                except (TypeError, ValueError):
                    num_threads = None

            if num_threads is None:
                NcbipsiblastCommandline(**options)()
            else:
                try:
                    NcbipsiblastCommandline(num_threads=num_threads,
                                            **options)()
                except Exception:
                    # Best effort kept from the original: retry once
                    # without the threading option.
                    NcbipsiblastCommandline(**options)()
        else:
            evalue = LP(self.parameterfile, "psiblast_evalue")
            # Only the result of the last record is saved (as before);
            # superseded handles are closed instead of being leaked.
            for seq_record in SeqIO.parse(in_sequence, "fasta",
                                          IUPAC.protein):
                if result_handle is not None:
                    result_handle.close()
                result_handle = NCBIWWW.qblast("blastp",
                                               reference_protein,
                                               seq_record.seq,
                                               service="psi",
                                               expect=evalue,
                                               hitlist_size=500)

        # Keep an existing .fasta and drop the .fa, otherwise rename .fa.
        # (The original called `open.close()`, which always raised, so the
        # rename branch ran unconditionally.)
        try:
            with open(fasta_copy):
                pass
        except IOError:
            move(in_sequence, fasta_copy)
        else:
            remove(in_sequence)

        if not run_locally:
            if result_handle is None:
                raise ValueError("no sequence found in " + in_sequence)
            with open(output, "w") as saveblast:
                saveblast.write(result_handle.read())
            result_handle.close()
示例#20
0
import pandas as pd
import sys

#~~~~~~~~~~~~~~~~~~~~~~~~~~
# DEFINING SOME VARIABLES:
#~~~~~~~~~~~~~~~~~~~~~~~~~~

# First command-line argument: the query name used to derive all file names.
query = sys.argv[1]
pssm = query + '-pssm.asn1'  # PSSM checkpoint passed to psiblast as in_pssm
output = 'psiblast-' + query + '.tsv'
# NOTE(review): the format string carries literal single quotes and
# surrounding blanks; confirm psiblast accepts it in this form.
out_format = " '6 qaccver saccver bitscore evalue qlen slen length qcovs pident staxids sscinames' "  # blastp output cols
out_columns = out_format[
    4:-2]  # drops the leading " '6 " and the trailing "' ", leaving only the column names
# NOTE(review): '~' is not expanded in this script itself.
database = '~/datalocal/nr_db/nr'
e_cut = 1e-3  # E-value cut-off

#~~~~~~~~~~~~~~~~~~~~~
# RUN 1 BLASTP ROUND:
#~~~~~~~~~~~~~~~~~~~~~

print('about to run')
# Build the psiblast command line (one iteration, starting from the PSSM)
# according to the settings above.
psiblast_cline = NcbipsiblastCommandline(in_pssm=pssm,
                                         num_iterations='1',
                                         out=output,
                                         outfmt=out_format,
                                         db=database,
                                         evalue=e_cut)
# NOTE(review): the command is only printed; nothing in the visible code
# executes it, although 'psiblast done' is reported right after.
print(psiblast_cline)
print('psiblast done')
示例#21
0
import os, sys, glob, subprocess, shutil
from os.path import dirname, abspath, isdir
import pandas as pd
from Bio.Blast.Applications import NcbipsiblastCommandline
from Bio import SeqIO

#~~~~~~~~~~~~~~~~~~~~~~~~~~
# DEFINE NAMES OF FILES :
#~~~~~~~~~~~~~~~~~~~~~~~~~~

query = sys.argv[1]  # name of the query sequence ('PflA' or 'PflB')

# Two levels above this script's own directory.
parentdir = dirname(dirname(abspath(__file__)))
query_fa = parentdir + '/1_bidirectional-best-hits/Cj' + query + '.faa'  # FASTA of the original query
blastdb = query + '-db/' + query + '-hits'  # BLAST database, relative to the working directory

pssm = query + '-pssm.asn1'  # checkpoint file written by psiblast (out_pssm)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# BUILD THE PSSM WITH PSI-BLAST:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

psiblast_cline = NcbipsiblastCommandline(
    query=query_fa, db=blastdb, num_iterations='2', out_pssm=pssm
)  # search the database with the original query (CjPflA or CjPflB) and save the PSSM; NOTE(review): earlier comments said "one iteration" but num_iterations is '2' -- confirm which is intended
stdout, stderr = psiblast_cline(
)  # run psiblast; the PSSM is written to the `pssm` path
示例#22
0
def blast(algo):
    """Benchmark a BLAST-family search with one ROC AUC per superfamily.

    For every key of the pickled hierarchy
    (``data/train/scop40_1fold_hie.pkl``) that has at least one member, a
    member is drawn at random, written to a temporary FASTA file under
    ``.{algo}/`` and searched against the
    ``astral-scopdom-seqres-gd-sel-gs-bib-40-1.75`` database. The first 500
    hits of the last result set in the XML report are labelled positive when
    the sccs in the hit description, with its last dot-separated field
    removed, equals the sccs of the query's superfamily node. The collected
    scores are pickled to ``auc_result_{algo}_{unix time}.pkl`` in the
    current directory.

    Args:
        algo: ``'psiblast'`` or ``'deltablast'``.

    Raises:
        ValueError: if ``algo`` is not one of the supported names.
    """
    commandlines = {
        'psiblast': NcbipsiblastCommandline,
        'deltablast': NcbideltablastCommandline,
    }
    # Fail fast instead of discovering the bad name inside the search loop.
    if algo not in commandlines:
        raise ValueError(f'Invalid algorithm ({algo})')
    commandline = commandlines[algo]

    seqindex = SeqIO.index('data/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa',
                           'fasta', key_function=lambda x: x.split()[0])
    with Path('data/train/scop40_1fold_hie.pkl').open('rb') as hie_file:
        hie = pickle.load(hie_file)
    scop = Scop(dir_path='data/scop', version='1.75')
    tmpdir = Path(f'.{algo}')
    tmpdir.mkdir(exist_ok=True)
    # os.cpu_count() may return None; fall back to a single thread then.
    num_threads = os.cpu_count() or 1
    auc_result = {}
    for sf in tqdm(hie):
        px_list = hie[sf]
        if len(px_list) < 1:
            continue
        sid = random.sample(px_list, 1)[0]
        record = seqindex[sid]
        f_fasta = tmpdir / f'{sid}.fasta'
        f_xml = tmpdir / f'{sid}.xml'
        SeqIO.write(record, f_fasta.as_posix(), 'fasta')
        try:
            commandline(query=f_fasta.as_posix(),
                        db='astral-scopdom-seqres-gd-sel-gs-bib-40-1.75',
                        num_threads=num_threads,
                        num_iterations=3,
                        evalue=999999,
                        outfmt=5,
                        out=f_xml.as_posix())()
        except ApplicationError as e:
            logging.error(e)
            # A failed run may not have produced any report at all.
            if f_xml.exists():
                f_xml.unlink()
            continue
        finally:
            f_fasta.unlink()

        iterations = list(SearchIO.parse(f_xml.as_posix(), 'blast-xml'))
        hits = list(iterations[-1])[:500]
        sf_sccs = scop.getNodeBySunid(sf).sccs
        roc_score = [-hit.hsps[0].evalue for hit in hits]
        # Strip the last dot-separated field by splitting on '.', so that
        # multi-digit trailing numbers are removed completely (a fixed
        # two-character slice would leave e.g. 'b.1.1.' for 'b.1.1.10').
        roc_label = [
            int('.'.join(hit.description.split(' ')[0].split('.')[:-1])
                == sf_sccs)
            for hit in hits
        ]
        # roc_auc_score needs both classes, so single-class label sets are
        # scored directly. NOTE(review): an empty hit list also lands in the
        # first branch and is scored 1.0, as before -- confirm this is wanted.
        if all(roc_label):
            auc = 1.0
        elif not any(roc_label):
            auc = 0.0
        else:
            auc = metrics.roc_auc_score(roc_label, roc_score)
        auc_result[sf_sccs] = {'auc': auc, 'sample': sid, 'num': len(hits)}
        f_xml.unlink()
    now = int(time.time())
    with Path(f'auc_result_{algo}_{now}.pkl').open('wb') as out_file:
        pickle.dump(auc_result, out_file)
示例#23
0
    def searchPSIBLAST(self, id, psiblast):
        """Psi-Blast over a local database or over the internet.

        The query is read from ``./Data/<id>.fa`` and the XML report is
        written to ``./Data/<id>.xml``. Afterwards a single copy of the query
        is kept under ``./Data/<id>.fasta``: if that file already exists the
        ``.fa`` file is deleted, otherwise the ``.fa`` file is renamed.

        Args:
            id: Base name (without extension) of the query file in ``./Data``.
            psiblast: ``"local"`` runs the stand-alone psiblast program
                against ``refseq_protein``; any other value submits the
                query through ``NCBIWWW.qblast``.

        Raises:
            ValueError: in remote mode, if the query file holds no records.
        """
        fa_path = "./Data/" + id + ".fa"
        fasta_path = "./Data/" + id + ".fasta"
        output = "./Data/" + id + ".xml"

        # edit psiblast_evalue at Parameters.py
        evalue = psiblast_evalue
        reference_protein = "refseq_protein"

        def keep_single_query_copy():
            # Probe for an existing .fasta. The previous probe called
            # open.close(), which always raised, so the .fa file was always
            # moved over the .fasta file and the handle was leaked.
            try:
                with open(fasta_path):
                    pass
            except OSError:
                move(fa_path, fasta_path)
            else:
                remove(fa_path)

        if psiblast == "local":
            # The cut-off is an e-value, so it is passed as `evalue`;
            # `threshold` is psiblast's word-score threshold.
            options = dict(query=fa_path,
                           db=reference_protein,
                           outfmt=5,
                           evalue=evalue,
                           out=output)
            # psiblast_threading may be False, a number or a numeric string;
            # num_threads is only passed for a positive thread count.
            try:
                threads = int(psiblast_threading)
            except (TypeError, ValueError):
                threads = 0
            if threads > 0:
                options["num_threads"] = threads
            NcbipsiblastCommandline(**options)()

            keep_single_query_copy()
        else:
            # Only the result for the last record was ever saved, so only
            # that record is submitted.
            sequence = None
            for seq_record in SeqIO.parse(fa_path, "fasta", IUPAC.protein):
                sequence = seq_record.seq
            if sequence is None:
                raise ValueError("No sequence records found in " + fa_path)

            result_handle = NCBIWWW.qblast("blastp",
                                           reference_protein,
                                           sequence,
                                           service="psi",
                                           expect=evalue,
                                           hitlist_size=500)
            try:
                keep_single_query_copy()
                with open(output, "w") as saveblast:
                    saveblast.write(result_handle.read())
            finally:
                result_handle.close()
def _read_pssm_profile(pssm_path, structure):
    """Turn an ASCII PSSM written by psiblast into a profile DataFrame.

    ``structure`` must already carry one leading padding character: the
    first line that is read is the column-header line, and it consumes
    ``structure[0]``.
    """
    rows = []
    with open(pssm_path, 'r') as pssm_file:
        # Skip the two lines that precede the column-header line.
        pssm_file.readline()
        pssm_file.readline()
        for position, raw_line in enumerate(pssm_file):
            fields = raw_line.split()
            if not fields:
                # The first blank line ends the matrix.
                break
            fields.append(structure[position])
            if position == 0:
                # Pad the header line with two empty cells at the front.
                fields[:0] = ['', '']
            rows.append(fields)

    profile = pd.DataFrame(rows)
    # Drop columns 2..21, then the second- and third-to-last columns, then
    # the first column (positional drops kept exactly as before).
    profile.drop(profile.columns[2:22], axis=1, inplace=True)
    profile.drop(profile.columns[-3:-1], axis=1, inplace=True)
    profile.drop(profile.columns[0], axis=1, inplace=True)
    # Promote the header row to column labels.
    profile.columns = profile.iloc[0]
    profile = profile[1:]
    profile = profile.rename(columns={profile.columns[0]: "Sequence"})
    profile = profile.rename(columns={profile.columns[-1]: "Structure"})
    profile = profile[
        ['Structure'] +
        [col for col in profile.columns if col != 'Structure']]
    profile.loc[:, 'A':'V'] = profile.loc[:, 'A':'V'].astype(
        float).divide(100)
    return profile


def generate_profiles(in_dataframe, out_path):
    """Generate one PSI-BLAST profile file per row of ``in_dataframe``.

    Intended to be used internally. The Swiss-Prot FASTA under
    ``../data/swiss-prot`` is unpacked and indexed, each row's sequence is
    searched with psiblast, and the resulting profile is written to
    ``<out_path>/profile/<name>.profile`` as a tab-separated table. Rows for
    which psiblast writes no PSSM file are removed from ``in_dataframe`` (in
    place), and the remaining frame is dumped to ``<out_path>``.

    Args:
        in_dataframe: Frame with ``Sequence`` and ``Structure`` columns.
            Rows whose index label is a tuple are named
            ``<label[0]>_<label[1]>``; other rows are named by their label.
        out_path: Directory that receives the profiles and the dump.
    """
    out_path = Path(out_path)
    dataset = in_dataframe
    s = Sultan()

    print('Unpacking and generating Uniprot DB.')
    s.gunzip('-fk ../data/swiss-prot/uniprot_sprot.fasta.gz').run()
    cmd = NcbimakeblastdbCommandline(
        input_file='../data/swiss-prot/uniprot_sprot.fasta', dbtype='prot')
    cmd()
    profile_dir = out_path / 'profile'
    profile_dir.mkdir(parents=True, exist_ok=True)

    dump_path = None  # stays None when the frame has no rows
    without_profile = []  # index labels of rows for which no PSSM appeared
    with TemporaryDirectory() as psi_temp:
        for _, sample in tqdm(dataset.iterrows(),
                              total=len(dataset),
                              desc='Generating profiles'):
            if isinstance(sample.name, tuple):
                sample_id, chain = sample.name[0], sample.name[1]
                out_name = f'{sample_id}_{chain}'
                dump_path = out_path / 'full_test_summary.joblib'
            else:
                sample_id = sample.name
                out_name = sample_id
                dump_path = out_path / 'jpred_summary.joblib'

            sequence, structure = sample[['Sequence', 'Structure']]
            pssm_path = f'{psi_temp}/{out_name}.pssm'
            with NamedTemporaryFile(mode='w') as blast_in:
                print(f'>{out_name}', file=blast_in)
                print(sequence, file=blast_in)
                # psiblast reads the file by name, so the buffered text has
                # to be on disk before the command runs.
                blast_in.flush()
                cmd = NcbipsiblastCommandline(
                    query=blast_in.name,
                    db='../data/swiss-prot/uniprot_sprot.fasta',
                    evalue=0.01,
                    num_iterations=3,
                    out_ascii_pssm=pssm_path,
                    num_descriptions=10000,
                    num_alignments=10000,
                    num_threads=8)
                cmd()

            if not os.path.exists(pssm_path):
                tqdm.write(
                    f'Unable to generate profile for {out_name}. No hits in the database.'
                )
                # Dropped after the loop so that the frame is not modified
                # while it is being iterated.
                without_profile.append(sample.name)
                continue

            profile = _read_pssm_profile(pssm_path, ' ' + structure)
            profile.to_csv(profile_dir / (out_name + '.profile'),
                           sep='\t',
                           index=False)

    if without_profile:
        dataset.drop(index=without_profile, inplace=True)
    if dump_path is None:
        # Empty input: there is no row to decide the dump file name from.
        print('No samples to process; nothing was dumped.')
        return
    print(
        f'Dumping clean test to {dump_path}. Profiles are generated in {out_path}/profile'
    )
    dump(dataset, dump_path)
示例#25
0
def blast(query, algo, iter, key_name, roc_top, result_dict_cover,
          blast_ranking):
    """Search one test query and summarise the ranking of its hits.

    The XML report is cached as ``.{algo}_iter{iter}/{query}.xml``; the
    search only runs when that file does not exist yet. Hits of the last
    result set in the report are labelled positive when the sccs in their
    description, with the last dot-separated field removed, equals the sccs
    of the query's superfamily.

    Args:
        query: Domain id; ``data/test/{query}.fasta`` (or ``.pssm`` for
            ``'psiblast_pssm'``) is used as the search input.
        algo: ``'psiblast'``, ``'deltablast'`` or ``'psiblast_pssm'``.
        iter: Number of search iterations.
        key_name: Key under which results are stored in the two dicts below.
        roc_top: Number of negatives after which labelling stops; also the
            number of points on the returned curve.
        result_dict_cover: ``result_dict_cover[key_name]`` receives, for
            every positive hit, the ungapped length of the first HSP's query
            sequence divided by the query length.
        blast_ranking: ``blast_ranking[key_name][query]`` is set to a list of
            ``{hit id: ('', '', e-value of first HSP)}`` for all hits.

    Returns:
        ``(fpc, tpc, auc)``: the two curve coordinate lists and the area
        under them divided by ``roc_top`` and by the superfamily size.

    Raises:
        ValueError: if ``algo`` is unknown and no cached report exists.
        AssertionError: if the report holds fewer than ``roc_top`` hits.
    """
    tmpdir = Path(f'.{algo}_iter{iter}')
    tmpdir.mkdir(exist_ok=True)
    f_fasta = Path(f'data/test/{query}.fasta')
    f_xml = tmpdir / f'{query}.xml'
    if not f_xml.exists():
        # Options shared by all three search variants.
        search_options = dict(
            db='data/blastdb/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75_train_only',
            # os.cpu_count() may return None; use one thread in that case.
            num_threads=os.cpu_count() or 1,
            num_iterations=iter,
            max_target_seqs=500,
            evalue=999999,
            outfmt=5,
            out=f_xml.as_posix())
        if algo == 'psiblast':
            NcbipsiblastCommandline(query=f_fasta.as_posix(),
                                    **search_options)()
        elif algo == 'deltablast':
            NcbideltablastCommandline(query=f_fasta.as_posix(),
                                      rpsdb='data/blastdb/cdd_delta',
                                      **search_options)()
        elif algo == 'psiblast_pssm':
            f_pssm = Path(f'data/test/{query}.pssm')
            NcbipsiblastCommandline(in_pssm=f_pssm.as_posix(),
                                    **search_options)()
        else:
            raise ValueError(f'Invalid algorithm ({algo})')

    iterations = list(SearchIO.parse(f_xml.as_posix(), 'blast-xml'))
    results = list(iterations[-1])  # final iteration
    superfamily = scop100.getDomainBySid(query).getAscendent('sf')
    sf_sccs = superfamily.sccs
    all_tp_count = len(hie[superfamily.sunid])
    assert len(results) >= roc_top

    blast_ranking[key_name][query] = [
        {result.id: ('', '', result.hsps[0].evalue)} for result in results
    ]

    roc_label = []
    negatives = 0
    query_length = None  # parsed lazily, at most once
    for result in results:
        result_sf_sccs = '.'.join(
            result.description.split(' ')[0].split('.')[:-1])
        if result_sf_sccs == sf_sccs:
            roc_label.append(1)
            if query_length is None:
                # Only read the FASTA when a positive hit needs it, and do
                # not re-parse it for every further positive.
                query_length = len(
                    SeqIO.read(f_fasta.as_posix(), 'fasta').seq)
            result_dict_cover[key_name].append(
                len(result.hsps[0].query.seq.ungap('-')) / query_length)
        else:
            roc_label.append(0)
            negatives += 1
        if negatives == roc_top:
            break

    # NOTE(review): the x coordinate appended below is the rank index, not a
    # running count of negatives -- confirm this is the intended curve.
    fpc, tpc = [0], [0]
    for i in range(0, roc_top):
        fpc.append(i)
        tpc.append(roc_label[:i + 1].count(1))
    auc = metrics.auc(fpc, tpc) / roc_top / all_tp_count
    return fpc, tpc, auc
示例#26
0
2  # -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""

from Bio.Blast.Applications import NcbipsiblastCommandline
from Bio import SeqIO
import os

# Scratch FASTA file that is rewritten for every record of the input set.
inputfile = 'input.fasta'

for seq_record in SeqIO.parse('PDNA-224.fasta', 'fasta'):
    print(seq_record.id, 'is calculating pssm')

    # Name of the ASCII PSSM that psiblast writes for this record.
    pssmfile = 'pssm_' + seq_record.id + '.txt'

    # Replace the scratch file left over from the previous record.
    if os.path.exists(inputfile):
        os.remove(inputfile)
    SeqIO.write(seq_record, inputfile, 'fasta')

    # Three psiblast iterations against swissprot with an e-value of 0.001.
    psiblast_cline = NcbipsiblastCommandline(
        db='swissprot',
        query=inputfile,
        num_iterations=3,
        evalue=0.001,
        out_ascii_pssm=pssmfile,
    )
    stdout, stderr = psiblast_cline()
示例#27
0
                    if REMOTE:
                        db = BLAST_CONFIG['default_db']
                        kwargs = {}
                    else:
                        db = config['paths']['blast']['dbs'][
                            BLAST_CONFIG['default_db']]
                        kwargs = {
                            'num_iterations': BLAST_CONFIG['num_iterations'],
                            'num_threads': BLAST_CONFIG['num_threads']
                        }

                    cline = NcbipsiblastCommandline(
                        cmd=os.path.join(BLAST_PATH, 'bin', 'psiblast'),
                        query=in_file.name,
                        db=db,
                        num_alignments=1,
                        out_ascii_pssm=out_file_name,
                        remote=REMOTE,
                        **kwargs)
                    print(cline)
                    stdout, stderr = cline()

            # Note: It's possible that out_file_name doesn't get generated at all, if there are no hits found.
            # In this case, generate an empty .pssm file so we don't try to regenerate this file the next time this
            # gene/protein is encountered. We'll also have to check for 0 sized files to make this work of course.
            open(out_file_name, 'a').close(
            )  # no-op if file exists, creates an empty file if it doesn't

            # Copy all 'valid' .pssm files (with size>0) to the output folder for downstream processing.
            if os.path.getsize(out_file_name) > 0:
                shutil.copyfile(out_file_name,
示例#28
0
def __generate_sequence_profiles_old():
    """Write ASCII PSSM (``.mtx``) files with psiblast for two sets of ids.

    First, ``data/scop40_scopdom_pdbatom_seq.fasta`` is rebuilt from
    ``data/scop40_structural_alignment.fasta`` (gap characters removed, ids
    cut at the first ``&``) and a PSSM is generated for every record in it.
    Second, the same is done for every id in
    ``data/one_domain_superfamily.pkl``, using the sequence that TMalign
    reports when the domain's ``.ent`` file is aligned with itself.

    PSSMs go to ``data/pssm_deltablast/<sid[2:4]>/<sid>.mtx``; ids whose file
    already exists are skipped. Failures are logged and the id is skipped.
    """
    # NOTE(review): the directory is named after deltablast, but the searches
    # below are run with NcbipsiblastCommandline -- confirm the naming.
    mtx_dir_name = 'pssm_deltablast'
    DB_INDEX = SeqIO.index('data/scop40_structural_alignment.fasta', 'fasta')
    records = {}
    for i in DB_INDEX:
        # Records sharing the text before the first '&' collapse onto one
        # key; the last one seen wins.
        domkey = i.split('&')[0]
        records[domkey] = SeqRecord(DB_INDEX[i].seq.ungap('-'),
                                    id=domkey,
                                    name='',
                                    description='')
    with Path('data/scop40_scopdom_pdbatom_seq.fasta').open('w') as f:
        SeqIO.write(records.values(), f, 'fasta')

    # Re-index the ungapped FASTA that was just written.
    DB_INDEX = SeqIO.index('data/scop40_scopdom_pdbatom_seq.fasta', 'fasta')
    for sid in tqdm(list(DB_INDEX)):
        # Output is sharded by characters 2-3 of the id.
        mtx_dir = Path(f'data/{mtx_dir_name}/{sid[2:4]}')
        mtx_dir.mkdir(exist_ok=True, parents=True)
        mtx_file = mtx_dir / f'{sid}.mtx'
        if mtx_file.exists():
            logging.debug(f'PSSM already exists: {mtx_file}')
            continue
        try:
            # The query is written to '<sid>.fasta' in the current working
            # directory and removed again in the finally clause.
            SeqIO.write(DB_INDEX[sid], f'{sid}.fasta', 'fasta')
            NcbipsiblastCommandline(query=f'{sid}.fasta',
                                    db='uniref90',
                                    num_threads=int(os.cpu_count()),
                                    num_iterations=3,
                                    out_ascii_pssm=mtx_file.as_posix(),
                                    save_pssm_after_last_round=True)()
        except Exception as e:
            # Best effort: log the failure and move on to the next id.
            logging.exception(e)
            continue
        finally:
            if Path(f'{sid}.fasta').exists():
                Path(f'{sid}.fasta').unlink()

    logging.info('')
    # Second pass over the ids stored in the pickle file.
    for sid in tqdm(
            pickle.load(Path('data/one_domain_superfamily.pkl').open('rb'))):
        mtx_dir = Path(f'data/{mtx_dir_name}/{sid[2:4]}')
        mtx_dir.mkdir(exist_ok=True, parents=True)
        mtx_file = mtx_dir / f'{sid}.mtx'
        if mtx_file.exists():
            logging.debug(f'PSSM already exists: {mtx_file}')
            continue
        try:
            # The same .ent file is passed as both inputs; the first sequence
            # of the resulting alignment is used as the query and must not
            # contain gap characters.
            tmalign = TMalignCommandLine(f'data/scop_e/{sid[2:4]}/{sid}.ent',
                                         f'data/scop_e/{sid[2:4]}/{sid}.ent')
            tmalign.run()
            assert str(tmalign.alignment[0].seq).find('-') == -1
            SeqIO.write(tmalign.alignment[0], f'{sid}.fasta', 'fasta')
            NcbipsiblastCommandline(query=f'{sid}.fasta',
                                    db='uniref90',
                                    num_threads=int(os.cpu_count()),
                                    num_iterations=3,
                                    out_ascii_pssm=mtx_file.as_posix(),
                                    save_pssm_after_last_round=True)()
        except Exception as e:
            # Also covers a failed gap assertion above.
            logging.error(f'sid={sid}')
            logging.exception(e)
            continue
        finally:
            if Path(f'{sid}.fasta').exists():
                Path(f'{sid}.fasta').unlink()