Пример #1
0
def BLAST(query, subject, debug=False):
    """Align ``query`` against ``subject`` with blastn and return the best hit's span.

    Both sequences are written to ``seq1.fasta`` / ``seq2.fasta`` in the
    current working directory (the files are left in place), blastn is run
    with XML output, and the coordinates of the first HSP of the first
    alignment that has one are extracted.

    Args:
        query: Query nucleotide sequence as a string.
        subject: Subject nucleotide sequence as a string.
        debug: When true, drop into ``pdb`` once the coordinates are known.

    Returns:
        Tuple ``(qstart, qstop, sstart, sstop)`` with start <= stop in each
        pair, or ``(-9, -9, -9, -9)`` when BLAST reports no hit.
    """
    # NOTE(review): the fixed file names mean concurrent calls in the same
    # working directory overwrite each other's input.
    SeqIO.write(SeqRecord(Seq(query), id="seq1"), "seq1.fasta", "fasta")
    SeqIO.write(SeqRecord(Seq(subject), id="seq2"), "seq2.fasta", "fasta")

    # Run BLAST and parse the output as XML (outfmt=5).
    blast_options = dict(query="seq1.fasta",
                         subject="seq2.fasta",
                         outfmt=5,
                         max_target_seqs=1)
    if len(query) < 200:
        # Short queries use the blastn-short task.
        blast_options["task"] = "blastn-short"
    output = NcbiblastnCommandline(**blast_options)()[0]
    result = NCBIXML.read(StringIO(output))

    qstart, qstop, sstart, sstop = (-9, -9, -9, -9)
    for alignment in result.alignments:
        if not alignment.hsps:
            continue
        # Only the first (best) match is used. Stop here so that a later
        # alignment cannot overwrite it.
        hsp = alignment.hsps[0]
        # Account for the fact that the alignment could have either orientation.
        qstart = min(hsp.query_start, hsp.query_end)
        qstop = max(hsp.query_start, hsp.query_end)
        sstart = min(hsp.sbjct_start, hsp.sbjct_end)
        sstop = max(hsp.sbjct_start, hsp.sbjct_end)
        if debug:
            pdb.set_trace()
        break
    return (qstart, qstop, sstart, sstop)
Пример #2
0
def blastProcess(threadID, filebase, db, outbase, wordSize, hits=10, constant=False):
    """Run blastn for one worker's FASTA chunk and write tabular output.

    Args:
        threadID: Value substituted into ``filebase`` and ``outbase``.
        filebase: ``%``-format template producing the query FASTA path.
        db: BLAST database prefix; if ``db + ".nhr"`` does not exist the
            value is passed to blastn as a ``subject`` file instead.
        outbase: ``%``-format template producing the output path.
        wordSize: blastn ``word_size``.
        hits: blastn ``max_target_seqs``.
        constant: When true, require 100% identity (``perc_identity=100``).

    Any exception raised while running blastn is printed as a traceback and
    not propagated.
    """
    fasta = filebase % threadID
    output = outbase % threadID

    print( "Starting blast of %s against %s..." % (fasta, db) )

    # Options shared by every variant of the search.
    options = dict(
        query=fasta,
        out=output,
        outfmt="\'6 qseqid sseqid pident length mismatch gaps qstart qend sstart send evalue bitscore sstrand\'",
        gapopen=5,
        gapextend=2,
        penalty=-1,
        reward=1,
        evalue=1e-3,
        max_target_seqs=hits,
        word_size=wordSize,
    )
    # A formatted nucleotide database has a .nhr file; otherwise treat `db`
    # as a plain sequence file.
    if os.path.isfile(db + ".nhr"):
        options["db"] = db
    else:
        options["subject"] = db
    if constant:
        options["perc_identity"] = 100

    cline = NcbiblastnCommandline(blast_cmd, **options)

    try:
        cline()
    except Exception:
        # Best effort: report the failure but let the caller carry on.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        print( traceback.format_exc() )
Пример #3
0
    def run(self):
        """Claim a free lock slot, BLAST this object's reads, and report progress.

        The method polls ``self.locks`` until one can be acquired. While
        holding it, the reads are written to a temporary FASTA file whose
        name contains the slot index (``self.counter``), blastn is run
        against the database selected by ``self.genome`` ("t" or "l"), and
        the temporary file is removed. Finally the global ``finishcount`` is
        incremented and the completion percentage is printed.
        """
        global finishcount
        global totalthreads
        finished = False
        while not finished:
            self.counter = -1
            for lock in self.locks:
                self.counter += 1
                # Non-blocking acquire: testing locked() and then entering
                # "with lock" could block if another thread took the lock in
                # between the two steps.
                if not lock.acquire(False):
                    continue
                try:
                    temp_path = ""
                    if self.genome == "t":
                        temp_path = "Output/AsAlReads/MappedToAt/AtReads_" + str(
                            self.counter) + ".fasta"
                    elif self.genome == "l":
                        temp_path = "Output/AsAlReads/MappedToAl/AlReads_" + str(
                            self.counter) + ".fasta"
                    with open(temp_path, 'w') as outfile:
                        outfile.write(''.join(self.reads))

                    # The blastn path below is the Linux server install.
                    if self.genome == "t":
                        self.blastn_cline = NcbiblastnCommandline(
                            cmd='/share/apps/blast+/bin/blastn',
                            query=str(temp_path),
                            db="../at9db",
                            evalue=0.0001,
                            word_size=9,
                            outfmt=5,
                            out=str(self.outfile))
                    elif self.genome == "l":
                        self.blastn_cline = NcbiblastnCommandline(
                            cmd='/share/apps/blast+/bin/blastn',
                            query=str(temp_path),
                            db="../lyratadb",
                            evalue=0.0001,
                            word_size=9,
                            outfmt=5,
                            out=str(self.outfile))
                    print(self.blastn_cline)
                    stdout, stderr = self.blastn_cline()
                    if len(stdout) > 0 or len(stderr) > 0:
                        print(stdout, stderr, sep="\n")

                    # Removal is retried because the file may be briefly
                    # held open by another process, but the retries are
                    # bounded so a permanent failure cannot spin forever.
                    for _attempt in range(10):
                        try:
                            os.remove(temp_path)
                            break
                        except FileNotFoundError:
                            break  # already gone; nothing to do
                        except OSError:
                            time.sleep(0.5)
                    else:
                        print("Could not remove temporary file " + temp_path)
                finally:
                    lock.release()
                finished = True
                break
            if not finished:
                # Every slot was busy; wait before polling again.
                time.sleep(1)
        finishcount += 1
        percentdone = round(finishcount / totalthreads * 100, 1)
        print(str(percentdone) + "% done\n")
Пример #4
0
def query_blast(fa_path,
                DATA_DIR_PATH,
                OUTPUT_PATH,
                DB_NAME,
                DB_PATH,
                evalue=0.001,
                outfmt=5):
    """Run blastn for a query FASTA file against a BLAST database.

    The result file is named after the query file (extension replaced by
    ``.xml``) and written into ``OUTPUT_PATH``. ``DATA_DIR_PATH`` and
    ``DB_NAME`` are accepted but not used by this function.

    Returns:
        dict with ``"result_path"`` (full path of the result file) and
        ``"result_name"`` (its base name).
    """
    print("Querying the blast db...")

    query_stem, _extension = os.path.splitext(fa_path)
    result_name = os.path.basename(query_stem + ".xml")
    result_path = os.path.join(OUTPUT_PATH, result_name)

    run_search = NcbiblastnCommandline(
        cmd='blastn',
        query=fa_path,
        db=DB_PATH,
        evalue=evalue,
        outfmt=outfmt,
        out=result_path,
    )
    run_search()

    # Path and name of the query's result.
    return {"result_path": result_path, "result_name": result_name}
Пример #5
0
def BLAST(fragmentFile, sequenceFile, outputName):
    """BLAST ``fragmentFile`` against ``sequenceFile`` and collect subject spans.

    A BLAST database is first created from ``sequenceFile`` via
    ``createBLASTdb``. If that reports a non-zero status nothing is run and
    ``None`` is returned. Otherwise blastn writes XML to ``outputName`` and
    every HSP is turned into ``Coordinate(sbjct_start - 1, sbjct_end)``.

    Returns:
        List of ``Coordinate`` objects sorted by ``start``, or ``None`` when
        database creation failed.
    """
    sequenceFile, runStatus = createBLASTdb(sequenceFile)
    if runStatus != 0:
        return None

    NcbiblastnCommandline(cmd='blastn',
                          out=outputName,
                          outfmt=5,
                          query=fragmentFile,
                          strand="both",
                          dust="no",
                          db=sequenceFile,
                          evalue=0.001,
                          word_size=7)()

    coords = []
    # Close the XML file deterministically instead of leaking the handle.
    with open(outputName, 'r') as handle:
        for result in NCBIXML.parse(handle):
            for alignment in result.alignments:
                for hsp in alignment.hsps:
                    if hsp.strand[0] is not None:
                        # Single formatted argument so the output is the same
                        # under Python 2 and Python 3.
                        print("%s %s" % (
                            hsp.strand,
                            "WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW"))
                    coords.append(
                        Coordinate(hsp.sbjct_start - 1, hsp.sbjct_end))
    coords.sort(key=lambda x: x.start)
    return coords
Пример #6
0
def thaliana_blast_speedtest():
    """Time blastn on increasingly large batches, then exit the process.

    Used just for testing purposes, to see how long BLAST takes for a given
    number of query lines in a batch. The original author's finding: time and
    memory grow quadratically with batch size, and about 20,000 sequences per
    batch is the largest size for which the time is still roughly linear,
    hence the parallel BLASTing of 20,000-sequence batches elsewhere in the
    script.

    Batches grow by 20,000 lines per round until a round takes more than an
    hour or the input file is exhausted. The list of
    ``"<lines>: <seconds>"`` strings is printed and ``sys.exit(0)`` is called.
    """
    filename = "Output/AlReadsAFew.fasta"  # Input reads (100,000 or so sequences)
    filename2 = "Output/AlReadsSpeedTest.fasta"  # Batch query file rewritten each round
    times = []
    with open(filename) as infile:
        linelist = infile.readlines()
    maxtime = 0
    maxlines = 0
    while maxtime < 3600:  # Stop once a single batch takes longer than an hour
        maxlines += 20000
        with open(filename2, 'w') as outfile:
            # Take exactly `maxlines` lines; slicing to maxlines - 1 dropped
            # the last line of the batch.
            outfile.write(''.join(linelist[:maxlines]))
        start = time.time()
        blastn_cline = NcbiblastnCommandline(
            query=filename2,
            db="at9db",
            evalue=0.0001,
            outfmt=5,
            out="Output/AsAlReadsMappedToAt-Test.xml")
        stdout, stderr = blastn_cline()
        print(stdout, stderr, sep="\n")
        maxtime = time.time() - start
        # Record the elapsed time, not the absolute end timestamp.
        times.append(str(maxlines) + ": " + str(maxtime) + "\n")
        print(maxlines, "done")
        if maxlines >= len(linelist):
            # Whole input already used; a larger batch is impossible, so
            # looping again would repeat the same run forever.
            break
    print(times)
    sys.exit(0)
Пример #7
0
 def blast(self):
     """
     Queue one BLASTn search per sample and wait until the queue is drained.

     ``self.cpus`` daemon threads running ``self.blastthreads`` are started
     first. For every sample whose ``general.bestassemblyfile`` is not 'NA',
     the report path and the BLASTn command are stored on the sample's
     ``self.analysistype`` metadata, and the ``(sample, command)`` pair is put
     on ``self.blastqueue``. The method returns after ``self.blastqueue.join()``.
     """
     printtime('BLASTing FASTA files against {} database'.format(
         self.analysistype),
               self.starttime,
               output=self.portallog)
     for _ in range(self.cpus):
         worker = Thread(target=self.blastthreads, args=())
         # Thread.setDaemon() is deprecated; set the attribute directly.
         worker.daemon = True
         worker.start()
     for sample in self.runmetadata.samples:
         if sample.general.bestassemblyfile != 'NA':
             # Set the name of the BLAST report
             sample[self.analysistype].blastreport = os.path.join(
                 sample[self.analysistype].outputdir,
                 '{}_{}_blastresults.csv'.format(sample.name,
                                                 self.analysistype))
             # Use the NCBI BLASTn command line wrapper from BioPython to set
             # the parameters of the search. The database prefix is the bait
             # file path without its extension.
             blastn = NcbiblastnCommandline(
                 query=sample[self.analysistype].fasta,
                 db=os.path.splitext(sample[self.analysistype].baitfile)[0],
                 max_target_seqs=1,
                 num_threads=self.threads,
                 outfmt="'6 qseqid sseqid positive mismatch gaps "
                 "evalue bitscore slen length qstart qend qseq sstart send sseq'",
                 out=sample[self.analysistype].blastreport)
             # Add a string of the command to the metadata object
             sample[self.analysistype].blastcall = str(blastn)
             # Add the object and the command to the BLAST queue
             self.blastqueue.put((sample, blastn))
     self.blastqueue.join()
Пример #8
0
def blast_search(bconf, unique_sequence, database_name, temppath, pseudopath,
                 unique):
    """Run BLASTN to find sequences within the pseudoscaffold.

    The working directory is changed to ``pseudopath`` for the search and to
    ``temppath`` afterwards (it is NOT restored to its previous value).

    Args:
        bconf: Mapping of BLAST settings; ``bconf['outfile']`` names the
            result file when present and not ``None``.
        unique_sequence: File name of the query, located in ``temppath``.
        database_name: BLAST database passed to blastn as ``db``.
        temppath: Directory holding the query and, by default, the result.
        pseudopath: Directory containing the pseudoscaffold.
        unique: Prefix for the default result file ``<unique>_temp.xml``.

    Returns:
        Path of the XML result file.
    """
    # Change to the directory containing the pseudoscaffold
    os.chdir(pseudopath)
    # Fall back to a temporary result file when none is configured
    blast_out = bconf.get('outfile')
    if blast_out is None:
        blast_out = temppath + '/' + unique + '_temp.xml'
    # Where is the query file?
    unique_query = temppath + '/' + unique_sequence
    # Define the BLAST search
    blastn_cline = NcbiblastnCommandline(query=unique_query,
                                         db=database_name,
                                         evalue=0.05,
                                         max_target_seqs=1,
                                         outfmt=5,
                                         out=blast_out)
    print("Running BLAST search")
    # Run the BLAST search
    blastn_cline()
    print("Finished searching")
    os.chdir(temppath)
    return blast_out
Пример #9
0
def gfg():
    """Flask view: show the input form (GET) or run a BLAST search (POST).

    On POST the form fields ``seq``, ``ip_type`` ("fastq" or "fasta"),
    ``blast_type``, ``database``, ``db_typeo`` and ``evalue`` are read. When
    ``db_typeo`` is empty the search is sent to NCBI with ``NCBIWWW.qblast``;
    otherwise the matching local BLAST+ program is run against that database.
    Either way the XML result is written to ``outputhtml.xml`` and its first
    record is rendered with ``output.html``.

    Raises:
        ValueError: for an unsupported ``ip_type`` or local ``blast_type``,
            input without a sequence line, or a non-numeric ``evalue``.
    """
    if request.method != "POST":
        return render_template("input.html")

    import os
    import tempfile

    ip_sequence = request.form.get("seq") or ""
    ip_type = request.form.get("ip_type")
    blast_type = request.form.get("blast_type")
    database_type = request.form.get("database")
    my_blast_db = request.form.get("db_typeo")

    # Apply the default e-value BEFORE converting: float("") raises, so the
    # default could never take effect when the check came afterwards.
    raw_evalue = request.form.get("evalue")
    e_value_thresh = float(raw_evalue) if raw_evalue else 0.05

    # splitlines() also copes with the CRLF line endings browsers submit.
    lines = ip_sequence.splitlines()
    if len(lines) < 2:
        raise ValueError("Input must contain a header line followed by a sequence")
    if ip_type == "fastq":
        # Keep the identifier and the sequence line only, and turn the FASTQ
        # '@' header marker into the FASTA '>' marker.
        header = lines[0]
        if header.startswith("@"):
            header = ">" + header[1:]
        fasta_seq = header + "\n" + lines[1]
    elif ip_type == "fasta":
        # As before, the header line is dropped and the bare sequence is used.
        fasta_seq = "\n".join(lines[1:])
    else:
        raise ValueError("Unsupported input type: %r" % (ip_type,))

    # NOTE(review): "outputhtml.xml" is shared by all requests, so concurrent
    # requests can overwrite each other's results.
    if not my_blast_db:
        # BLAST over the internet
        result_handle = NCBIWWW.qblast(blast_type, database_type, fasta_seq)
        try:
            with open("outputhtml.xml", "w") as save_to:
                save_to.write(result_handle.read())
        finally:
            result_handle.close()
    else:
        # Local BLAST: pick the wrapper class for the requested program.
        local_commands = {
            "blastn": NcbiblastnCommandline,
            "blastp": NcbiblastpCommandline,
            "blastx": NcbiblastxCommandline,
            "tblastx": NcbitblastxCommandline,
            "tblastn": NcbitblastnCommandline,
        }
        if blast_type not in local_commands:
            raise ValueError("Unsupported BLAST program: %r" % (blast_type,))
        # The BLAST+ programs take the query as a FILE, so the sequence is
        # written to a temporary file that is removed afterwards.
        query_file = tempfile.NamedTemporaryFile(
            mode="w", suffix=".fasta", delete=False)
        try:
            with query_file:
                query_file.write(fasta_seq)
                if not fasta_seq.endswith("\n"):
                    query_file.write("\n")
            run_local = local_commands[blast_type](
                cmd=blast_type,
                query=query_file.name,
                db=my_blast_db,
                evalue=e_value_thresh,
                outfmt=5,  # XML, as required by NCBIXML below
                out="outputhtml.xml")
            # The command object must be called to actually run the search.
            run_local()
        finally:
            os.remove(query_file.name)

    # BLAST parsing
    with open("outputhtml.xml") as f:
        blast_record = list(NCBIXML.parse(f))[0]

    return render_template("output.html",
                           blast_record=blast_record,
                           e_value_threshold=e_value_thresh)
Пример #10
0
def crisprSingle(item, query_virus_dir, output_dir, numThreads):
    """BLAST one query file against the CRISPR database and summarise the hits.

    The query name is the part of ``item`` before its first dot. blastn
    (task ``blastn-short``) writes ``qacc sacc evalue`` rows to
    ``<output_dir>/<query name>.crispr``.

    Returns:
        ``(False, None)`` when the output is empty or no subject accession
        contains exactly two ``|`` characters; otherwise ``(True, frame)``
        where ``frame`` holds, per subject (second-to-last ``|`` field), the
        maximum ``-log(evalue)``, indexed by the query name.
    """
    query_name = item.split('.')[0]
    output_file = os.path.join(output_dir, query_name) + '.crispr'
    run_search = NcbiblastnCommandline(
        query=os.path.join(query_virus_dir, item),
        db=db_host_crispr_prefix,
        out=output_file,
        outfmt="6 qacc sacc evalue",
        evalue=1,
        gapopen=10,
        penalty=-1,
        gapextend=2,
        word_size=7,
        dust='no',
        task='blastn-short',
        perc_identity=90,
        num_threads=numThreads)
    run_search()

    # Parse blast results
    if os.stat(output_file).st_size == 0:
        return False, None

    hits = pd.read_table(output_file, header=None)
    # Sanity check for blastn output format
    pipe_counts = hits[1].apply(lambda x: x.count("|"))
    hits = hits[pipe_counts == 2]
    if hits.shape[0] == 0:
        return False, None

    hits[0] = query_name
    hits[1] = hits[1].apply(lambda x: x.split('|')[-2])
    hits[2] = -hits[2].apply(math.log)
    df_crispr = hits.groupby([0, 1]).max().unstack(fill_value=0)
    return True, df_crispr.set_index([[query_name]])
Пример #11
0
def blastn(blast_cmd, query, db, blastout, threads, max_hsps, max_target_seqs,
           perc_qcov, perc_ident):
    """Run blastn against the prophage or plasmid database and report the top hit.

    Args:
        blast_cmd: Path or name of the blastn executable.
        query: Query FASTA file.
        db: ``'Prophage'`` or ``'Plasmid'``; selects the database.
        blastout: Path of the tabular output file.
        threads: blastn ``num_threads``.
        max_hsps: blastn ``max_hsps``.
        max_target_seqs: blastn ``max_target_seqs``.
        perc_qcov: blastn ``qcov_hsp_perc``.
        perc_ident: blastn ``perc_identity``.

    Returns:
        Tuple ``(database label, subject title, identity, alignment length,
        query coverage, subject length)`` taken from the first output line
        (values are strings), or six ``'NA'`` strings when the output file
        is empty.

    Raises:
        ValueError: if ``db`` is neither ``'Prophage'`` nor ``'Plasmid'``.
    """
    evalue = 1e-5
    blast_fmt = "'6 qseqid stitle pident length qcovhsp qlen slen qstart qend sstart send evalue bitscore'"
    if db == 'Prophage':
        blast_db = db_prophage
        blast_database = 'PHASTER'
    elif db == 'Plasmid':
        blast_db = db_plasmid
        blast_database = 'Refseq_plasmid'
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on blast_db.
        raise ValueError(
            "db must be 'Prophage' or 'Plasmid', got %r" % (db,))
    blastn_out = NcbiblastnCommandline(cmd=blast_cmd,
                                       query=query,
                                       db=blast_db,
                                       evalue=evalue,
                                       outfmt=blast_fmt,
                                       out=blastout,
                                       num_threads=threads,
                                       max_hsps=max_hsps,
                                       max_target_seqs=max_target_seqs,
                                       qcov_hsp_perc=perc_qcov,
                                       perc_identity=perc_ident)
    blastn_out()
    if os.path.getsize(blastout) > 0:
        # Only the first (top) hit line is needed.
        with open(blastout, 'r') as handle:
            items_blast = handle.readline().split('\t')
        # Column positions follow blast_fmt above.
        blast_subject = items_blast[1]
        blast_ident = items_blast[2]
        blast_length = items_blast[3]
        blast_cov = items_blast[4]
        blast_slength = items_blast[6]
    else:
        blast_database = blast_subject = blast_ident = blast_length = blast_cov = blast_slength = 'NA'
    return blast_database, blast_subject, blast_ident, blast_length, blast_cov, blast_slength
def locate(genome, gene_db):
    """Find where the sequences of ``gene_db`` align on ``genome``.

    ``genome`` (query) and ``gene_db`` (subject) are written to FASTA files
    named after a prefix from ``generate_string()``, and blastn is run with
    tabular output ``qseqid qstart qend length``. Hits longer than 100 are
    kept as ``[qseqid, qstart - 1, qend]`` intervals.

    Returns:
        A sorted, merged ``pybedtools.BedTool`` of the intervals, or ``None``
        when there is no qualifying hit.
    """
    prefix = generate_string()
    query_fn = '{}.query.fasta'.format(prefix)
    sbjct_fn = '{}.sbjct.fasta'.format(prefix)
    try:
        SeqIO.write(genome, query_fn, 'fasta')
        SeqIO.write(gene_db, sbjct_fn, 'fasta')
        blastn_cline = NcbiblastnCommandline(
            query=query_fn,
            subject=sbjct_fn,
            evalue=1e-10,
            outfmt="'6 qseqid qstart qend length'")
        stdout, stderr = blastn_cline()
    finally:
        # Run the cleanup even when writing or BLAST fails, so the
        # prefix-named files are not left behind (clean() presumably deletes
        # them; confirm against its definition).
        clean(prefix)

    intervals = [
        [row[0], int(row[1]) - 1, int(row[2])]
        for row in csv.reader(stdout.splitlines(), delimiter='\t')
        if int(row[-1]) > 100  # last column is the alignment length
    ]
    if intervals:
        return pybedtools.BedTool(intervals).sort().merge(
            d=-100)  # deal with overlaps no more than 100 bp
    return None
def Runs_local_BLASTn_search(My_Query, BLAST_db_path, BLAST_db_folder_name,
                             BLAST_db_name):
    """Run a local blastn search for ``My_Query`` and write ``my_xml.xml``.

    The query is first stored by
    ``Puts_the_query_in_a_text_doc_so_BLAST_can_use_it``; blastn then reads
    ``<BLAST_db_path>/temporary_file.txt``, searches the database
    ``<BLAST_db_path>/<BLAST_db_folder_name>/<BLAST_db_name>`` and writes
    ``<BLAST_db_path>/my_xml.xml``. The temporary query file is deleted
    afterwards, also when the search fails.

    NOTE(review): no ``outfmt`` is passed, so despite the ``.xml`` name the
    output is in blastn's default format; confirm what the consumer expects.
    """
    Puts_the_query_in_a_text_doc_so_BLAST_can_use_it(My_Query, BLAST_db_path)
    try:
        # Author's notes: the command line did not cope with backslashes or
        # spaces in file paths, so backslashes are converted to forward
        # slashes before the paths are assembled.
        BLAST_db_path_no_backslash = Changes_backslashes_to_forward_slashes(
            BLAST_db_path)
        BLAST_db_name_no_backslash = Changes_backslashes_to_forward_slashes(
            BLAST_db_name)

        blastncline = NcbiblastnCommandline(
            query=(BLAST_db_path_no_backslash + "/" + "temporary_file.txt"),
            db=(BLAST_db_path_no_backslash + "/" + BLAST_db_folder_name + "/" +
                BLAST_db_name_no_backslash),
            out=(BLAST_db_path_no_backslash + "/my_xml.xml"))
        blastncline()
    finally:
        # Deletes the temporary text file, even if BLAST raised.
        Deletes_the_temporary_file(BLAST_db_path)
Пример #14
0
def blast(FastaFile, BlastDB, perID, SeqLen):
    '''
    BLAST every record of FastaFile against BlastDB and write flanking sequences.

    For each record, blastn (XML output, evalue 1e-10) is run via temporary
    files. Every HSP whose positives / alignment length percentage is at
    least perID is passed to parse_seq(); the returned upstream, downstream
    and full sequences are written with print_out() to files suffixed
    '_upstream.fasta', '_downstream.fasta' and '_fullseq.fasta'.

    Each header is '<subject title>|<percent>%|<query start>'.

    libraries:
        from Bio import SeqIO
        from Bio.Blast.Applications import NcbiblastnCommandline
        from Bio.Blast import NCBIXML
    '''
    with open(FastaFile, "r") as Fasta_Handle:
        for Record in SeqIO.parse(Fasta_Handle, "fasta"):
            # Temporary FASTA input and BLAST XML output. The input is opened
            # in text mode because a str is written to it.
            with tempfile.NamedTemporaryFile(mode="w+") as TempFasta, \
                    tempfile.NamedTemporaryFile(mode="w+") as TempBlastXML:
                TempFasta.write(">%s\n%s\n" % (Record.id, Record.seq))
                # Make sure the data is on disk before blastn opens the file.
                TempFasta.flush()

                #BLAST Record
                Blast_Command = NcbiblastnCommandline(query=TempFasta.name,
                                                      db=BlastDB,
                                                      evalue=1e-10,
                                                      out=TempBlastXML.name,
                                                      outfmt=5)
                Blast_Command()

                #lists
                UpList = []
                DownList = []
                FullSeqList = []
                Headers = []

                #loop over Records, check perID
                with open(TempBlastXML.name) as Result_Handle:
                    for Blast_Record in NCBIXML.parse(Result_Handle):
                        for Alignment in Blast_Record.alignments:
                            for Hsp in Alignment.hsps:
                                Hsp_perID = (
                                    (float(Hsp.positives) /
                                     float(Hsp.align_length)) * 100)
                                if Hsp_perID < int(perID):
                                    continue
                                #call seq function
                                UpStream, DownStream, FullSeq = parse_seq(
                                    Blast_Record, Hsp, Record, SeqLen)
                                #create list of seqs and Headers
                                UpList.append(UpStream)
                                DownList.append(DownStream)
                                FullSeqList.append(FullSeq)
                                #create header
                                Sbjct_Edit = Alignment.title.replace(
                                    ' No definition line', '')
                                Headers.append(
                                    str(Sbjct_Edit) + '|' +
                                    str(round(Hsp_perID, 1)) + '%' + '|' +
                                    str(Hsp.query_start))

            #print out
            print_out(UpList, Headers, Record.id, '_upstream.fasta')
            print_out(DownList, Headers, Record.id, '_downstream.fasta')
            print_out(FullSeqList, Headers, Record.id, '_fullseq.fasta')
Пример #15
0
def blast_func(samplecfg):
    """Run blastn for one sample, building the genome index first if needed.

    If any of the ``.nin``, ``.nhr`` or ``.nsq`` files next to
    ``samplecfg.genome`` is missing, ``makeblastdb`` is run on the genome.
    blastn then writes ``blast_<sample>.<suffix>.txt`` into ``mydir``.

    Returns:
        List starting with ``'BLAST'`` followed by section titles and the
        captured stdout/stderr of each command that was run.
    """
    blastlog = ['BLAST']

    # Create the index if it does not exist.
    index_files = [samplecfg.genome + ext for ext in ('.nin', '.nhr', '.nsq')]
    if not all(os.path.exists(path) for path in index_files):
        build_index = NcbimakeblastdbCommandline(cmd='makeblastdb',
                                                 dbtype='nucl',
                                                 input_file=samplecfg.genome)
        index_out, index_err = build_index()
        blastlog.extend(['\n\nNcbimakeblastdb\n\n', index_out, index_err])

    # BLAST
    report_path = (mydir + '/blast_' + samplecfg.sample + '.' +
                   samplecfg.suffix + '.txt')
    search = NcbiblastnCommandline(task='blastn',
                                   query=samplecfg.fasta,
                                   db=samplecfg.genome,
                                   outfmt=samplecfg.view,
                                   evalue=samplecfg.evalue,
                                   out=report_path)
    search_out, search_err = search()
    blastlog.extend(['\n\nNcbiblastn\n\n', search_out, search_err])
    return blastlog
Пример #16
0
def run_blastn(query, database, evalue, max_seqs, max_hsps):
    """Run BLASTn and return the path of the XML result.

    The result is written to ``<cwd>/<query base>_<database base>_BLAST.xml``
    where the base names are the file names without directory and extension.

    Args:
        query (str): Path of the query file.
        database (str): Path of the BLAST database.
        evalue (float): E-value threshold.
        max_seqs (int): blastn ``max_target_seqs``.
        max_hsps (int): blastn ``max_hsps``.

    Raises:
        TypeError: if an argument has the wrong type.
        BLASTFailedError: if no output file exists after the run.

    Exits the process via ``sys.exit`` if ``validate_db`` raises
    ``FileNotFoundError``.
    """
    # Explicit checks instead of assert: assertions are stripped when Python
    # runs with -O, which would silently disable the validation.
    expected_types = (
        ('query', query, str),
        ('database', database, str),
        ('evalue', evalue, float),
        ('max_seqs', max_seqs, int),
        ('max_hsps', max_hsps, int),
    )
    for arg_name, value, expected in expected_types:
        if not isinstance(value, expected):
            raise TypeError("%s must be %s, got %s" %
                            (arg_name, expected.__name__,
                             type(value).__name__))
    #   Create an output name
    query_base = os.path.basename(os.path.splitext(query)[0])
    database_base = os.path.basename(os.path.splitext(database)[0])
    blast_out = os.getcwd(
    ) + '/' + query_base + '_' + database_base + '_BLAST.xml'
    try:
        validate_db(database)
    except FileNotFoundError as error:
        sys.exit(error)
    #   Setup BLASTn
    blastn = NcbiblastnCommandline(query=query,
                                   db=database,
                                   evalue=evalue,
                                   outfmt=5,
                                   max_target_seqs=max_seqs,
                                   max_hsps=max_hsps,
                                   out=blast_out)
    #   Run BLASTn
    print(blastn, file=sys.stderr)
    blastn()
    if not os.path.exists(blast_out):
        raise BLASTFailedError
    return blast_out
def Blast_seq(mirna):
    """BLAST one miRNA sequence against the ``human_mirna`` database.

    The sequence is written to ``mirna.fasta`` and blastn writes
    comma-separated results (``outfmt=10``) to ``blast_result.csv`` in the
    current directory; a stale result file is removed first.

    Returns:
        List with the second column of every result row, or ``None`` when
        there are no rows or the result file cannot be read.
    """
    with open('mirna.fasta', 'w') as f:
        f.write('>' + 'refseq_1' + '\n' + str(mirna))
    if os.path.isfile('blast_result.csv'):
        os.remove('blast_result.csv')
    blastn_cline = NcbiblastnCommandline(query='mirna.fasta',
                                         db="human_mirna",
                                         evalue=0.1,
                                         outfmt=10,
                                         out="blast_result.csv",
                                         word_size=7,
                                         gapopen=5,
                                         gapextend=2,
                                         strand='both')
    blastn_cline()
    try:
        with open('blast_result.csv', 'r') as f:
            # splitlines() always yields whole lines; the previous code
            # iterated over single characters when the text had no newline.
            lines = f.read().splitlines()
    except (IOError, OSError):
        # No readable result file: treated the same as "no hits".
        return None
    list_of_mirna = [line.split(',')[1] for line in lines if ',' in line]
    return list_of_mirna or None
Пример #18
0
def get_homology_count(query_seq_list, db='TAIR10_Whole_Genome', word_size=11, gap_open=5, gape_extend=2,
                       reward=2, penalty=-3, num_threads=core) -> list:
    """Count blastn hits for each query sequence.

    Args:
        query_seq_list: Sequences to search, one blastn run per element.
        db: Database name inside ``db_file_path``; it is created with
            ``create_db`` when its ``.nin`` or ``.nhr`` file is missing.
        word_size, gap_open, gape_extend, reward, penalty, num_threads:
            Passed to blastn as ``word_size``, ``gapopen``, ``gapextend``,
            ``reward``, ``penalty`` and ``num_threads``.

    Returns:
        List with the number of tabular output lines (hits) per query, or a
        list of ``'Cannot be used'`` strings when ``check_usable()`` is false.

    Raises:
        RuntimeError: if the database is missing and cannot be created.
    """
    if not check_usable():
        return ['Cannot be used'] * len(query_seq_list)

    index_present = (os.path.exists(os.path.join(db_file_path, db + '.nin'))
                     and os.path.exists(os.path.join(db_file_path, db + '.nhr')))
    if not index_present:
        if not create_db(db_name=db):
            from backend.utils.log import custom_logger
            custom_logger.error({
                'action': 'create_db',
                'status': 'failed'
            })
            # A bare `raise` here had no active exception and only produced
            # "RuntimeError: No active exception to reraise".
            raise RuntimeError("Failed to create BLAST database %r" % (db,))

    db_path = os.path.join(db_file_path, db)
    homology_list = []
    for query_seq in query_seq_list:
        # presumably writes the sequence to query_path; verify in write_fasta
        write_fasta(query_seq)
        out = NcbiblastnCommandline(db=db_path, query=query_path, word_size=word_size, gapopen=gap_open,
                                    gapextend=gape_extend, reward=reward, penalty=penalty, outfmt=6,
                                    num_threads=num_threads)()[0]
        # Tabular format 6 prints one line per hit.
        homology_list.append(len(out.splitlines()))
    return homology_list
Пример #19
0
    def query(self, query_dir: Path, config: dict, blast_format: str,
              headers: tuple) -> pd.DataFrame:
        """Query the contents of a directory against the created database.

        The files in ``query_dir`` are aggregated into ``blast_query.fasta``
        (removed again afterwards), blastn is run through the shell, and its
        standard output is parsed as CSV.

        Args:
            query_dir (Path): Directory containing query files.
            config (dict): Extra blastn options.
            blast_format (str): Blast output format.
            headers ( tuple(*str) ): Column names for the final DataFrame.
        Raises:
            TypeError: When ``query_dir`` is not a Path or ``config`` is not a dict.
            FileNotFoundError: When the path does not exist or is not a directory.
            ValueError: When ``config`` contains an option reserved by this method.
            subprocess.SubprocessError: When blastn writes to stderr anything
                other than the "Examining 5 or more matches" warning.
        Returns:
            (pd.DataFrame): Pandas DataFrame containing query results.
        """
        if not isinstance(query_dir, Path):
            raise TypeError("Given object is not Path object")
        if not query_dir.exists():
            raise FileNotFoundError("Given path does not exist")
        if not query_dir.is_dir():
            raise FileNotFoundError("Given path is not directory")

        if not isinstance(config, dict):
            raise TypeError("Config file is not a dict object")
        # Options this method sets itself must not be overridden.
        reserved = ('query', 'db', 'outfmt', 'max_target_seqs',
                    'num_alignments')
        clashing = [option for option in reserved if option in config]
        if clashing:
            raise ValueError(
                "Given kwargs are not valid in terms of blast usage",
                clashing)

        aggregated = Path("blast_query.fasta")
        self._aggregate(query_dir, aggregated)
        try:
            cmd = NcbiblastnCommandline(query="blast_query.fasta",
                                        db=f"{self.name}",
                                        outfmt=blast_format,
                                        **config)
            blastn_output = subprocess.run(str(cmd),
                                           capture_output=True,
                                           shell=True)

            # Anything on stderr is an error unless it is the known,
            # harmless warning.
            if blastn_output.stderr:
                message = blastn_output.stderr.decode()
                if "Examining 5 or more matches" not in message:
                    raise subprocess.SubprocessError(
                        f"Blastn returned error: {blastn_output.stderr.decode()}")
        finally:
            if aggregated.exists():
                aggregated.unlink()

        raw_table = io.StringIO(blastn_output.stdout.decode())
        results_df: pd.DataFrame = pd.read_csv(raw_table,
                                               header=None,
                                               names=headers)
        return results_df
Пример #20
0
 def align(self, query, queryName):
     """Run blastn for ``query`` against every database found under ``self.dbPath``.

     A result folder ``<self.outputPath>/<queryName>`` is created via
     ``self.createFolder``. The tree below ``self.dbPath`` is walked and each
     file whose extension equals ``self.outputFormat`` is used as a database
     (its path without the extension). Results are written as XML to
     ``<alignPath>/<queryName>_<i>`` with ``i`` counting the searches.
     ``self.dbName`` is set to the file name without its extension for each
     database used. Files without an extension are skipped.
     """
     self.alignPath = self.outputPath + "/"+queryName
     self.createFolder(self.alignPath)
     print("alignPath "+self.alignPath)
     print("query "+query)
     print("queryName "+queryName)
     i=0
     print("dbpath es "+self.dbPath)
     for bases, dirs, files in os.walk(self.dbPath):
         for entry in files:
             # Split "name.ext" at the LAST dot, e.g. "sequences.fasta" or
             # "a.b.fasta" -> ("a.b", "fasta"). Files without any dot have
             # no extension and cannot be a database file (they used to
             # raise IndexError here).
             fileName, dot, fileFormat = entry.rpartition('.')
             if not dot or fileFormat != self.outputFormat:
                 continue
             # File name without ".<outputFormat>"; identical to the former
             # entry[:-6] for the "fasta" format and correct for others.
             self.dbName = fileName
             # One output file is produced for each database.
             dbPath =  bases + '/' + fileName
             output =  self.alignPath+ '/'+queryName+"_"+str(i)
             # The database already exists: build the command that searches
             # the query sequence in it and writes the output.
             print(output)
             blastnCline = NcbiblastnCommandline(query=query, db=dbPath, evalue=0.001, outfmt=5, out=output, word_size = 11)
             print(blastnCline)
             stdout, stderr = blastnCline()
             i=i+1
Пример #21
0
 def build_commandline(self, query):
     """Prepare the local BLAST command for the configured program.

     Command objects for 'blastn' and 'tblastx' are both built from the same
     options (temporary input/output file names, database, e-value, XML
     output, maximum hits). Unless ``self.web`` is set, ``query`` is written
     as FASTA to ``self.blastin``, the command matching ``self.prog`` is
     stored in ``self.commandline`` and ``self.blastin`` is closed.
     """
     #   Options shared by every supported program.
     shared_options = dict(query=self.blastin.name,
                           out=self.blastout.name,
                           db=self.db,
                           evalue=self.evalue,
                           outfmt=5,
                           max_target_seqs=self.maxhits)
     #   Program name -> Ncbi____Commandline wrapper class.
     wrappers = (('blastn', NcbiblastnCommandline),
                 ('tblastx', NcbitblastxCommandline))
     command_dict = {
         program: wrapper(**shared_options)
         for program, wrapper in wrappers
     }
     if self.web:
         return
     #   Command line BLAST only accepts input files and not sequences, so
     #   the query sequence goes into the temp FASTA file.
     SeqIO.write(query, self.blastin, 'fasta')
     self.commandline = command_dict[self.prog]
     self.blastin.close()
Пример #22
0
def blastn_analyze():
    """Run blastn for the current request and return the report as text.

    Request arguments read:
      * ``query``        -- query text, written to a temporary file.
      * ``db``           -- database name, appended to ``UPLOAD_FOLDER``.
      * ``outputFormat`` -- passed through as the BLAST ``outfmt`` value.

    The search runs with ``evalue=50.0`` and writes its report to
    ``TMP_RESULT_FOLDER + 'tmp_res'``; the content of that file is returned.
    The temporary query file is removed even when the search or the read
    fails.

    :return: the full text of the BLAST report.
    """
    import tempfile  # local import: only this handler needs it

    # NOTE(review): ``db`` and ``outputFormat`` come straight from the request
    # and reach the file system / BLAST command line unvalidated -- confirm
    # they are checked upstream or restrict them here.
    db = UPLOAD_FOLDER + request.args.get('db')
    out_format = request.args.get('outputFormat')
    # NOTE(review): the result file name is fixed, so concurrent requests
    # overwrite each other's report; kept as-is because other code may read
    # this path.
    result_path = TMP_RESULT_FOLDER + 'tmp_res'

    # A uniquely named file replaces the former random.randint() suffix, which
    # could collide between simultaneous requests.
    query_file = tempfile.NamedTemporaryFile(mode="w",
                                             prefix="query-",
                                             dir=TMP_QUERY_FOLDER,
                                             delete=False)
    query_path = query_file.name
    try:
        with query_file:
            query_file.write(request.args.get('query'))
        blastn_cline = NcbiblastnCommandline(query=query_path,
                                             db=db,
                                             evalue=50.0,
                                             outfmt=out_format,
                                             out=result_path)
        blastn_cline()
        # Read the report in one go instead of concatenating line by line.
        with open(result_path) as report:
            return report.read()
    finally:
        os.remove(query_path)
Пример #23
0
def get_top_blast_hit(query, database):
    """
    Run blastn and return the query segment of the first HSP reported.

    :param query: Query passed to blastn as ``query``. Expects an assembly in FASTA format.
    :param database: Database passed to blastn as ``db``.
    :return: sequence: the aligned query string of the first HSP found; it is
        reverse complemented when ``hsp.align_length - hsp.sbjct_start < 1``.
        'NA' is returned when no HSP is found.
    """
    xml_output, _ = NcbiblastnCommandline(db=database, outfmt=5, query=query)()
    sequence = 'NA'
    # A single flag replaces the two counters: they were always bumped together.
    found = False
    for record in NCBIXML.parse(StringIO(xml_output)):
        if found:
            break
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                if found:
                    break
                needs_revcomp = hsp.align_length - hsp.sbjct_start < 1
                sequence = (str(Seq(hsp.query).reverse_complement())
                            if needs_revcomp else str(hsp.query))
                found = True

    return sequence
Пример #24
0
def run_blast(species_id_path):
    """Run blastn for one species and export the result as XML and text.

    The search writes a BLAST archive (``outfmt=11``) named after
    ``species_id_path.stem``; ``blast_formatter`` then converts that archive
    to tabular-with-comments text (``outfmt=7``) and XML (``outfmt=5``).
    Nothing is run when the XML output already exists. When the ``.ndb``
    database file is missing, ``blastdb(species_id_path)`` is called first.

    :param species_id_path: path object whose ``stem`` names the database
        and all output files.
    """
    blastn_bin = '/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blastn'
    formatter_bin = '/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blast_formatter'

    stem = species_id_path.stem
    asn_path = species_out_asn_dir / (stem + ".asn")
    xml_path = species_out_xml_dir / (stem + ".xml")
    txt_path = species_out_txt_dir / (stem + ".txt")

    search = NcbiblastnCommandline(cmd=blastn_bin,
                                   query=query_file,
                                   db=species_db_dir / stem,
                                   outfmt=11,
                                   out=asn_path)
    to_xml = NcbiblastformatterCommandline(archive=asn_path,
                                           outfmt=5,
                                           out=xml_path,
                                           cmd=formatter_bin)
    to_txt = NcbiblastformatterCommandline(archive=asn_path,
                                           outfmt=7,
                                           out=txt_path,
                                           cmd=formatter_bin)
    ndb_path = species_db_dir / (stem + ".ndb")

    # The XML file is the marker that this species was already processed.
    if xml_path.exists():
        return
    if not ndb_path.exists():
        blastdb(species_id_path)
    search()
    to_txt()
    to_xml()
Пример #25
0
def passed(candidate_seq, blastn_db, evalue, tmpdir):
    """
    Judge whether a candidate probe passes (is uniquely mapped) or not.

    The candidate is saved as FASTA in `tmpdir`, searched against
    `blastn_db` with blastn (tabular output, ``outfmt=6``), and the result
    file is handed to `is_unique_mapped`.

    Parameters
    ----------
    candidate_seq : `Seq`
        Candidate probe seq. Its `name` and `sub_range` attributes are used
        to name the temporary files.
    blastn_db : str
        Path to blastn database.
    evalue : float
        '-evalue' parameter in blastn.
    tmpdir : str
        Path to temporary dir, used to store the blastn query and result
        files. Created if missing.

    Returns
    -------
    The value returned by `is_unique_mapped` for the blastn result file.
    """
    # exist_ok avoids the check-then-create race when several workers share
    # the same temporary directory.
    os.makedirs(tmpdir, exist_ok=True)
    seqname = candidate_seq.name + "_" + "{}-{}".format(*candidate_seq.sub_range)
    tmp_in = os.path.join(tmpdir, seqname + ".fa")
    save_fasta([candidate_seq], tmp_in)
    tmp_out = os.path.join(tmpdir, seqname + ".tsv")
    cline = NcbiblastnCommandline(query=tmp_in,
                                  db=blastn_db,
                                  evalue=evalue,
                                  outfmt=6,
                                  out=tmp_out)
    cline()
    return is_unique_mapped(tmp_out)
Пример #26
0
def run_blastn(query, subject, evalue, max_hits, max_hsps, identity, keep_query):
    """Validate the arguments, configure a BLASTn search and run it.

    The XML report (``outfmt=5``) is written to the current working
    directory as ``<query base>_<subject base>_BLAST.xml``, where each base
    is the file name without directory and extension.

    Args:
        query: path to the query file (str).
        subject: path to the subject file (str).
        evalue: e-value threshold (float).
        max_hits: value for ``max_target_seqs`` (int).
        max_hsps: value for ``max_hsps`` (int).
        identity: value for ``perc_identity`` (float).
        keep_query: forwarded unchanged to the runner (bool).

    Returns:
        Whatever the runner call at the end of the function returns.

    Raises:
        TypeError: if any argument is not of the type listed above.
    """
    #   Explicit checks rather than ``assert``: assertions are stripped when
    #   Python runs with -O, which would silently disable the validation.
    expected_types = (
        ('query', query, str),
        ('subject', subject, str),
        ('evalue', evalue, float),
        ('max_hits', max_hits, int),
        ('max_hsps', max_hsps, int),
        ('identity', identity, float),
        ('keep_query', keep_query, bool),
    )
    for name, value, expected in expected_types:
        if not isinstance(value, expected):
            raise TypeError('%s must be %s, got %s' %
                            (name, expected.__name__, type(value).__name__))
    #   Create an output name
    print("Running BLAST against subject:", subject, file=sys.stderr)
    query_base = os.path.basename(os.path.splitext(query)[0])
    db_base = os.path.basename(os.path.splitext(subject)[0])
    blast_out = os.getcwd() + '/' + query_base + '_' + db_base + '_BLAST.xml'
    #   Setup BLASTn
    blastn = NcbiblastnCommandline(
        query=query,
        subject=subject,
        evalue=evalue,
        outfmt=5,
        max_target_seqs=max_hits,
        max_hsps=max_hsps,
        perc_identity=identity,
        out=blast_out
    )
    #   Run BLASTn
    #   NOTE(review): this call uses the function's own name with keywords
    #   (``cline``) that this signature does not accept; presumably a runner
    #   helper from another module was intended -- confirm the target.
    outfile = run_blastn(cline=blastn, keep_query=keep_query)
    return outfile
Пример #27
0
    def run_blastn(ref_db, query, cpu, max_targets):
        """
        Run blastn through biopython and return its XML output as a handle.
        :param ref_db: BLAST database to search (value for ``db``)
        :param query: query fasta file
        :param cpu: number of threads
        :param max_targets: maximum targets returned by blast
        :return: ``StringIO`` wrapping the XML (``outfmt=5``) written to stdout
        :raises Exception: when blastn writes anything to stderr that does
            not contain the word 'Warning'
        """
        search = NcbiblastnCommandline(db=ref_db,
                                       query=query,
                                       evalue='1e-10',
                                       outfmt=5,
                                       max_target_seqs=max_targets,
                                       num_threads=cpu)
        xml_text, error_text = search()
        # Warnings alone are tolerated; any other stderr content is fatal.
        if error_text and 'Warning' not in error_text:
            raise Exception(
                'There was a problem with the blast:\n{}'.format(error_text))
        return StringIO(xml_text)
 def blast_alleles(self, runmetadata, amino_acid):
     """
     Run one BLAST search per sample, writing to ``sample.alleles.blast_report``
     :param runmetadata: object whose ``samples`` attribute is iterated; each sample supplies
     ``general.bestassemblyfile`` (query) and ``alleles.blast_report`` (output)
     :param amino_acid: truthy to run blastp, falsy to run blastn (with ``task='blastn'``)
     """
     logging.info('Running BLAST analyses')
     for sample in runmetadata.samples:
         # Options common to both programs; the database is the combined
         # targets path with its extension removed.
         options = dict(
             db=os.path.splitext(self.combined_targets)[0],
             query=sample.general.bestassemblyfile,
             num_alignments=100000000,
             evalue=0.001,
             num_threads=self.cpus,
             outfmt=self.outfmt,
             out=sample.alleles.blast_report,
         )
         if amino_acid:
             search = NcbiblastpCommandline(**options)
         else:
             search = NcbiblastnCommandline(task='blastn', **options)
         search()
def blast(db_file, query_file, blast_out_xml_file, blast_out_asn_file,
          blast_out_txt_file):
    """Run blastn into a BLAST archive, then export it as XML and as text.

    Steps, in order: blastn writes an archive (``outfmt=11``) to
    ``blast_out_asn_file``; ``blast_formatter`` converts that archive to
    XML (``outfmt=5``) and then to tabular-with-comments text (``outfmt=7``).

    :param db_file: BLAST database to search.
    :param query_file: query file.
    :param blast_out_xml_file: destination of the XML report.
    :param blast_out_asn_file: destination of the archive.
    :param blast_out_txt_file: destination of the text report.
    """
    formatter_bin = '/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blast_formatter'

    search = NcbiblastnCommandline(
        cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blastn',
        query=query_file,
        db=db_file,
        outfmt=11,
        out=blast_out_asn_file)
    search()

    # Same formatter call twice; only the format code and destination differ.
    exports = ((5, blast_out_xml_file), (7, blast_out_txt_file))
    for format_code, destination in exports:
        convert = NcbiblastformatterCommandline(archive=blast_out_asn_file,
                                                outfmt=format_code,
                                                out=destination,
                                                cmd=formatter_bin)
        convert()
Пример #30
0
def blast_fasta(args, iteration):
    """Return parsed BLAST records for ``args.transcripts[iteration]``.

    When ``args.blast_xml`` is truthy, the existing XML file
    ``args.blast_xml[iteration]`` is parsed and no search is run. Otherwise
    blastn is run against the eight ``refseq_rna.0N`` volumes (``N`` from 0
    to 7), writing ``blast_out_<iteration>.xml`` into ``args.outfolder``,
    and that file is parsed.

    :param args: object providing ``blast_xml``, ``outfolder`` and
        ``transcripts``.
    :param iteration: index into ``args.transcripts`` / ``args.blast_xml``.
    :return: the value of ``NCBIXML.parse`` over the opened XML file. The
        file is opened here and not closed by this function.
    """
    # Eight volume paths, each followed by a space, wrapped in double quotes.
    volume = "/Users/kxs624/Documents/data/NCBI_RNA_database/refseq_rna.0{0} "
    database = "\"" + "".join(volume.format(n) for n in range(8)) + "\""
    print(database)

    if args.blast_xml:
        return NCBIXML.parse(open(args.blast_xml[iteration], 'r'))

    print("blasting")
    out_file = os.path.join(args.outfolder,
                            'blast_out_{0}.xml'.format(iteration))
    search = NcbiblastnCommandline(query=args.transcripts[iteration],
                                   db=database,
                                   evalue=0.001,
                                   outfmt=5,
                                   out=out_file,
                                   max_target_seqs=20)
    print(search)
    stdout, stderr = search()

    print(stdout, stderr)
    print("Done")
    return NCBIXML.parse(open(out_file, 'r'))