def BLAST(query, subject, debug=False):
    """Align ``query`` against ``subject`` with blastn and locate the hit.

    Both sequences are written to ``seq1.fasta`` and ``seq2.fasta`` in the
    current working directory, blastn is run in subject mode with XML
    output, and the first HSP of the reported alignment is inspected.
    Queries shorter than 200 characters use the ``blastn-short`` task.

    Args:
        query: Query sequence as a string.
        subject: Subject sequence as a string.
        debug: When true, drop into ``pdb`` once the coordinates are set.

    Returns:
        ``(qstart, qstop, sstart, sstop)`` with start <= stop on each axis,
        or four ``-9`` values when BLAST reported no HSP.
    """
    SeqIO.write(SeqRecord(Seq(query), id="seq1"), "seq1.fasta", "fasta")
    SeqIO.write(SeqRecord(Seq(subject), id="seq2"), "seq2.fasta", "fasta")

    # Run BLAST and parse the output as XML.
    blast_options = dict(
        query="seq1.fasta",
        subject="seq2.fasta",
        outfmt=5,
        max_target_seqs=1,
    )
    if len(query) < 200:
        blast_options["task"] = "blastn-short"
    xml_text = NcbiblastnCommandline(**blast_options)()[0]
    record = NCBIXML.read(StringIO(xml_text))

    coords = (-9, -9, -9, -9)
    for alignment in record.alignments:
        # Only the first (best) HSP of an alignment is looked at.
        for hsp in alignment.hsps[:1]:
            # min/max because the alignment may be reported in either
            # orientation.
            query_ends = (hsp.query_start, hsp.query_end)
            sbjct_ends = (hsp.sbjct_start, hsp.sbjct_end)
            coords = (
                min(query_ends),
                max(query_ends),
                min(sbjct_ends),
                max(sbjct_ends),
            )
            if debug:
                pdb.set_trace()
    return coords
def blastProcess(threadID, filebase, db, outbase, wordSize, hits=10, constant=False):
    """Run blastn for one chunk of a split query set.

    Args:
        threadID: Value substituted into ``filebase`` and ``outbase``.
        filebase: ``%``-format template producing the query FASTA path.
        db: BLAST database prefix. When ``db + ".nhr"`` is not a file the
            value is handed to blastn as ``subject`` instead of ``db``.
        outbase: ``%``-format template producing the output path.
        wordSize: Value for blastn's ``word_size``.
        hits: Value for blastn's ``max_target_seqs``.
        constant: When true, only 100 % identical hits are requested.

    A failing BLAST run is reported by printing the traceback; it is not
    re-raised.
    """
    fasta = filebase % threadID
    output = outbase % threadID
    print("Starting blast of %s against %s..." % (fasta, db))

    # Options shared by every variant of the search.
    options = dict(
        query=fasta,
        out=output,
        outfmt="'6 qseqid sseqid pident length mismatch gaps qstart qend sstart send evalue bitscore sstrand'",
        gapopen=5,
        gapextend=2,
        penalty=-1,
        reward=1,
        evalue=1e-3,
        max_target_seqs=hits,
        word_size=wordSize,
    )
    # A formatted database is recognised by its .nhr file; anything else is
    # treated as a plain subject FASTA.
    target_option = "db" if os.path.isfile(db + ".nhr") else "subject"
    options[target_option] = db
    if constant:
        options["perc_identity"] = 100

    cline = NcbiblastnCommandline(blast_cmd, **options)
    try:
        cline()
    except Exception:
        # Best effort: report the failure but let the caller carry on.
        # (Exception, not a bare except, so Ctrl-C still interrupts.)
        print(traceback.format_exc())
def run(self):
    """Write this worker's reads to a temp FASTA, BLAST them, then report.

    The worker polls ``self.locks`` once per second until it manages to
    take a free lock; while holding it, the reads in ``self.reads`` are
    written to a per-slot FASTA file, blastn is run against the database
    selected by ``self.genome`` ("t" or "l") and the temp file is removed.
    Afterwards the module-level ``finishcount`` is incremented and the
    overall progress percentage is printed.

    NOTE(review): the nesting below was reconstructed from a collapsed
    source line - confirm the placement of ``finished = 'T'`` and
    ``time.sleep(1)`` against the original file.
    """
    global finishcount
    global totalthreads
    finished = 'F'
    while True:
        if finished == 'T':
            break
        # self.counter tracks the index of the lock currently examined; it
        # is also used to name the temp FASTA file.
        self.counter = -1
        for lock in self.locks:
            self.counter += 1
            if finished == 'T':
                break
            if not lock.locked():
                with lock:
                    tempfile = ""
                    if self.genome == "t":
                        tempfile = "Output/AsAlReads/MappedToAt/AtReads_" + str(
                            self.counter) + ".fasta"
                    elif self.genome == "l":
                        tempfile = "Output/AsAlReads/MappedToAl/AlReads_" + str(
                            self.counter) + ".fasta"
                    with open(tempfile, 'w') as outfile:
                        outline = ''.join(self.reads)
                        outfile.write(outline)
                    if self.genome == "t":
                        # Linux server setup; the original author's Windows
                        # variant used cmd='blastn' and db="at9db".
                        self.blastn_cline = NcbiblastnCommandline(
                            cmd='/share/apps/blast+/bin/blastn',
                            query=str(tempfile),
                            db="../at9db",
                            evalue=0.0001,
                            word_size=9,
                            outfmt=5,
                            out=str(self.outfile))
                    elif self.genome == "l":
                        # Linux server setup; the original author's Windows
                        # variant used cmd='blastn' and db="lyratadb".
                        self.blastn_cline = NcbiblastnCommandline(
                            cmd='/share/apps/blast+/bin/blastn',
                            query=str(tempfile),
                            db="../lyratadb",
                            evalue=0.0001,
                            word_size=9,
                            outfmt=5,
                            out=str(self.outfile))
                    print(self.blastn_cline)
                    stdout, stderr = self.blastn_cline()
                    # Only echo BLAST output when there is any.
                    if len(stdout) > 0 or len(stderr) > 0:
                        print(stdout, stderr, sep="\n")
                    # Retry the delete until it succeeds.
                    # NOTE(review): this spins without sleeping and the bare
                    # except hides every error (including KeyboardInterrupt),
                    # so a file that can never be removed hangs the thread.
                    while True:
                        try:
                            os.remove(tempfile)
                            break
                        except:
                            pass
                    finished = 'T'
        time.sleep(1)
    # NOTE(review): finishcount is a module global updated without a lock
    # here - confirm whether callers rely on it being exact.
    finishcount += 1
    percentdone = round(finishcount / totalthreads * 100, 1)
    print(str(percentdone) + "% done\n")
def query_blast(fa_path, DATA_DIR_PATH, OUTPUT_PATH, DB_NAME, DB_PATH, evalue=0.001, outfmt=5):
    """Search a BLAST database with a query FASTA file.

    The report is written to ``OUTPUT_PATH`` and is named after the query
    file with its extension replaced by ``.xml``.

    Args:
        fa_path: Path of the query FASTA file.
        DATA_DIR_PATH: Not used by this function.
        OUTPUT_PATH: Directory receiving the report.
        DB_NAME: Not used by this function.
        DB_PATH: Database passed to blastn as ``db``.
        evalue: E-value cut-off.
        outfmt: blastn output format code.

    Returns:
        A dict with the keys ``result_path`` and ``result_name``.
    """
    print("Querying the blast db...")

    stem, _extension = os.path.splitext(fa_path)
    result_name = os.path.basename(stem + ".xml")
    result_path = os.path.join(OUTPUT_PATH, result_name)

    run_search = NcbiblastnCommandline(
        cmd='blastn',
        query=fa_path,
        db=DB_PATH,
        evalue=evalue,
        outfmt=outfmt,
        out=result_path,
    )
    run_search()

    # Hand back both the location and the bare name of the report.
    return {"result_path": result_path, "result_name": result_name}
def BLAST(fragmentFile, sequenceFile, outputName):
    """BLAST fragments against a sequence and collect the subject ranges hit.

    A BLAST database is created for ``sequenceFile`` via ``createBLASTdb``;
    when that reports status 0, blastn is run (both strands, no dust
    filter, word size 7, e-value 0.001) with XML output written to
    ``outputName``, and every HSP contributes one ``Coordinate``.

    Args:
        fragmentFile: Query FASTA file.
        sequenceFile: Sequence file to build the database from.
        outputName: Path of the XML report to write.

    Returns:
        A list of ``Coordinate(sbjct_start - 1, sbjct_end)`` objects sorted
        by ``start``; empty when the database could not be created.
    """
    coords = []
    sequenceFile, runStatus = createBLASTdb(sequenceFile)
    if runStatus == 0:
        NcbiblastnCommandline(
            cmd='blastn',
            out=outputName,
            outfmt=5,
            query=fragmentFile,
            strand="both",
            dust="no",
            db=sequenceFile,
            evalue=0.001,
            word_size=7,
        )()
        # NCBIXML.parse is lazy, so the report must stay open while the
        # records are consumed; the with-block closes it afterwards.
        with open(outputName, 'r') as handle:
            for result in NCBIXML.parse(handle):
                for alignment in result.alignments:
                    for hsp in alignment.hsps:
                        if hsp.strand[0] is not None:
                            # Debug marker; formatted so the output is the
                            # same on Python 2 and Python 3.
                            print("%s %s" % (
                                hsp.strand,
                                "WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW"))
                        # Subtract 1 from the start: (sbjct_start - 1,
                        # sbjct_end) is what Coordinate receives.
                        coords.append(
                            Coordinate(hsp.sbjct_start - 1, hsp.sbjct_end))
    coords.sort(key=lambda x: x.start)
    return coords
def thaliana_blast_speedtest():
    """Time blastn on increasingly large batches of query lines.

    Testing aid only. Batches grow by 20,000 lines of
    ``Output/AlReadsAFew.fasta`` per round; each batch is written to
    ``Output/AlReadsSpeedTest.fasta`` and searched against ``at9db``. The
    loop stops once a single run takes an hour or more, or once the whole
    input file has been used. The collected ``"<lines>: <seconds>"``
    entries are printed and the process exits with status 0.

    Original author's note: BLAST time/memory grew roughly quadratically
    with batch size and about 20,000 sequences per batch was the largest
    size that still behaved approximately linearly, which is why the
    script BLASTs small batches in parallel.
    """
    filename = "Output/AlReadsAFew.fasta"  # Input reads
    filename2 = "Output/AlReadsSpeedTest.fasta"  # Batch written for each run
    times = []
    with open(filename) as infile:
        linelist = infile.readlines()
    maxtime = 0
    maxlines = 0
    # Also stop when the input is exhausted; otherwise the same full file
    # would be re-BLASTed forever if no run ever reaches an hour.
    while maxtime < 3600 and maxlines < len(linelist):
        maxlines += 20000
        with open(filename2, 'w') as outfile:
            # The slice end is exclusive already, so no "- 1": subtracting
            # one dropped the last line of every batch.
            outfile.write(''.join(linelist[:maxlines]))
        start = time.time()
        blastn_cline = NcbiblastnCommandline(
            query=filename2,
            db="at9db",
            evalue=0.0001,
            outfmt=5,
            out="Output/AsAlReadsMappedToAt-Test.xml")
        stdout, stderr = blastn_cline()
        print(stdout, stderr, sep="\n")
        maxtime = time.time() - start
        # Record the elapsed seconds, not the absolute end timestamp.
        times.append(str(maxlines) + ": " + str(maxtime) + "\n")
        print(maxlines, "done")
    print(times)
    sys.exit(0)
def blast(self):
    """
    Queue one blastn search per sample and wait for the workers to finish.

    ``self.cpus`` daemon threads running ``self.blastthreads`` are started
    first. For every sample whose best assembly file is not 'NA' the
    report path and the command string are stored on the sample's
    ``self.analysistype`` metadata and ``(sample, command)`` is put on
    ``self.blastqueue``. The call returns once the queue has been joined.
    """
    printtime('BLASTing FASTA files against {} database'.format(
        self.analysistype), self.starttime, output=self.portallog)
    for _ in range(self.cpus):
        worker = Thread(target=self.blastthreads, args=())
        # Thread.setDaemon() is deprecated; the attribute is the supported
        # spelling and behaves the same.
        worker.daemon = True
        worker.start()
    for sample in self.runmetadata.samples:
        if sample.general.bestassemblyfile != 'NA':
            analysis = sample[self.analysistype]
            # Set the name of the BLAST report
            analysis.blastreport = os.path.join(
                analysis.outputdir,
                '{}_{}_blastresults.csv'.format(sample.name,
                                                self.analysistype))
            # Tabular output with custom columns; the database prefix is the
            # bait file path without its extension.
            blastn = NcbiblastnCommandline(
                query=analysis.fasta,
                db=os.path.splitext(analysis.baitfile)[0],
                max_target_seqs=1,
                num_threads=self.threads,
                outfmt="'6 qseqid sseqid positive mismatch gaps "
                       "evalue bitscore slen length qstart qend qseq sstart send sseq'",
                out=analysis.blastreport)
            # Keep a string copy of the command in the metadata object
            analysis.blastcall = str(blastn)
            # Hand the sample and its command to the worker threads
            self.blastqueue.put((sample, blastn))
    # Block until every queued search has been marked done
    self.blastqueue.join()
def blast_search(bconf, unique_sequence, database_name, temppath, pseudopath, unique):
    """Run BLASTN to find sequences within the pseudoscaffold.

    The search is executed from ``pseudopath``; afterwards the working
    directory is changed to ``temppath``.

    Args:
        bconf: Mapping of BLAST settings; its optional ``outfile`` entry
            names the report file.
        unique_sequence: File name of the query inside ``temppath``.
        database_name: Database passed to blastn as ``db``.
        temppath: Directory holding the query (and the default report).
        pseudopath: Directory containing the pseudoscaffold.
        unique: Prefix for the default report name
            ``<temppath>/<unique>_temp.xml``.

    Returns:
        Path of the XML report.
    """
    # Change to the directory containing the pseudoscaffold
    os.chdir(pseudopath)
    # Fall back to a temp report when no outfile was configured
    configured_out = bconf.get('outfile')
    if configured_out is None:
        blast_out = temppath + '/' + unique + '_temp.xml'
    else:
        blast_out = bconf['outfile']
    # Where is the query file?
    unique_query = temppath + '/' + unique_sequence
    # Define the BLAST search
    blastn_cline = NcbiblastnCommandline(
        query=unique_query,
        db=database_name,
        evalue=0.05,
        max_target_seqs=1,
        outfmt=5,
        out=blast_out)
    print("Running BLAST search")
    # Run the BLAST search
    blastn_cline()
    print("Finished searching")
    os.chdir(temppath)
    return (blast_out)
def gfg():
    """Handle the BLAST form: show it on GET, run the search on POST.

    Form fields read: ``seq`` (sequence text), ``ip_type`` ("fastq" or
    "fasta"), ``blast_type`` (program name), ``database`` (remote database
    for NCBI), ``db_typeo`` (local database; empty means "search at NCBI")
    and ``evalue`` (threshold, default 0.05 when left empty).

    The XML report is written to ``outputhtml.xml`` and its first record is
    rendered with ``output.html``.

    Raises:
        ValueError: If ``evalue`` is not a number, or a local search is
            requested with an unsupported program name.
    """
    import os
    import tempfile

    if request.method != "POST":
        return render_template("input.html")

    # Read the submitted form fields
    ip_sequence = request.form.get("seq")
    ip_type = request.form.get("ip_type")
    blast_type = request.form.get("blast_type")
    database_type = request.form.get("database")
    my_blast_db = request.form.get("db_typeo")

    # Apply the default BEFORE converting: float("") raises ValueError, so
    # the original post-conversion check could never take effect.
    raw_evalue = request.form.get("evalue")
    e_value_thresh = float(raw_evalue) if raw_evalue else 0.05

    lines = ip_sequence.split("\n")
    if ip_type == "fastq":
        # Keep the id line and the sequence line only; a FASTQ id starts
        # with "@", which has to become ">" to form a FASTA header.
        header = lines[0]
        if header.startswith("@"):
            header = ">" + header[1:]
        fasta_seq = header + "\n" + lines[1]
    elif ip_type == "fasta":
        # Everything after the header line, i.e. the bare sequence
        fasta_seq = "\n".join(lines[1:])
    else:
        # Unknown input type: pass the text through unchanged
        fasta_seq = ip_sequence

    if not my_blast_db:
        print("1")
        # BLAST over the internet
        result_handle = NCBIWWW.qblast(blast_type, database_type, fasta_seq)
        try:
            with open("outputhtml.xml", "w") as save_to:
                save_to.write(result_handle.read())
        finally:
            result_handle.close()
    else:
        # Local BLAST. The whitelist also keeps arbitrary form input from
        # being used as the executable name.
        local_commands = {
            "blastn": NcbiblastnCommandline,
            "blastp": NcbiblastpCommandline,
            "blastx": NcbiblastxCommandline,
            "tblastx": NcbitblastxCommandline,
            "tblastn": NcbitblastnCommandline,
        }
        if blast_type not in local_commands:
            raise ValueError("Unsupported BLAST program: %r" % (blast_type,))
        # NOTE(review): my_blast_db still comes straight from the form and
        # ends up on a command line - restrict it to known databases.

        # Command-line BLAST needs the query in a file, not as text.
        fd, query_path = tempfile.mkstemp(suffix=".fasta")
        try:
            with os.fdopen(fd, "w") as query_file:
                query_file.write(fasta_seq + "\n")
            run_local = local_commands[blast_type](
                cmd=blast_type,
                query=query_path,
                db=my_blast_db,
                evalue=e_value_thresh,
                outfmt=5,  # XML, because the report is parsed with NCBIXML
                out="outputhtml.xml")
            # Building the command object does not run it; calling it does.
            run_local()
        finally:
            os.remove(query_path)

    # Parse the report and render its first record
    with open("outputhtml.xml") as f:
        blast_record = list(NCBIXML.parse(f))[0]
    return render_template("output.html",
                           blast_record=blast_record,
                           e_value_threshold=e_value_thresh)
def crisprSingle(item, query_virus_dir, output_dir, numThreads):
    """Search one query file against the host CRISPR database.

    blastn (``blastn-short`` task) writes a three-column table
    ``qacc sacc evalue`` to ``<output_dir>/<query_name>.crispr`` where
    ``query_name`` is ``item`` up to its first dot.

    Args:
        item: File name of the query inside ``query_virus_dir``.
        query_virus_dir: Directory holding the query files.
        output_dir: Directory receiving the ``.crispr`` table.
        numThreads: Value for blastn's ``num_threads``.

    Returns:
        ``(False, None)`` when the table is empty or no subject accession
        contains exactly two ``|`` characters; otherwise ``(True, frame)``
        where ``frame`` holds, per subject, the maximum ``-log(evalue)``
        in a single row indexed by the query name.
    """
    query_name = item.split('.')[0]
    query_file = os.path.join(query_virus_dir, item)
    output_file = os.path.join(output_dir, query_name) + '.crispr'

    search = NcbiblastnCommandline(
        query=query_file,
        db=db_host_crispr_prefix,
        out=output_file,
        outfmt="6 qacc sacc evalue",
        evalue=1,
        gapopen=10,
        penalty=-1,
        gapextend=2,
        word_size=7,
        dust='no',
        task='blastn-short',
        perc_identity=90,
        num_threads=numThreads)
    search()

    # Nothing written means nothing was hit.
    if os.stat(output_file).st_size == 0:
        return False, None

    hits = pd.read_table(output_file, header=None)
    # Sanity check on the accession layout: exactly two "|" separators.
    well_formed = hits[1].apply(lambda acc: acc.count("|")) == 2
    hits = hits[well_formed]
    if hits.shape[0] == 0:
        return False, None

    hits[0] = query_name
    # Keep the second-to-last "|"-separated field of the accession.
    hits[1] = hits[1].apply(lambda acc: acc.split('|')[-2])
    hits[2] = -hits[2].apply(math.log)
    best_scores = hits.groupby([0, 1]).max().unstack(fill_value=0)
    return True, best_scores.set_index([[query_name]])
def blastn(blast_cmd, query, db, blastout, threads, max_hsps, max_target_seqs, perc_qcov, perc_ident):
    """Search the prophage or plasmid database and summarise the first hit.

    Args:
        blast_cmd: Path or name of the blastn executable.
        query: Query FASTA file.
        db: Either ``'Prophage'`` or ``'Plasmid'``; selects the database.
        blastout: Path of the tabular report to write.
        threads: Value for ``num_threads``.
        max_hsps: Value for ``max_hsps``.
        max_target_seqs: Value for ``max_target_seqs``.
        perc_qcov: Value for ``qcov_hsp_perc``.
        perc_ident: Value for ``perc_identity``.

    Returns:
        ``(database, subject, identity, length, coverage, subject_length)``
        taken from the first report line (columns 1, 2, 3, 4 and 6), as
        strings. All six entries are ``'NA'`` when the report is empty.

    Raises:
        ValueError: If ``db`` is not one of the two supported names.
    """
    evalue = 1e-5
    blast_fmt = "'6 qseqid stitle pident length qcovhsp qlen slen qstart qend sstart send evalue bitscore'"
    if db == 'Prophage':
        blast_db = db_prophage
        blast_database = 'PHASTER'
    elif db == 'Plasmid':
        blast_db = db_plasmid
        blast_database = 'Refseq_plasmid'
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on blast_db.
        raise ValueError(
            "Unknown db %r; expected 'Prophage' or 'Plasmid'" % (db,))

    blastn_out = NcbiblastnCommandline(
        cmd=blast_cmd,
        query=query,
        db=blast_db,
        evalue=evalue,
        outfmt=blast_fmt,
        out=blastout,
        num_threads=threads,
        max_hsps=max_hsps,
        max_target_seqs=max_target_seqs,
        qcov_hsp_perc=perc_qcov,
        perc_identity=perc_ident)
    blastn_out()

    if os.path.getsize(blastout) > 0:
        # Only the first line of the report is used.
        with open(blastout, 'r') as report:
            items_blast = report.readline().split('\t')
        blast_subject = items_blast[1]
        blast_ident = items_blast[2]
        blast_length = items_blast[3]
        blast_cov = items_blast[4]
        blast_slength = items_blast[6]
    else:
        # No hit: the database label is reported as 'NA' as well.
        blast_database = blast_subject = blast_ident = blast_length = blast_cov = blast_slength = 'NA'
    return blast_database, blast_subject, blast_ident, blast_length, blast_cov, blast_slength
def locate(genome, gene_db):
    """Find the regions of ``genome`` that match sequences in ``gene_db``.

    Both inputs are written to temporary FASTA files named after a
    generated prefix, blastn is run in subject mode (e-value 1e-10) and
    hits with an alignment length above 100 are kept.

    Args:
        genome: Record(s) accepted by ``SeqIO.write``; used as the query.
        gene_db: Record(s) accepted by ``SeqIO.write``; used as subject.

    Returns:
        A sorted, merged ``pybedtools.BedTool`` of
        ``[qseqid, qstart - 1, qend]`` intervals, or ``None`` when no hit
        passed the length filter.
    """
    prefix = generate_string()
    query_fn = '{}.query.fasta'.format(prefix)
    sbjct_fn = '{}.sbjct.fasta'.format(prefix)
    try:
        SeqIO.write(genome, query_fn, 'fasta')
        SeqIO.write(gene_db, sbjct_fn, 'fasta')
        blastn_cline = NcbiblastnCommandline(
            query=query_fn,
            subject=sbjct_fn,
            evalue=1e-10,
            outfmt="'6 qseqid qstart qend length'")
        stdout, _stderr = blastn_cline()
    finally:
        # Remove the temporary files even when BLAST fails.
        # (presumably clean() deletes the files created for this prefix -
        # verify against its definition)
        clean(prefix)

    # Columns: qseqid, qstart, qend, length. Subtract 1 from qstart so the
    # interval start is zero-based.
    intervals = [
        [row[0], int(row[1]) - 1, int(row[2])]
        for row in csv.reader(stdout.splitlines(), delimiter='\t')
        if int(row[-1]) > 100
    ]
    if intervals:
        # d=-100: deal with overlaps no more than 100 bp
        return pybedtools.BedTool(intervals).sort().merge(d=-100)
    return None
def Runs_local_BLASTn_search(My_Query, BLAST_db_path, BLAST_db_folder_name, BLAST_db_name):
    """Run blastn for ``My_Query`` against a local database.

    The query is first saved through
    ``Puts_the_query_in_a_text_doc_so_BLAST_can_use_it`` and read back from
    ``<BLAST_db_path>/temporary_file.txt``. The report goes to
    ``<BLAST_db_path>/my_xml.xml``; no ``outfmt`` is passed, so blastn's
    default output format is used. The temporary query file is deleted
    afterwards.

    Args:
        My_Query: Query handed to the helper that writes the temp file.
        BLAST_db_path: Base directory (backslashes are converted).
        BLAST_db_folder_name: Sub-folder of the base directory holding the
            database.
        BLAST_db_name: Database name inside that folder.
    """
    Puts_the_query_in_a_text_doc_so_BLAST_can_use_it(My_Query, BLAST_db_path)

    # The command line is built from forward-slash paths only.
    base_dir = Changes_backslashes_to_forward_slashes(BLAST_db_path)
    db_name = Changes_backslashes_to_forward_slashes(BLAST_db_name)

    query_file = base_dir + "/" + "temporary_file.txt"
    database = base_dir + "/" + BLAST_db_folder_name + "/" + db_name
    report_file = base_dir + "/my_xml.xml"

    search = NcbiblastnCommandline(
        query=query_file,
        db=database,
        out=report_file)
    search()

    # Remove the temporary text file again
    Deletes_the_temporary_file(BLAST_db_path)
    return
def blast(FastaFile, BlastDB, perID, SeqLen):
    """BLAST every record of a FASTA file and export the matching regions.

    Each record is written to a temporary FASTA file and searched against
    ``BlastDB`` (e-value 1e-10, XML output). For every HSP whose
    ``positives / align_length`` percentage is at least ``int(perID)``,
    ``parse_seq`` supplies upstream, downstream and full sequences, which
    are written per record through ``print_out`` with the suffixes
    ``_upstream.fasta``, ``_downstream.fasta`` and ``_fullseq.fasta``.

    Args:
        FastaFile: Path of the input FASTA file.
        BlastDB: Database passed to blastn as ``db``.
        perID: Minimum percentage (converted with ``int``).
        SeqLen: Passed through to ``parse_seq``.
    """
    with open(FastaFile, "r") as Fasta_Handle:
        for Record in SeqIO.parse(Fasta_Handle, "fasta"):
            # Temporary FASTA input and BLAST XML output. Text mode is
            # required because a str is written below.
            with tempfile.NamedTemporaryFile(mode="w+") as TempFasta, \
                    tempfile.NamedTemporaryFile(mode="w+") as TempBlastXML:
                TempFasta.write(">%s\n%s\n" % (Record.id, Record.seq))
                # Push the data to disk; blastn reads the file by name and
                # would otherwise see an empty or partial file.
                TempFasta.flush()

                Blast_Command = NcbiblastnCommandline(
                    query=TempFasta.name,
                    db=BlastDB,
                    evalue=1e-10,
                    out=TempBlastXML.name,
                    outfmt=5)
                Blast_Command()

                UpList = []
                DownList = []
                FullSeqList = []
                Headers = []
                with open(TempBlastXML.name) as Result_Handle:
                    for Blast_Record in NCBIXML.parse(Result_Handle):
                        for Alignment in Blast_Record.alignments:
                            for Hsp in Alignment.hsps:
                                Hsp_perID = (
                                    (float(Hsp.positives) /
                                     float(Hsp.align_length)) * 100)
                                if Hsp_perID < int(perID):
                                    continue
                                UpStream, DownStream, FullSeq = parse_seq(
                                    Blast_Record, Hsp, Record, SeqLen)
                                UpList.append(UpStream)
                                DownList.append(DownStream)
                                FullSeqList.append(FullSeq)
                                # Header: subject title | identity % | query start
                                Sbjct_Edit = Alignment.title.replace(
                                    ' No definition line', '')
                                Headers.append(
                                    str(Sbjct_Edit) + '|' +
                                    str(round(Hsp_perID, 1)) + '%' + '|' +
                                    str(Hsp.query_start))

            # One set of output files per input record
            print_out(UpList, Headers, Record.id, '_upstream.fasta')
            print_out(DownList, Headers, Record.id, '_downstream.fasta')
            print_out(FullSeqList, Headers, Record.id, '_fullseq.fasta')
def blast_func(samplecfg):
    """Build the BLAST index if needed, run blastn and return the log.

    The index is (re)built with ``makeblastdb`` unless the ``.nin``,
    ``.nhr`` and ``.nsq`` files all exist next to ``samplecfg.genome``.
    The search report is written to
    ``<mydir>/blast_<sample>.<suffix>.txt``.

    Args:
        samplecfg: Object providing ``genome``, ``fasta``, ``view``,
            ``evalue``, ``sample`` and ``suffix``.

    Returns:
        A list starting with ``'BLAST'`` followed by section titles and the
        stdout/stderr text of each tool that was run.
    """
    blastlog = ['BLAST']

    # Create the index if any of its files is missing
    index_suffixes = ('.nin', '.nhr', '.nsq')
    if not all(os.path.exists(samplecfg.genome + suffix)
               for suffix in index_suffixes):
        makedb = NcbimakeblastdbCommandline(cmd='makeblastdb',
                                            dbtype='nucl',
                                            input_file=samplecfg.genome)
        db_stdout, db_stderr = makedb()
        blastlog.extend(['\n\nNcbimakeblastdb\n\n', db_stdout, db_stderr])

    # Run the search itself
    report_path = (mydir + '/blast_' + samplecfg.sample + '.' +
                   samplecfg.suffix + '.txt')
    sampleblast = NcbiblastnCommandline(task='blastn',
                                        query=samplecfg.fasta,
                                        db=samplecfg.genome,
                                        outfmt=samplecfg.view,
                                        evalue=samplecfg.evalue,
                                        out=report_path)
    blast_stdout, blast_stderr = sampleblast()
    blastlog.extend(['\n\nNcbiblastn\n\n', blast_stdout, blast_stderr])
    return blastlog
def run_blastn(query, database, evalue, max_seqs, max_hsps):
    """Run BLASTn against a database and return the XML report path.

    The report is written to the current working directory as
    ``<query stem>_<database stem>_BLAST.xml``.

    Args:
        query (str): Path of the query FASTA file.
        database (str): BLAST database, checked with ``validate_db``.
        evalue (float): E-value cut-off.
        max_seqs (int): Value for ``max_target_seqs``.
        max_hsps (int): Value for ``max_hsps``.

    Returns:
        Path of the XML report.

    Raises:
        TypeError: If an argument has the wrong type.
        SystemExit: If ``validate_db`` raises ``FileNotFoundError``.
        BLASTFailedError: If no report exists after the run.
    """
    # Explicit checks instead of assert: assertions vanish under
    # ``python -O``, which silently disabled this validation.
    expected_types = (
        ('query', query, str),
        ('database', database, str),
        ('evalue', evalue, float),
        ('max_seqs', max_seqs, int),
        ('max_hsps', max_hsps, int),
    )
    for name, value, expected in expected_types:
        if not isinstance(value, expected):
            raise TypeError('%s must be %s, not %s' % (
                name, expected.__name__, type(value).__name__))

    # Create an output name
    query_base = os.path.basename(os.path.splitext(query)[0])
    database_base = os.path.basename(os.path.splitext(database)[0])
    blast_out = os.getcwd() + '/' + query_base + '_' + database_base + '_BLAST.xml'

    try:
        validate_db(database)
    except FileNotFoundError as error:
        sys.exit(error)

    # Setup BLASTn
    blastn = NcbiblastnCommandline(query=query,
                                   db=database,
                                   evalue=evalue,
                                   outfmt=5,
                                   max_target_seqs=max_seqs,
                                   max_hsps=max_hsps,
                                   out=blast_out)
    # Run BLASTn, echoing the command to stderr first
    print(blastn, file=sys.stderr)
    blastn()
    if not os.path.exists(blast_out):
        raise BLASTFailedError
    return blast_out
def Blast_seq(mirna):
    """BLAST one miRNA sequence against the ``human_mirna`` database.

    The sequence is written to ``mirna.fasta`` (header ``refseq_1``), any
    previous ``blast_result.csv`` is removed and blastn is run with CSV
    output (outfmt 10).

    Args:
        mirna: Sequence; converted with ``str``.

    Returns:
        A list with the second column of every CSV line (the first column
        after the query id), or ``None`` when there are no hits or the
        report cannot be read.
    """
    with open('mirna.fasta', 'w+') as f:
        f.write('>' + 'refseq_1' + '\n' + str(mirna))
    # Remove a stale report so old hits cannot be mistaken for new ones.
    if os.path.isfile('blast_result.csv'):
        os.remove('blast_result.csv')

    blastn_cline = NcbiblastnCommandline(query='mirna.fasta',
                                         db="human_mirna",
                                         evalue=0.1,
                                         outfmt=10,
                                         out="blast_result.csv",
                                         word_size=7,
                                         gapopen=5,
                                         gapextend=2,
                                         strand='both')
    blastn_cline()

    try:
        with open('blast_result.csv', 'r') as f:
            report = f.read()
    except (OSError, UnicodeDecodeError):
        # Missing or unreadable report is treated as "no hits". Catching
        # only I/O and decoding errors keeps real bugs visible.
        return None

    list_of_mirna = [
        line.split(',')[1] for line in report.splitlines() if ',' in line
    ]
    return list_of_mirna or None
def get_homology_count(query_seq_list, db='TAIR10_Whole_Genome', word_size=11, gap_open=5, gape_extend=2, reward=2, penalty=-3, num_threads=core) -> list:
    """Count the blastn hits of each query sequence against ``db``.

    Args:
        query_seq_list: Sequences to search, one BLAST run each.
        db: Database name inside ``db_file_path``; it is created through
            ``create_db`` when its ``.nin`` or ``.nhr`` file is missing.
        word_size: Value for ``word_size``.
        gap_open: Value for ``gapopen``.
        gape_extend: Value for ``gapextend``.
        reward: Value for ``reward``.
        penalty: Value for ``penalty``.
        num_threads: Value for ``num_threads``.

    Returns:
        One entry per query: the number of lines in the tabular
        (outfmt 6) output. When ``check_usable()`` is false, a list of
        ``'Cannot be used'`` strings of the same length.

    Raises:
        RuntimeError: If the database is missing and cannot be created.
    """
    if not check_usable():
        return ['Cannot be used'] * len(query_seq_list)

    index_missing = (
        not os.path.exists(os.path.join(db_file_path, db + '.nin'))
        or not os.path.exists(os.path.join(db_file_path, db + '.nhr'))
    )
    if index_missing:
        is_created = create_db(db_name=db)
        if not is_created:
            from backend.utils.log import custom_logger
            custom_logger.error({
                'action': 'create_db',
                'status': 'failed'
            })
            # A bare ``raise`` has no active exception here and only
            # produced "No active exception to reraise"; raise the same
            # exception type with a useful message instead.
            raise RuntimeError(
                'Could not create BLAST database %r' % (db,))

    db_path = os.path.join(db_file_path, db)
    homology_list = []
    for query_seq in query_seq_list:
        # presumably write_fasta() stores the sequence at query_path -
        # verify against its definition
        write_fasta(query_seq)
        out = NcbiblastnCommandline(db=db_path,
                                    query=query_path,
                                    word_size=word_size,
                                    gapopen=gap_open,
                                    gapextend=gape_extend,
                                    reward=reward,
                                    penalty=penalty,
                                    outfmt=6,
                                    num_threads=num_threads)()[0]
        # One output line per hit
        homology_list.append(len(out.splitlines()))
    return homology_list
def query(self, query_dir: Path, config: dict, blast_format: str, headers: tuple) -> pd.DataFrame:
    """Query every file of a directory against this database.

    The directory content is aggregated into ``blast_query.fasta``, blastn
    is run through the shell and its stdout is parsed as CSV. The
    aggregated file is removed afterwards, also on failure.

    Args:
        query_dir (Path): Directory containing query files.
        config (dict): Extra blastn options.
        blast_format (str): Value for blastn's ``outfmt``.
        headers (tuple): Column names for the resulting DataFrame.

    Raises:
        TypeError: If ``query_dir`` is not a Path or ``config`` not a dict.
        FileNotFoundError: If ``query_dir`` is missing or not a directory.
        ValueError: If ``config`` contains an option this method sets
            itself or forbids.
        subprocess.SubprocessError: If blastn wrote to stderr anything
            other than the "Examining 5 or more matches" warning.

    Returns:
        (pd.DataFrame): Query results.
    """
    if not isinstance(query_dir, Path):
        raise TypeError("Given object is not Path object")
    if not query_dir.exists():
        raise FileNotFoundError("Given path does not exist")
    if not query_dir.is_dir():
        raise FileNotFoundError("Given path is not directory")
    if not isinstance(config, dict):
        raise TypeError("Config file is not a dict object")

    reserved = ('query', 'db', 'outfmt', 'max_target_seqs', 'num_alignments')
    clashes = [option for option in reserved if option in config]
    if clashes:
        raise ValueError(
            "Given kwargs are not valid in terms of blast usage", clashes)

    query_file = Path("blast_query.fasta")
    self._aggregate(query_dir, query_file)
    try:
        cmd = NcbiblastnCommandline(query="blast_query.fasta",
                                    db=f"{self.name}",
                                    outfmt=blast_format,
                                    **config)
        blastn_output = subprocess.run(str(cmd),
                                       capture_output=True,
                                       shell=True)
        if blastn_output.stderr:
            stderr_text = blastn_output.stderr.decode()
            # This particular warning is harmless; anything else is an error.
            if "Examining 5 or more matches" not in stderr_text:
                raise subprocess.SubprocessError(
                    f"Blastn returned error: {stderr_text}")
    finally:
        if query_file.exists():
            query_file.unlink()

    results_df: pd.DataFrame = pd.read_csv(
        io.StringIO(blastn_output.stdout.decode()),
        header=None,
        names=headers)
    return results_df
def align(self, query, queryName):
    """BLAST ``query`` against every database found below ``self.dbPath``.

    A folder ``<self.outputPath>/<queryName>`` is created for the results.
    Every file whose extension equals ``self.outputFormat`` is treated as
    a database (its path without the extension is passed to blastn), and
    one XML report ``<queryName>_<n>`` is written per database.

    Args:
        query: Path of the query file passed to blastn.
        queryName: Name used for the result folder and report files.

    NOTE(review): the nesting below was reconstructed from a collapsed
    source line; the search is assumed to run only for matching files.
    """
    self.alignPath = self.outputPath + "/" + queryName
    self.createFolder(self.alignPath)
    print("alignPath " + self.alignPath)
    print("query " + query)
    print("queryName " + queryName)
    i = 0
    print("dbpath es " + self.dbPath)
    for bases, dirs, files in os.walk(self.dbPath):
        for file in files:
            # Split "name.ext" at the LAST dot. Files without any dot are
            # skipped; they used to raise IndexError.
            fileName, dot, fileFormat = file.rpartition('.')
            if not dot or fileFormat != self.outputFormat:
                continue
            # NOTE(review): strips the last six characters, which matches
            # a ".fasta" extension only - confirm for other formats.
            self.dbName = file[:-6]
            # One output file is produced for each database.
            dbPath = bases + '/' + fileName
            output = self.alignPath + '/' + queryName + "_" + str(i)
            # The database already exists; build the command that searches
            # the query sequence in it and writes the report.
            print(output)
            blastnCline = NcbiblastnCommandline(query=query,
                                                db=dbPath,
                                                evalue=0.001,
                                                outfmt=5,
                                                out=output,
                                                word_size=11)
            print(blastnCline)
            stdout, stderr = blastnCline()
            i = i + 1
def build_commandline(self, query): """Build the command line based on the arguments that were provided. If local database is provided, will create a command line based on the Ncbi____Commandline function, based on which program was specified.""" # We build a dictionary of command lines, which we will use to select # the command to run. command_dict = { 'blastn': NcbiblastnCommandline(query=self.blastin.name, out=self.blastout.name, db=self.db, evalue=self.evalue, outfmt=5, max_target_seqs=self.maxhits), 'tblastx': NcbitblastxCommandline(query=self.blastin.name, out=self.blastout.name, db=self.db, evalue=self.evalue, outfmt=5, max_target_seqs=self.maxhits) } if not self.web: # Write the contents of the query sequence into the temp FASTA # file. Unfortunately, command line BLAST only accepts input # files and not sequences SeqIO.write(query, self.blastin, 'fasta') self.commandline = command_dict[self.prog] self.blastin.close() return
def blastn_analyze():
    """Run blastn for the query in the request and return the raw report.

    Request arguments read: ``query`` (sequence text), ``db`` (database
    name appended to ``UPLOAD_FOLDER``) and ``outputFormat`` (passed to
    blastn as ``outfmt``). The report is written to
    ``TMP_RESULT_FOLDER + 'tmp_res'`` and returned as one string.
    """
    import tempfile

    # mkstemp guarantees a unique file; random.randint(1, 99999) could
    # hand two concurrent requests the same name.
    fd, tmpFileName = tempfile.mkstemp(prefix="query-", dir=TMP_QUERY_FOLDER)
    try:
        with os.fdopen(fd, "w") as tmpFile:
            tmpFile.write(request.args.get('query'))

        # NOTE(review): 'db' and 'outputFormat' are untrusted request
        # values that end up in a file path and on a command line -
        # validate them against the allowed databases/formats.
        db = UPLOAD_FOLDER + request.args.get('db')
        outFormat = request.args.get('outputFormat')
        # NOTE(review): the report name is shared by all requests, so
        # concurrent calls can overwrite each other's results.
        result_path = TMP_RESULT_FOLDER + 'tmp_res'

        blastn_cline = NcbiblastnCommandline(query=tmpFileName,
                                             db=db,
                                             evalue=50.0,
                                             outfmt=outFormat,
                                             out=result_path)
        blastn_cline()

        with open(result_path) as f:
            return f.read()
    finally:
        # Remove the query file whether or not BLAST succeeded.
        os.remove(tmpFileName)
def get_top_blast_hit(query, database):
    """
    Return the aligned query sequence of the first HSP BLAST reports.

    Only the first record of the XML output is examined; within it, the
    first HSP found while walking the alignments in order is used.

    :param query: Query sequence. Expects an assembly in FASTA format.
    :param database: Database passed to blastn as ``db``.
    :return: The HSP's query string, reverse complemented when
        ``align_length - sbjct_start`` is below 1, or 'NA' without hits.
    """
    blastn = NcbiblastnCommandline(db=database, outfmt=5, query=query)
    stdout, stderr = blastn()

    for record in NCBIXML.parse(StringIO(stdout)):
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                if hsp.align_length - hsp.sbjct_start < 1:
                    return str(Seq(hsp.query).reverse_complement())
                return str(hsp.query)
        # Later records are never looked at.
        break
    return 'NA'
def run_blast(species_id_path):
    """BLAST the shared query against one species database.

    The search writes a BLAST archive (outfmt 11) which is then converted
    to a tabular text report (outfmt 7) and an XML report (outfmt 5).
    Nothing is run when the XML report already exists. The database is
    built through ``blastdb`` first when its ``.ndb`` file is missing.

    Args:
        species_id_path: Path whose ``stem`` names the species; it is used
            for the database and all report file names.
    """
    blastn_exe = '/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blastn'
    formatter_exe = '/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blast_formatter'

    species = species_id_path.stem
    asn_report = species_out_asn_dir / (species + ".asn")
    xml_report = species_out_xml_dir / (species + ".xml")
    txt_report = species_out_txt_dir / (species + ".txt")

    search = NcbiblastnCommandline(
        cmd=blastn_exe,
        query=query_file,
        db=species_db_dir / species,
        outfmt=11,
        out=asn_report)
    to_xml = NcbiblastformatterCommandline(
        archive=asn_report,
        outfmt=5,
        out=xml_report,
        cmd=formatter_exe)
    to_txt = NcbiblastformatterCommandline(
        archive=asn_report,
        outfmt=7,
        out=txt_report,
        cmd=formatter_exe)

    # The XML report marks a species as already processed.
    if xml_report.exists():
        return

    if not (species_db_dir / (species + ".ndb")).exists():
        blastdb(species_id_path)
    search()
    to_txt()
    to_xml()
def passed(candidate_seq, blastn_db, evalue, tmpdir):
    """
    Decide whether a candidate probe passes the uniqueness check.

    The candidate is saved as FASTA, searched with blastn (tabular output,
    outfmt 6) and the result file is judged by ``is_unique_mapped``.

    Parameters
    ----------
    candidate_seq : `Seq`
        Candidate probe sequence; its ``name`` and ``sub_range`` form the
        temporary file names ``<name>_<start>-<end>``.
    blastn_db : str
        Path to the blastn database.
    evalue : float
        Value for blastn's '-evalue' parameter.
    tmpdir : str
        Directory for the temporary FASTA and result files; created when
        it does not exist.

    Returns
    -------
    Whatever ``is_unique_mapped`` returns for the result file.
    """
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    label = candidate_seq.name + "_" + "{}-{}".format(*candidate_seq.sub_range)
    fasta_path = os.path.join(tmpdir, label + ".fa")
    hits_path = os.path.join(tmpdir, label + ".tsv")

    save_fasta([candidate_seq], fasta_path)
    search = NcbiblastnCommandline(query=fasta_path,
                                   db=blastn_db,
                                   evalue=evalue,
                                   outfmt=6,
                                   out=hits_path)
    search()
    return is_unique_mapped(hits_path)
def run_blastn(query, subject, evalue, max_hits, max_hsps, identity, keep_query):
    """Run BLASTn against a subject sequence file.

    The report name is ``<cwd>/<query stem>_<subject stem>_BLAST.xml``.

    Args:
        query (str): Path of the query FASTA file.
        subject (str): Path of the subject FASTA file.
        evalue (float): E-value cut-off.
        max_hits (int): Value for ``max_target_seqs``.
        max_hsps (int): Value for ``max_hsps``.
        identity (float): Value for ``perc_identity``.
        keep_query (bool): Forwarded to the runner.

    Returns:
        Whatever the runner returns for the finished search.

    Raises:
        TypeError: If an argument has the wrong type.
    """
    # Explicit checks instead of assert: assertions vanish under
    # ``python -O``, which silently disabled this validation.
    expected_types = (
        ('query', query, str),
        ('subject', subject, str),
        ('evalue', evalue, float),
        ('max_hits', max_hits, int),
        ('max_hsps', max_hsps, int),
        ('identity', identity, float),
        ('keep_query', keep_query, bool),
    )
    for name, value, expected in expected_types:
        if not isinstance(value, expected):
            raise TypeError('%s must be %s, not %s' % (
                name, expected.__name__, type(value).__name__))

    # Create an output name
    print("Running BLAST against subject:", subject, file=sys.stderr)
    query_base = os.path.basename(os.path.splitext(query)[0])
    db_base = os.path.basename(os.path.splitext(subject)[0])
    blast_out = os.getcwd() + '/' + query_base + '_' + db_base + '_BLAST.xml'

    # Setup BLASTn
    blastn = NcbiblastnCommandline(
        query=query,
        subject=subject,
        evalue=evalue,
        outfmt=5,
        max_target_seqs=max_hits,
        max_hsps=max_hsps,
        perc_identity=identity,
        out=blast_out
    )
    # Run BLASTn
    # NOTE(review): this calls ``run_blastn`` with keywords this function
    # does not accept; presumably a differently-scoped runner of the same
    # name is intended - confirm which one is in scope here.
    outfile = run_blastn(cline=blastn, keep_query=keep_query)
    return outfile
def run_blastn(ref_db, query, cpu, max_targets):
    """
    Perform blastn using biopython and return the XML output as a handle.

    :param ref_db: Database passed to blastn as ``db`` ("makeblastdb" must
        already have been run for it)
    :param query: Query fasta file
    :param cpu: Number of threads
    :param max_targets: Maximum targets returned by blast
    :return: ``StringIO`` wrapping the XML (outfmt 5) written to stdout
    :raises Exception: If blastn wrote to stderr and the text does not
        contain 'Warning'
    """
    search = NcbiblastnCommandline(db=ref_db,
                                   query=query,
                                   evalue='1e-10',
                                   outfmt=5,
                                   max_target_seqs=max_targets,
                                   num_threads=cpu)
    xml_text, messages = search()

    # Warnings are tolerated; any other stderr output is fatal.
    if messages and 'Warning' not in messages:
        raise Exception(
            'There was a problem with the blast:\n{}'.format(messages))

    return StringIO(xml_text)
def blast_alleles(self, runmetadata, amino_acid):
    """
    Run one BLAST search per sample against the combined targets.

    blastp is used for amino acid queries, blastn (task 'blastn')
    otherwise. Each report is written to ``sample.alleles.blast_report``.

    :param runmetadata: Object whose ``samples`` holds the metadata object
        of each query
    :param amino_acid: Boolean of whether the query sequence is amino acid
        or nucleotide
    """
    logging.info('Running BLAST analyses')
    for sample in runmetadata.samples:
        # Options common to both programs; the database prefix is the
        # combined targets path without its extension.
        options = dict(
            db=os.path.splitext(self.combined_targets)[0],
            query=sample.general.bestassemblyfile,
            num_alignments=100000000,
            evalue=0.001,
            num_threads=self.cpus,
            outfmt=self.outfmt,
            out=sample.alleles.blast_report,
        )
        if amino_acid:
            search = NcbiblastpCommandline(**options)
        else:
            search = NcbiblastnCommandline(task='blastn', **options)
        search()
def blast(db_file, query_file, blast_out_xml_file, blast_out_asn_file, blast_out_txt_file):
    """Run blastn once and render the result as XML and tabular text.

    The search writes a BLAST archive (outfmt 11); ``blast_formatter``
    then converts that archive to XML (outfmt 5) and to commented tabular
    text (outfmt 7), in that order.

    Args:
        db_file: Database passed to blastn as ``db``.
        query_file: Query FASTA file.
        blast_out_xml_file: Destination of the XML report.
        blast_out_asn_file: Destination of the archive.
        blast_out_txt_file: Destination of the text report.
    """
    blastn_exe = '/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blastn'
    formatter_exe = '/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.1+/bin/blast_formatter'

    NcbiblastnCommandline(
        cmd=blastn_exe,
        query=query_file,
        db=db_file,
        outfmt=11,
        out=blast_out_asn_file)()

    # Each conversion is built and executed before the next one starts.
    conversions = ((5, blast_out_xml_file), (7, blast_out_txt_file))
    for format_code, destination in conversions:
        NcbiblastformatterCommandline(
            archive=blast_out_asn_file,
            outfmt=format_code,
            out=destination,
            cmd=formatter_exe)()
def blast_fasta(args, iteration):
    """Return the parsed BLAST records for one transcript file.

    When ``args.blast_xml`` is set, the existing report
    ``args.blast_xml[iteration]`` is parsed. Otherwise
    ``args.transcripts[iteration]`` is searched against the eight
    ``refseq_rna.0N`` volumes and the new report
    ``<args.outfolder>/blast_out_<iteration>.xml`` is parsed.

    Args:
        args: Namespace providing ``blast_xml``, ``outfolder`` and
            ``transcripts``.
        iteration: Index into the per-iteration lists.

    Returns:
        The lazy record iterator from ``NCBIXML.parse``; the underlying
        file stays open while it is consumed.
    """
    # Space-separated list of database volumes, wrapped in double quotes.
    volumes = "".join(
        "/Users/kxs624/Documents/data/NCBI_RNA_database/refseq_rna.0{0} ".format(i)
        for i in range(8))
    database = "\"" + volumes + "\""
    print(database)

    if args.blast_xml:
        return NCBIXML.parse(open(args.blast_xml[iteration], 'r'))

    print("blasting")
    out_file = os.path.join(args.outfolder,
                            'blast_out_{0}.xml'.format(iteration))
    blastn_cline = NcbiblastnCommandline(query=args.transcripts[iteration],
                                         db=database,
                                         evalue=0.001,
                                         outfmt=5,
                                         out=out_file,
                                         max_target_seqs=20)
    print(blastn_cline)
    stdout, stderr = blastn_cline()
    print(stdout, stderr)
    print("Done")
    return NCBIXML.parse(open(out_file, 'r'))