def blastBACTEUK(arg):
    """BLAST every FASTA record in *arg* against prokaryotes and eukaryotes.

    Each record is submitted twice to NCBI ``blastx`` against ``nr``: once
    restricted to Bacteria/Archaea and once restricted to Eukaryota.  For each
    search the e-value of the first HSP of the first alignment (or ``no hit``)
    is appended to ``bacterial.txt`` / ``eukaryotic.txt`` as ``id,value``.
    A record whose processing raises is reported in ``errorlog.txt`` and the
    loop moves on to the next record.

    Args:
        arg: Path of the FASTA file to read.
    """

    def top_hit_record(record, entrez_query):
        # Run one taxon-restricted search and parse its single XML record.
        handle = NCBIWWW.qblast("blastx", "nr", record.format("fasta"),
                                ncbi_gi=False, descriptions="1",
                                alignments="1", format_type="XML",
                                hitlist_size="1", entrez_query=entrez_query)
        try:
            return NCBIXML.read(handle)
        finally:
            handle.close()

    def result_line(name, blast_record):
        # One CSV line: the top HSP's e-value, or a "no hit" marker.
        if blast_record.descriptions:
            expect = blast_record.alignments[0].hsps[0].expect
            return name + ',' + str(expect) + '\n'
        return name + ', no hit' + '\n'

    with open('bacterial.txt', 'a') as out, \
            open('eukaryotic.txt', 'a') as out2, \
            open(arg) as fasta_handle:
        for record in SeqIO.parse(fasta_handle, format="fasta"):
            try:
                name = record.id
                bacterial = top_hit_record(
                    record, '(Bacteria[ORGN] OR Archaea[ORGN])')
                eukaryotic = top_hit_record(record, '(Eukaryota[ORGN])')
                if bacterial.descriptions:
                    print(record.id)
                out.write(result_line(name, bacterial))
                out2.write(result_line(name, eukaryotic))
            except Exception:
                # Best effort: note the failing record and keep going.  The
                # original wrote through a misspelled name here.
                with open('errorlog.txt', 'a') as errorout:
                    errorout.write('problem blasting ' + record.id + '\n')
def fetch_indentity_from_local(seq):
    """Return the IDs of local human/rodent hits whose HSP has no similar-only positions.

    The sequence is written to ``tmp.fastaa`` and searched with ``blastp``
    against ``_data_/_db_/HUMAN_DB`` and ``_data_/_db_/RODENTS_DB``.  For
    every HSP whose ``positives`` count equals its ``identities`` count, an
    ID is extracted from the alignment title.

    Args:
        seq: Protein sequence as a plain string.

    Returns:
        The extracted IDs joined with ``;`` (human hits first, then rodent);
        an empty string when nothing qualifies.
    """

    def extract_prot_id(string):
        # Third '|'-separated field, then its second space-separated token.
        return string.split('|')[2].split(' ')[1]

    searches = (
        ('_data_/_db_/HUMAN_DB', 'blastp_human_output.xml'),
        ('_data_/_db_/RODENTS_DB', 'blastp_rodents_output.xml'),
    )

    record = SeqRecord(Seq(seq), id="tmp", name="", description="")
    SeqIO.write(record, "tmp.fastaa", "fasta")
    for db_path, xml_path in searches:
        NcbiblastpCommandline(query='tmp.fastaa', db=db_path, outfmt=5,
                              out=xml_path)()

    result = []
    for _db_path, xml_path in searches:
        # Close each report as soon as it is parsed (previously leaked).
        with open(xml_path) as result_handle:
            b_record = NCBIXML.read(result_handle)
        for alignment in b_record.alignments:
            for hsp in alignment.hsps:
                if hsp.positives == hsp.identities:
                    result.append(extract_prot_id(alignment.title))
    return ";".join(result)
def blast_bulk(fasta_file, settings):
    """BLAST each sequence of a FASTA file online, one request per sequence.

    Args:
        fasta_file: Path or handle accepted by ``SeqIO.parse``.
        settings: Indexable where ``[0]`` is the BLAST program, ``[1]`` the
            database, ``[2]`` the hit-list size and ``[3]`` the megablast flag.

    Returns:
        A list with one parsed BLAST record per input sequence, in file order.
    """
    # Biopython is imported lazily, only when a search is actually requested.
    from Bio import SeqIO
    from Bio.Blast import NCBIWWW, NCBIXML

    # Read every record up front so parse errors surface before any request.
    records = list(SeqIO.parse(fasta_file, 'fasta'))

    blast_list = []
    for seq in records:
        print(seq)
        response = NCBIWWW.qblast(
            settings[0],
            settings[1],
            seq.format('fasta'),
            megablast=settings[3],
            hitlist_size=settings[2],
        )
        blast_list.append(NCBIXML.read(response))
    return blast_list
def blastTranscript(transcript, blastDB, seqfile):
    """BLAST one protein against a local database and name the hit sources.

    The records of *seqfile* selected by *transcript* are written to
    ``temp<transcript>.fa`` and searched with ``blastp`` (e-value 1e-10,
    XML output in ``blast<transcript>.xml``).  For every HSP below the
    threshold, the alignment title is reduced to the part of its second
    space-separated token that precedes the first ``_``.  Both temporary
    files are removed afterwards.

    Args:
        transcript: Transcript identifier; surrounding whitespace is stripped.
        blastDB: BLAST database name; a ``.phr`` suffix is dropped.
        seqfile: FASTA file holding the protein sequences.

    Returns:
        A list of strings, one per qualifying HSP.
    """
    transcript = transcript.strip()
    blastDB = blastDB.replace('.phr', '')
    query_path = "temp" + transcript + ".fa"
    xml_path = "blast" + transcript + ".xml"
    E_VALUE_THRESH = 1e-10

    titles = []
    try:
        # Make a FASTA file of the individual protein.
        with open(seqfile) as fasta_handle:
            # NOTE(review): ``seq.id in transcript`` is a substring test, so
            # an id that is a fragment of the transcript name also matches;
            # confirm whether equality was intended.
            selected = (seq for seq in SeqIO.parse(fasta_handle, 'fasta')
                        if seq.id in transcript)
            SeqIO.write(selected, query_path, "fasta")

        NcbiblastpCommandline(query=query_path, db=blastDB, evalue=1e-10,
                              outfmt=5, out=xml_path)()

        with open(xml_path) as result_handle:
            blast_record = NCBIXML.read(result_handle)

        # Tabulate all alignments.  The original also opened an undefined
        # ``outfile`` here without ever writing to it; that open is dropped.
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < E_VALUE_THRESH:
                    titles.append(str(alignment.title))
    finally:
        # Remove the temporary files even when BLAST or parsing fails.
        call(["rm", query_path, xml_path])

    return [re.split('_', re.split(' ', title)[1])[0] for title in titles]
def blastinfo(filename):
    """Summarise a BLAST XML report and return the accessions of its hits.

    Prints a few search parameters from the report, then a dict with the
    longest alignment (``"Max length"``) and the best HSP score among HSPs
    with e-value below 0.05 (``"Max score"``), each paired with the
    accession it came from.

    Args:
        filename: Path of an XML file holding one BLAST record.

    Returns:
        A list with the fourth ``|``-separated title field of each alignment.
    """
    with open(filename) as handle:
        blast_record = NCBIXML.read(handle)

    print("####### BLAST Parameters #######")
    parameters = (
        ("Query ID:", "query_id"),
        ("Database:", "database"),
        ("E-value threshold:", "expect"),
        ("Match score:", "sc_match"),
        ("Mismatch score:", "sc_mismatch"),
    )
    for label, attribute in parameters:
        print(label, getattr(blast_record, attribute))

    best_score = -999
    best_length = -999
    result = {}
    accessions = []
    for align in blast_record.alignments:
        accession = align.title.split("|")[3]
        accessions.append(accession)
        if align.length > best_length:
            best_length = align.length
            result["Max length"] = (best_length, accession)
        for hsp in align.hsps:
            # Only statistically meaningful HSPs compete for the best score.
            if hsp.expect < 0.05 and hsp.score > best_score:
                best_score = hsp.score
                result["Max score"] = (best_score, accession)
    print(result)
    return accessions
def blastSearch(sequence_record, less_than_threshold):
    """Run an online blastn search and report how many HSPs pass a threshold.

    Args:
        sequence_record: Object whose ``.seq`` is submitted to ``nt``.
        less_than_threshold: An HSP is counted when its e-value is strictly
            below this value.

    Returns:
        The complete parsed BLAST record.
        NOTE(review): the thresholded selection is only counted and printed;
        the unfiltered record is what gets returned. Confirm that callers
        expect this.
    """
    # Import required packages
    from Bio.Seq import Seq
    from Bio.Blast import NCBIWWW, NCBIXML

    # Only the bare sequence is submitted, not the record's annotations.
    query_sequence = sequence_record.seq
    blast_records = NCBIXML.read(
        NCBIWWW.qblast("blastn", "nt", query_sequence))

    # One count per qualifying HSP, so an alignment with several good HSPs
    # contributes several times.
    qualifying = sum(
        1
        for alignment in blast_records.alignments
        for hsp in alignment.hsps
        if hsp.expect < less_than_threshold
    )
    print('Number of alignments with hsp.expect < ' + str(less_than_threshold)
          + ' = ' + str(qualifying))
    return blast_records
def _compare_by_blast(input_ref, xref_db, blast_out, subject_blast=False):
    """Run blastp for *input_ref* against *xref_db* and return the top hit.

    The XML report is written to *blast_out*.  If the BLAST command cannot be
    run or exits with an error, a one-line placeholder file is written
    instead so the caller can keep going.

    Returns:
        ``(normalized_id, bits)`` for the first description of the report,
        or ``("", 0)`` when the report is unreadable or has no descriptions.
    """
    cl = NcbiblastpCommandline(query=input_ref, db=xref_db, out=blast_out,
                               outfmt=5, num_descriptions=1, num_alignments=0)
    try:
        subprocess.check_call(str(cl).split())
    except (OSError, subprocess.CalledProcessError):
        # Handle BLAST errors cleanly: leave an empty report behind.
        with open(blast_out, "w") as out_handle:
            out_handle.write("\n")

    # Undecodable bytes become U+FFFD on read; swap them for spaces so the
    # XML parser does not trip over them.
    with codecs.open(blast_out, encoding="utf-8",
                     errors="replace") as blast_handle:
        result = blast_handle.read().replace(u"\ufffd", " ")

    no_hit = ("", 0)
    try:
        rec = NCBIXML.read(StringIO.StringIO(result))
    except (xml.parsers.expat.ExpatError, ValueError):
        return no_hit
    if not rec or len(rec.descriptions) == 0:
        return no_hit

    top = rec.descriptions[0]
    return _normalize_id(top.title.split()[1]), top.bits
def blast_score(query_cdrs, subject_cdrs):
    """Score three query/subject sequence pairs with pairwise ``blastp``.

    For each index 0..2 the two sequences are passed to ``blastp`` through
    bash process substitution and the XML output is parsed.  The feature
    vector for a pair is ``[score, expect, align_length, subject_length,
    bits]`` taken from the first HSP; when several alignments are reported
    the last one wins.  A pair without any HSP yields ``[0, .5, 0, 0, 0]``.

    Args:
        query_cdrs: Indexable with at least three query sequences.
        subject_cdrs: Indexable with at least three subject sequences.

    Returns:
        A 1-D NumPy array of 15 values (three 5-value vectors end to end).
    """
    blastOptions = "-evalue=200000 -word_size=2 -matrix='PAM30' -comp_based_stats='0' -outfmt=5"
    per_pair = []
    for i in range(3):
        # NOTE(review): the sequences are interpolated into a bash command
        # line unescaped; only pass trusted input.
        query = "-query <(echo -e '>Name\n" + query_cdrs[i] + "') "
        subject = "-subject <(echo -e '>Name\n" + subject_cdrs[i] + "') "
        blastString = "blastp " + query + subject + blastOptions

        # Run BLAST and parse the output as XML.
        process = subprocess.Popen(args=blastString, stdout=PIPE,
                                   stderr=subprocess.STDOUT, shell=True,
                                   executable='/bin/bash', close_fds=True)
        output = process.communicate()[0]
        blast_result_record = NCBIXML.read(StringIO(output))

        # Start from the "no hit" vector so a pair whose alignments carry no
        # HSPs still produces a row instead of a missing key.
        features = np.array([0, .5, 0, 0, 0])
        for alignment in blast_result_record.alignments:
            for hsp in alignment.hsps[0:1]:
                features = np.array([hsp.score, hsp.expect, hsp.align_length,
                                     alignment.length, hsp.bits])
        per_pair.append(features)

    # The vectors are 1-D, so they are joined along axis 0; ``axis=1`` is out
    # of bounds for 1-D input on current NumPy.
    return np.concatenate(per_pair)
def blast_gene(seq, database):
    """Run ``blastp`` for *seq* and list species, description and e-value per hit.

    The query is written to ``temp.fasta`` and the XML report to
    ``temp.xml`` in the working directory.

    Args:
        seq: Sequence record(s) accepted by ``SeqIO.write``.
        database: BLAST protein database to search.

    Returns:
        A flat list ``[species, description, e_value, ...]`` with three
        string entries per reported description.
    """
    with open('temp.fasta', 'w') as tempfasta:
        SeqIO.write(seq, tempfasta, 'fasta')

    blastp(query='temp.fasta', db=database, num_descriptions=5,
           num_threads=6, outfmt=5, out='temp.xml')()

    # Close the report once parsed (the handle used to be left open).
    with open('temp.xml') as result_handle:
        result = NCBIXML.read(result_handle)

    # (marker searched in the title, token the description starts at, species)
    known_sources = (
        ('Tfl|', 'Tfl', 'T. flavus'),
        ('Pfu|', 'Pfu', 'P. funiculosum'),
        ('PMAA_', 'PMAA', 'T. marneffei'),
    )

    rets = []
    for description in result.descriptions:
        ttl = description.title
        for marker, start_token, species in known_sources:
            if marker in ttl:
                d = ttl[ttl.find(start_token):]
                break
        else:
            # Generic title: species sits in [brackets], description between
            # the "| " separator and the opening bracket.
            species = ttl[ttl.find('[') + 1:ttl.find(']')]
            d = ttl[ttl.find('| ') + 1:ttl.find('[') - 1]
        rets.extend([species, d, str(description.e)])
    return rets
def parse(self):
    """Collect full-length, gap-free subject sequences from each BLAST report.

    For every sample whose ``blast_outputs`` file exists, the XML report is
    parsed and each HSP subject sequence is added to ``sample.alleleset``
    when it is as long as ``sample.records``, contains no ``-`` and has at
    least ``len(sample.records) * (self.cutoff / 100)`` identities.
    """
    logging.info('Parsing outputs')
    for sample in self.samples:
        if not os.path.isfile(sample.blast_outputs):
            continue
        try:
            with open(sample.blast_outputs, 'r') as result_handle:
                logging.info(
                    'Parsing {sn} nr report'.format(sn=sample.name))
                blast_record = NCBIXML.read(result_handle)
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        # Full query length, no gaps, and enough identical
                        # positions to satisfy the percentage cutoff.
                        if (len(hsp.sbjct) == len(sample.records)
                                and '-' not in hsp.sbjct
                                and hsp.identities >= len(sample.records)
                                * (self.cutoff / 100)):
                            sample.alleleset.add(hsp.sbjct)
        except FileNotFoundError:
            # The file vanished between the existence check and the open.
            pass
def __init__(self, ID, raw_blast_result, blast_object=None):
    """Build the hit table for one BLAST result.

    Args:
        ID: Stored as ``db_index`` and as the first field of every hit tuple.
        raw_blast_result: Path of a BLAST XML file; read only when
            *blast_object* is falsy.
        blast_object: Already parsed BLAST record, if available.

    Each hit tuple is ``(db_index, accession, hit_def, expect, query_start,
    query_end, sbjct_start, sbjct_end, identity)`` where ``identity`` is
    identities divided by alignment length.  ``pursue`` ends up as 1 only
    when no hit matches ``NON_COLONIZER``.
    """
    self.db_index = ID
    self.blast_result = blast_object
    self.pursue = 0
    self.hits = []

    if not self.blast_result:
        with open(raw_blast_result, 'r') as record:
            self.blast_result = NCBIXML.read(record)
    self.clone = self.blast_result.query

    for align in self.blast_result.alignments:
        for hit in align.hsps:
            fraction = Decimal(hit.identities) / Decimal(hit.align_length)
            self.hits.append((
                self.db_index,
                align.accession,
                align.hit_def,
                hit.expect,
                hit.query_start,
                hit.query_end,
                hit.sbjct_start,
                hit.sbjct_end,
                float(fraction),
            ))

    def _in_group(hit_row, group):
        # NOTE(review): the second test checks whether the *list* of words of
        # the organism name is an element of the group; confirm this is the
        # intended membership test.
        return hit_row[1] in group or hit_row[2].split(' ') in group

    self.colonizer_matches = [
        row for row in self.hits if _in_group(row, COLONIZER)]
    self.non_colonizer_matches = [
        row for row in self.hits if _in_group(row, NON_COLONIZER)]

    if len(self.non_colonizer_matches) == 0:
        self.pursue = 1
def searchBlast(fastafile, e):
    """Submit a FASTA DNA sequence file to the NCBI BLAST web service.

    Parameters:
        fastafile : Path of a DNA sequence file in FASTA format, a string.
        e : E-value threshold; only HSPs whose e-value is strictly below
            this value are reported, a float.

    Returns:
        A list with one item per qualifying HSP, each item being the list
        ``[title, length, expect, query, match, sbjct]``.
    """
    # Read the query through a context manager so the file is closed even if
    # the web request fails afterwards.
    with open(fastafile) as fasta_handle:
        fasta_string = fasta_handle.read()

    result_handle = NCBIWWW.qblast("blastn", "nt", fasta_string)
    blast_record = NCBIXML.read(result_handle)

    results = []
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < e:
                results.append([
                    alignment.title,
                    alignment.length,
                    hsp.expect,
                    hsp.query,
                    hsp.match,
                    hsp.sbjct,
                ])
    return results
def downloadUniprotSequences(uniprotID, blastFile, sequencesFile, cutoff, verbose):
    """Download the sequences of sufficiently identical BLAST hits.

    The BLAST XML report *blastFile* is parsed and, for every HSP whose
    percent identity (identities / alignment length) reaches *cutoff*, the
    FASTA entry of the hit is fetched from UniProt and appended to
    *sequencesFile*.  When the UniProt download fails, the entry is fetched
    from the NCBI protein database instead.

    Args:
        uniprotID: Accession written first to the output; ``None`` to skip.
        blastFile: Path of the BLAST XML report.
        sequencesFile: Path of the FASTA file to (over)write.
        cutoff: Minimum percent identity; converted with ``float``.
        verbose: When true, progress messages are printed.
    """
    base_url = 'http://www.uniprot.org/uniprot/'

    def fetch_fasta(accession):
        # Download one UniProt FASTA entry, always releasing the connection.
        response = urllib2.urlopen(base_url + accession + '.fasta')
        try:
            return response.read()
        finally:
            response.close()

    print('Obtaining sequences from UniProt...')
    with open(blastFile, 'r') as f:
        records = NCBIXML.read(f)
    if verbose:
        print('Found ' + str(len(records.alignments)) + ' matches')

    with open(sequencesFile, 'w') as f:
        if uniprotID is not None:
            f.write(fetch_fasta(uniprotID) + "\n")

        for alignment in records.alignments:
            # The accession depends only on the title, so it is parsed once
            # per alignment rather than once per HSP.
            words = alignment.title.split('|')
            seqID = ''
            if words[0] == 'gi':
                seqID = words[3]
            elif words[0] == 'sp' or words[0] == 'ref':
                seqID = words[1]
            if seqID == '':
                continue

            for hsp in alignment.hsps:
                identityPercent = 100.0 * float(hsp.identities) / float(
                    hsp.align_length)
                if not (identityPercent >= float(cutoff)):
                    if verbose:
                        print(seqID + " (identity " + str(identityPercent) +
                              "% < cutoff " + str(cutoff) + "%) - skipping")
                    continue
                try:
                    f.write(fetch_fasta(seqID) + "\n")
                    if verbose:
                        print(seqID + " (identity " + str(identityPercent) +
                              "% >= cutoff " + str(cutoff) + "%) - adding")
                except Exception as e:
                    if verbose:
                        print("WARNING: unable to download entry " + seqID +
                              " from Uniprot: " + str(e))
                        print("Trying NCBI protein...")
                    handle = Entrez.efetch(db="protein", id=seqID,
                                           rettype="fasta", retmode="xml")
                    try:
                        for erecord in Entrez.parse(handle):
                            # NOTE(review): a DNA alphabet is attached to a
                            # record fetched from the protein database;
                            # confirm this is intended.
                            r = SeqRecord(
                                Seq.Seq(erecord['TSeq_sequence'],
                                        IUPAC.unambiguous_dna),
                                id=erecord['TSeq_accver'],
                                description=erecord['TSeq_defline'])
                            SeqIO.write(r, f, "fasta")
                            f.write("\n")
                        if verbose:
                            print("OK")
                    finally:
                        handle.close()
def blast_seq(seq):
    """BLAST *seq* through the batch runner and return the top hit's name.

    The sequence is written to ``my_fas.fas``, submitted with
    ``blast_runner`` (XML output ``my_blast.xml``, hit list size 1) and the
    job is awaited with ``check_pbs``.

    Returns:
        The fifth ``|``-separated field of the title of the first alignment
        that has an HSP, or ``None`` when the job did not report ``"Done!"``,
        when there is no hit, or when one of the caught errors occurs.
    """
    print("Blasting...")
    with open("my_fas.fas", "w") as my_fasta:
        my_fasta.write(">new seq\n" + seq)

    job_id = blast_runner("my_fas.fas", outfile="my_blast.xml",
                          hitlist_size=1)  # pbs job id
    status = check_pbs(job_id)
    print(status)
    if status != "Done!":
        return None

    with open("my_blast.xml", "r") as xml_file:
        blast_record = NCBIXML.read(xml_file)

    try:
        for alignment in blast_record.alignments:
            for _hsp in alignment.hsps:
                # The first HSP encountered decides the answer.
                return str(alignment.title).split("|")[4]
    except (RuntimeError, TypeError, NameError, ValueError):
        return None
    return None
def compareSequences(seq1, seq2):
    """
    compareSequences(seq1, seq2) -> Alignment

    Create an alignment object comparing two sequences stored in .fasta
    files.

    ``blastp`` is run with *seq1* as query and *seq2* as subject.  If the
    BLAST command raises an application error, the error is printed and the
    interpreter exits.  ``None`` is returned when BLAST produced no output;
    otherwise the first HSP of the first alignment is wrapped in an
    ``Alignment``.
    """
    try:
        blast_stdout = NcbiblastpCommandline(
            query=seq1, subject=seq2, outfmt=5, use_sw_tback=True)()[0]
    except Bio.Application.ApplicationError as err:
        print('Brak programu Blast w ścieżce systemowej')
        print(err)
        # A fallback to the online service was sketched here at some point
        # but is not active.
        exit()

    if not blast_stdout:
        return

    record = NCBIXML.read(StringIO(blast_stdout))
    top_hsp = record.alignments[0].hsps[0]
    return Alignment(query=top_hsp.query, match=top_hsp.match,
                     subject=top_hsp.sbjct)
def write_flanks(rbase,flanksfile):
    '''
    Parse the results from BLASTing the F-plasmid against the de novo
    assemblies and write the flanking regions to file.

    Every sub-directory of rbase whose name starts with 'REL' or 'RM' is
    expected to hold a results.xml BLAST report and a scaffolds.fasta
    assembly.  For each HSP with an e-value of at most 1e-10 that ends
    exactly at the 3'-end of the query, the flank returned by get_flank is
    recorded under the id '<directory>_flank'.  All flanks are written to
    flanksfile in FASTA format.
    '''
    flank_record_list = []
    ## iterate over BLASTs against de novo assemblies.
    denovo_dirs = [x for x in listdir(rbase)
                   if x.startswith(('REL', 'RM'))]
    for mygenome in denovo_dirs:
        myfulldir = join(rbase, mygenome)
        ## close each report as soon as it is parsed (it used to stay open).
        with open(join(myfulldir, "results.xml")) as result_h:
            blast_record = NCBIXML.read(result_h)
        query_length = int(blast_record.query_letters)
        subject_seq = join(myfulldir, "scaffolds.fasta")
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect > 0.0000000001:
                    ## skip bad hits.
                    continue
                if hsp.query_end != query_length:
                    ## skip hits that don't match 3' end of F-plasmid query.
                    continue
                my_flank_seq = get_flank(alignment, hsp, subject_seq)
                flank_record_list.append(
                    SeqRecord(seq=my_flank_seq, id=mygenome + '_flank'))
    with open(flanksfile, 'w') as flanks_outhandle:
        SeqIO.write(flank_record_list, flanks_outhandle, format="fasta")
def parse_blast(seq, output):
    """Parse BLAST XML text and return the mutations of the top hit.

    Args:
        seq: Not used by this function.
        output: BLAST XML report as a string.

    Returns:
        ``(mutations, 1)`` where ``mutations`` is ``get_muts`` applied to the
        query and subject strings of the first HSP of the first alignment,
        or ``(output, -1)`` when the report contains no alignment.

    Raises:
        ValueError: Re-raised from the XML parser after the raw output has
            been echoed to stderr.
    """
    blast_output = StringIO(output)
    try:
        blast_records = NCBIXML.read(blast_output)
    except ValueError:
        sys.stderr.write("-----Blast output------")
        sys.stderr.write(blast_output.getvalue())
        if blast_output.getvalue(
        ) == "BLAST engine error: XML formatting is only supported for a database search":
            sys.stderr.write(
                "Please ensure that you are using the latest blastx version of blastn"
            )
            sys.stderr.write(
                "You may need to update your environment's PATH variable")
        # Bare ``raise`` keeps the original traceback intact.
        raise

    try:
        alignment = blast_records.alignments[0]
    except IndexError:
        # No hit at all: hand the raw report back with a failure flag.
        return (output, -1)

    hsp = alignment.hsps[0]
    mutations = get_muts(hsp.query, hsp.sbjct)
    return (mutations, 1)
def get_sequences():
    """Export stored BLAST results for every GenBank file under ``NRPSRoot``.

    A file is processed only when its name minus the last four characters is
    not already listed in ``Analysis/BLAST/BLASTStandard``.  Its BLAST report
    is read from ``Analysis/BLAST/BLASTXML/BLAST-<record name>.xml``; for
    each alignment the output directories are created and the full alignment
    is written, and each HSP with an e-value below 1e-8 is written in both
    the standard and the FASTA layout through ``BLASTWriter``.
    """
    ana_dir = "Analysis"
    bla_dir = "BLAST"
    root_dir = "NRPSRoot"
    xml_dir = "BLASTXML"
    stan_dir = "BLASTStandard"
    fas_dir = "BLASTFASTA"
    e_threshold = .00000001

    BLASTWriter.simple_dir(bla_dir)
    BLASTWriter.create_dir(xml_dir, bla_dir)
    BLASTWriter.create_dir(stan_dir, bla_dir)

    # NOTE(review): file names from nested directories are collected but
    # later joined to root_dir only; confirm the tree is flat.
    main_dir = []
    for _dirpath, _dirnames, filenames in os.walk(root_dir):
        main_dir.extend(filenames)

    standard_root = os.path.join(ana_dir, bla_dir, stan_dir)
    for filename in main_dir:
        # Listed afresh each time because the loop itself creates directories.
        if filename[0:len(filename) - 4] in os.listdir(standard_root):
            continue

        record = SeqIO.read(os.path.join(root_dir, filename), format="gb")
        xml_path = os.path.join(ana_dir, bla_dir, xml_dir,
                                "BLAST-" + record.name + ".xml")
        # Close the report once parsed (the handle used to be left open).
        with open(xml_path) as result_handle:
            blast_record = NCBIXML.read(result_handle)

        rec_dir = record.name
        for k, alignment in enumerate(blast_record.alignments, 1):
            al_dir = alignment.title[:alignment.title.index(" ")]
            BLASTWriter.create_dir(
                os.path.join(stan_dir, rec_dir, al_dir), bla_dir)
            BLASTWriter.create_dir(
                os.path.join(fas_dir, rec_dir, al_dir), bla_dir)
            BLASTWriter.write_full_standard(k, alignment, record)

            i = 0  # running number of qualifying HSPs in this alignment
            for hsp in alignment.hsps:
                if hsp.expect < e_threshold:
                    i += 1
                    BLASTWriter.write_blast_standard(i, alignment, hsp,
                                                     rec_dir)
                    BLASTWriter.write_blast_fasta(i, alignment, hsp, rec_dir)
def get_seq_pos(fasta_list):
    """Locate two-segment matches of each sequence within ``parent.fasta``.

    Each sequence is written to ``seq1.fasta`` and aligned against
    ``parent.fasta`` with ``blastn``.  When the first alignment consists of
    exactly two HSPs, their subject start and end coordinates are collected.

    Args:
        fasta_list: Iterable of sequence strings.

    Returns:
        A list of sorted 4-element coordinate lists, one per sequence that
        produced exactly two HSPs.
    """
    seq_pos_list = []
    for i, _seq in enumerate(fasta_list):
        if i % 500 == 0:
            print('{} has been processed!'.format(i))
        SeqIO.write(SeqRecord(Seq(_seq)), "seq1.fasta", "fasta")
        # Run BLAST and parse the output as XML.
        try:
            output = NcbiblastnCommandline(query="seq1.fasta",
                                           subject="parent.fasta",
                                           outfmt=5)()[0]
            blast_result_record = NCBIXML.read(StringIO(output))
            if blast_result_record.alignments:
                hsps = blast_result_record.alignments[0].hsps
                if len(hsps) == 2:
                    results = []
                    for hsp in hsps:
                        results.append(hsp.sbjct_start)
                        results.append(hsp.sbjct_end)
                    seq_pos_list.append(sorted(results))
        except Exception:
            # Best effort per sequence.  ``Exception`` rather than a bare
            # ``except`` so Ctrl-C and interpreter exit still get through.
            print('failed to blast!')
    return seq_pos_list
def globalRun(d_dataset, p_dir_blast, debug=1):
    """BLAST the best FASTA of every conserved entry against ``pdb``.

    For each key of *d_dataset* whose ``"conserve"`` value is 1, ``blastp``
    is run on ``entry["best"]["fasta"]`` unless the XML output already
    exists.  The entry is then updated in place: ``"xml"`` holds the report
    path and ``"align"`` maps the identifier taken from each alignment title
    to the HSP e-value (later HSPs overwrite earlier ones).

    Args:
        d_dataset: Dict of per-PDB dicts, modified in place.
        p_dir_blast: Prefix prepended to ``<PDB_ID>.xml``.
        debug: When truthy, the BLAST command line is printed.
    """
    for PDB_ID in d_dataset.keys():
        entry = d_dataset[PDB_ID]
        if not entry["conserve"] == 1:
            continue

        p_out_blast = p_dir_blast + PDB_ID + ".xml"
        blastp_cline = NcbiblastpCommandline(query=entry["best"]["fasta"],
                                             db="pdb", outfmt=5,
                                             out=p_out_blast)
        if debug:
            print(blastp_cline)
        if not path.exists(p_out_blast):
            blastp_cline()

        entry["xml"] = p_out_blast
        entry["align"] = {}

        # parse blast out
        with open(p_out_blast) as result_handle:
            blast_records = NCBIXML.read(result_handle)
            for alignment in blast_records.alignments:
                for hsp in alignment.hsps:
                    PDB_find = alignment.title.split("|")[4].split(" ")[0]
                    entry["align"][PDB_find] = hsp.expect
def blastpSp(sp, db, evalue=0.0001):
    """Run ``blastp`` for one entry and return the identifiers of its hits.

    The sequence returned by ``seq(sp)`` is written as a FASTA query into a
    private temporary directory, searched against *db*, and the XML report is
    parsed.  The temporary directory is removed afterwards, also on failure.

    Args:
        sp: Value handed to ``seq`` to obtain the query sequence.
        db: BLAST protein database to search.
        evalue: E-value cutoff passed to ``blastp``.

    Returns:
        A list with the second ``|``-separated field of each hit title.
    """
    # TemporaryDirectory cleans up only its own directory; the previous
    # os.removedirs call also tried to prune empty parent directories.
    with tempfile.TemporaryDirectory() as directory:
        fastaFile = '%s/seq.fasta' % directory
        resultFile = '%s/result.xml' % directory

        with open(fastaFile, 'w') as wf:
            wf.write('>query\n%s' % seq(sp))

        blastp = NcbiblastpCommandline(query=fastaFile, db=db, evalue=evalue,
                                       outfmt=5, out=resultFile)
        stdout, stderr = blastp()
        print(stdout, end='', sep='')
        print(stderr, end='', sep='')

        with open(resultFile) as result_handle:
            blast_record = NCBIXML.read(result_handle)

    return [align.title.split('|')[1] for align in blast_record.alignments]
def write_blast(str1, str2, name1, name2):
    '''
    Align two protein sequences with blastp and print every HSP.

    The sequences are written to seq1.fasta and seq2.fasta in the working
    directory; the first is used as the query and the second as the subject.

    Input:
        str1: the first sequence string
        str2: the second sequence string
        name1: the first sequence name
        name2: the second sequence name
    Return:
        None
    '''
    # Write both inputs in order: query file first, subject file second.
    for text, label, fasta_path in ((str1, name1, "seq1.fasta"),
                                    (str2, name2, "seq2.fasta")):
        SeqIO.write(SeqRecord(Seq(text), id=label), fasta_path, "fasta")

    run_blastp = NcbiblastpCommandline(query="seq1.fasta",
                                       subject="seq2.fasta", outfmt=5)
    xml_text = run_blastp()[0]
    record = NCBIXML.read(StringIO(xml_text))

    for alignment in record.alignments:
        for hsp in alignment.hsps:
            print('****Alignment****')
            print('sequence:', alignment.title)
            print('length:', alignment.length)
            print('e value:', hsp.expect)
            for row in (hsp.query, hsp.match, hsp.sbjct):
                print(row)
def run(self, input_seq):
    """Run the configured BLAST program on *input_seq* and return its top hits.

    The sequence is fed to ``<blast_path> -db <blastdb> -outfmt 5`` on
    standard input and the XML written to standard output is parsed.

    Args:
        input_seq: Query; converted with ``str`` before being sent.

    Returns:
        A list of ``(hit_id, alignment)`` tuples for at most
        ``self.top_results`` alignments; empty when BLAST printed only a
        newline.
    """
    output = []
    command = '%s -db %s -outfmt 5' % (self.blast_path, self.blastdb)

    # Windows has problems with Popen and PIPE, so a real file is used there.
    if sys.platform == 'win32':
        # The context manager closes (and thereby deletes) the temporary
        # file once BLAST has finished; it used to be left open.
        with tempfile.NamedTemporaryFile() as tmp:
            logger.debug("Running Blast with sequence: {}".format(input_seq))
            tmp.write(bytes(str(input_seq) + '\n', 'latin1'))
            tmp.seek(0)
            blast = Popen(command, universal_newlines=True, stdin=tmp,
                          stdout=PIPE, stderr=PIPE)
            (blast_out, blast_err) = blast.communicate()
    else:
        # Rest of the world:
        blast = Popen(command, universal_newlines=True, shell=True,
                      stdin=PIPE, stdout=PIPE, stderr=PIPE)
        (blast_out, blast_err) = blast.communicate(input=str(input_seq))

    if len(blast_err) != 0:
        logger.debug(blast_err)

    if blast_out != '\n':
        result = NCBIXML.read(StringIO(blast_out))
        for aln in result.alignments[:self.top_results]:
            logger.debug(
                "Looping over alignments, current hit: {}".format(aln.hit_id))
            output.append((aln.hit_id, aln))
    return output
def blast_pdb(target_sequence, num_hits=1000):
    """
    Query the PDB using NCBI blast and return MSMSeeds built from the hits.

    Parameters
    ----------
    target_sequence : String
        The sequence of the target to use to query blast
    num_hits : int, optional
        The maximum number of hits returned by BLAST. Default: 1000

    Returns
    -------
    msmseeds : list of MSMSeed objects
        One MSMSeed per alignment, initialized with the target sequence, the
        template sequence and structure retrieved for the alignment's
        accession, and the e-value of the alignment's first HSP.
    """
    from Bio.Blast import NCBIWWW, NCBIXML

    def seed_from(alignment):
        # E-value first, then the template lookup, as one seed per alignment.
        e_val = alignment.hsps[0].expect
        template_fasta, template_structure = _retrieve_chain(
            alignment.accession)
        return MSMSeed(target_sequence, template_fasta, template_structure,
                       e_val)

    blast_record = NCBIXML.read(
        NCBIWWW.qblast("blastp", "pdb", target_sequence,
                       hitlist_size=num_hits))
    return [seed_from(alignment) for alignment in blast_record.alignments]
def blast(dbname, blast_program, query, evalue_threshold=0.001):
    """Run a local BLAST search and return the hits as ``Blast_Result`` objects.

    Args:
        dbname: BLAST database to search.
        blast_program: ``'tblastn'`` selects tblastn; anything else blastn.
        query: Raw query sequence (a FASTA header is added here).
        evalue_threshold: E-value cutoff passed to BLAST.

    Returns:
        A list of ``Blast_Result``; empty when the BLAST command exits with a
        non-zero status.
    """
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as query_handle:
        infile = query_handle.name
        query_handle.write(">Query\n%s\n" % query)
    outfile = "%s.out.xml" % infile

    if blast_program == 'tblastn':
        commandline_cls = NcbitblastnCommandline
    else:
        commandline_cls = NcbiblastnCommandline
    blast_cl = commandline_cls(query=infile, db=dbname,
                               evalue=evalue_threshold, word_size=6,
                               outfmt=5, out=outfile)

    cl = "%s/%s" % (settings.NCBI_BIN_DIR, str(blast_cl))
    status = subprocess.call(cl.split(" "))
    os.unlink(infile)
    if status != 0:
        print("Blast failed: %s" % cl)
        return []

    with open(outfile, "r") as xml_handle:
        blast_record = NCBIXML.read(xml_handle)

    results = []
    for alignment in blast_record.alignments:
        accession = Blast_Accession(alignment.accession)
        fragment_length = accession.fragment_length
        for hsp in alignment.hsps:
            # Skip hits that lie entirely beyond the end of the fragment.
            if (fragment_length is not None
                    and hsp.sbjct_start > fragment_length
                    and hsp.sbjct_end > fragment_length):
                continue

            # Subject coordinates are deliberately not reduced modulo the
            # fragment length: Blast_Result#strand compares sbjct_start and
            # sbjct_end to decide the strand, so callers handle coordinates
            # beyond the fragment length themselves.
            results.append(Blast_Result(
                fragment_id=accession.fragment_id,
                fragment_length=accession.fragment_length,
                hit_def=alignment.hit_def,
                query_start=hsp.query_start,
                query_end=hsp.query_end,
                subject_start=hsp.sbjct_start,
                subject_end=hsp.sbjct_end,
                evalue=hsp.expect,
                alignment=dict(query=hsp.query,
                               match=hsp.match,
                               matchi=inverse_match(hsp.match),
                               subject=hsp.sbjct)))

    os.unlink(outfile)
    return results
def blast_sequences(comp_seq, ref_seq):
    '''
    BLAST two protein sequences against each other and map their numbering.

    The comparison sequence is the query and the reference sequence the
    subject of a blastp run (e-value 0.001).  Positions are mapped along the
    highest scoring HSP with a positive score.

    Notes:
        User must have NCBI BLAST+ package installed in user's PATH.

    Args:
        comp_seq (str): A comparison protein sequence.
        ref_seq (str): A reference protein sequence.

    Returns:
        dict: comparison sequence numbering (key) mapped to reference
            sequence numbering (value)
        dict: reference sequence numbering (key) mapped to comparison
            sequence numbering (value)
        Both dicts are empty when no HSP was found.
    '''
    with tempfile.NamedTemporaryFile(mode='w') as comp_seq_file, \
            tempfile.NamedTemporaryFile(mode='w') as ref_seq_file:
        comp_seq_file.write(">\n" + str(comp_seq) + "\n")
        ref_seq_file.write(">\n" + str(ref_seq) + "\n")
        # Flush so blastp, which opens the files by name, sees the content.
        ref_seq_file.flush()
        comp_seq_file.flush()
        run_blastp = NcbiblastpCommandline(query=comp_seq_file.name,
                                           subject=ref_seq_file.name,
                                           evalue=0.001, outfmt=5)
        xml_text, _stderror = run_blastp()

    blast_record = NCBIXML.read(StringIO(xml_text))

    # Keep the first HSP that reaches the highest positive score.
    best_score = 0
    best_hsp = None
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.score > best_score:
                best_score, best_hsp = hsp.score, hsp

    pdb_to_ref = {}
    ref_to_pdb = {}
    if best_hsp is None:
        return pdb_to_ref, ref_to_pdb

    aligned_query = best_hsp.query
    aligned_sbjct = best_hsp.sbjct
    query_pos = best_hsp.query_start
    sbjct_pos = best_hsp.sbjct_start
    for column, query_char in enumerate(aligned_query):
        query_has_residue = query_char.isalpha()
        sbjct_has_residue = aligned_sbjct[column].isalpha()
        if query_has_residue and sbjct_has_residue:
            pdb_to_ref[query_pos] = sbjct_pos
            ref_to_pdb[sbjct_pos] = query_pos
        # A gap column advances only the sequence that has a residue there.
        if query_has_residue:
            query_pos += 1
        if sbjct_has_residue:
            sbjct_pos += 1
    return pdb_to_ref, ref_to_pdb
def search(self, blast_program, query, e_val):
    """Runs BLAST to search query sequence in the target database.

    The query is written to a temporary file and the XML report to another;
    both names come from ``temp_file_name`` and neither file is removed here.

    Args:
        blast_program (string): BLAST flavor to run: 'tblastn', 'tblastx'
            or 'blastx'
        query (string): query sequence in FASTA format
        e_val (float): E-value threshold

    Returns:
        Bio.Blast.Record.Blast object

    Raises:
        AssertionError: if ``blast_program`` is not one of the accepted
            flavors.
    """
    import subprocess

    assert blast_program in ['tblastn', 'tblastx', 'blastx']

    # temporary file in temp directory, automatically named
    output_file = temp_file_name()
    # temporary file in temp directory, automatically named
    query_file = temp_file_name()
    with open(query_file, 'w') as f:
        f.write(query)

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    argv = [
        '{}'.format(blast_program),
        '-query', '{}'.format(query_file),
        '-db', '{}'.format(self._db_file),
        '-evalue', '{}'.format(e_val),
        '-out', '{}'.format(output_file),
        '-outfmt', '5',
    ]
    logging.debug(' '.join(argv))
    subprocess.call(argv)

    # Parse results
    with open(output_file) as results_handle:
        blast_record = NCBIXML.read(results_handle)
    return blast_record
# parse_output(query, output): report BLAST hits with an e-value below 0.01.
#
# Reads the single FASTA record at `query` to obtain its length, parses the
# BLAST XML file at `output` with NCBIXML.read and walks every HSP of every
# alignment.  For an HSP whose e-value is below 0.01 it computes
# identities / query length * 100, takes the last space-separated token of
# the alignment title as the variant, obtains two organism names from
# organism_finder(query) and passes all of these to write_blast_results.
# When the percentage is not exactly 100.0 it calls add_to_file(query) and
# make_new_blast_db().
#
# NOTE(review): this definition is stored on one physical line, so the
# nesting level of the trailing `break` cannot be recovered from here; it may
# end the HSP loop after the first qualifying HSP or only after the database
# was extended.  Confirm against the original formatting before reflowing.
# NOTE(review): result_handle is opened but never closed in this function.
# NOTE(review): if this runs under Python 2 with integer operands, `/`
# truncates and the percentage is 0 or 100 only - confirm the interpreter.
def parse_output(query, output): length_of_record = len(SeqIO.read(query, 'fasta')) result_handle = open(output) blast_record = NCBIXML.read(result_handle) for alignment in blast_record.alignments: for hsp in alignment.hsps: if hsp.expect < 0.01: percentage = (hsp.identities / length_of_record) * 100 percentage2 = str(percentage) variant = alignment.title.split(' ')[-1] e_value = str(hsp.expect) organism_one, organism_two = organism_finder(query) write_blast_results(organism_one, organism_two, query, percentage2, variant, e_value) if percentage != 100.0: add_to_file(query) make_new_blast_db() break
def parseRecord(xmlfile,genomePath,debug):
    """Extract position and orientation of the top BLAST hit.

    The hit definition of the first alignment is expected to contain a token
    of the form ``<start>..<end>(<strand>)``; the start coordinate is
    returned together with the direction and relative position computed by
    ``getDirection``, ``getRelativeLocation`` and ``getLocation``.

    Args:
        xmlfile: Currently unused.
            NOTE(review): the report is always read from
            ``Files/extras/temp_blast.xml``; confirm whether this argument
            was meant to be opened instead.
        genomePath: Passed to ``getRelativeLocation``.
        debug: When truthy, progress information is printed.

    Returns:
        ``(start, direction, position, hit_def, e_value, '')``.
    """
    if debug:
        print("In BLASTing.parseRecord")

    # Close the report once parsed (the handle used to be left open).
    with open('Files/extras/temp_blast.xml') as handle:
        result = nxml.read(handle)
    hit = result.alignments[0].hit_def
    e = result.descriptions[0].e
    if debug:
        # Formatted so the output is the same under Python 2 and Python 3.
        print("Blast match:  %s" % (hit,))
        print("E-value:  %s" % (e,))

    # First whitespace-separated token holding a "start..end" range.
    hitInfo = [n for n in hit.split() if '..' in n][0]
    num1, num2 = hitInfo.split('..')
    num2 = num2[:num2.find('(')]
    # The end coordinate is not returned but is still validated as an int.
    num1, num2 = int(num1), int(num2)

    # Determine the direction, relative location, and position of the gene
    direction = getDirection(hitInfo)
    termUpper, termLower = getRelativeLocation(genomePath)
    pos = getLocation(num1, termUpper, termLower)

    # TODO
    # Integrate warning for multiple hits
    return num1, direction, pos, hit, e, ''
def _test_describe():
    """Print the alignments of a stored BLAST report, then the first title.

    The report is read from ``blastpdb/S438966_blast.xml`` below
    ``global_settings.temp_folder``.
    """
    report_path = os.path.join(global_settings.temp_folder, 'blastpdb',
                               'S438966_blast.xml')
    # Close the report once parsed (the handle used to be left open).
    with open(report_path) as handle:
        blast_record = NCBIXML.read(handle)
    print(blast_record.alignments)
    print(blast_record.alignments[0].title)
def get_evalues():
    """Tabulate the e-values of every raw BLAST report in ``file_path``.

    A first pass writes one comma-terminated row of HSP e-values per report
    to ``blast_eval.txt``; a second pass copies the rows to
    ``FINAL_blast_eval.txt`` with trailing commas and newlines stripped.
    """
    raw_table = 'C:/Users/Alyssa/Desktop/CSE182/project/blast_eval.txt'
    final_table = 'C:/Users/Alyssa/Desktop/CSE182/project/FINAL_blast_eval.txt'

    with open(raw_table, 'w') as outfile:
        # Parse all raw BLAST output files in directory
        for rawblast_file in os.listdir(file_path):
            # Close each report once parsed; the handles used to pile up.
            with open(file_path + rawblast_file) as result_handle:
                blast_record = NCBIXML.read(result_handle)
            for alignment in blast_record.alignments:
                # Can have more than one result due to protein redundancy
                for hsp in alignment.hsps:
                    outfile.write(str(hsp.expect) + ',')
            outfile.write('\n')

    # Strip the trailing comma in each row
    with open(raw_table, 'r') as readin, open(final_table, 'w') as eval_out:
        for line in readin:
            eval_out.write(str(line.rstrip(',\n')))
            eval_out.write('\n')
def onlineParsing(self):
    '''
    Summarise the online BLAST XML output of every FASTA in self.list_fasta.

    For each entry the report Data/output_blast/Online_output_<fasta> is
    parsed and the title and identities/alignment-length ratio of the first
    HSP of up to ten alignments are written to Online_BLAST_results,
    followed by a blank line.
    '''
    print("Online parsing ...")
    with open("Online_BLAST_results", "w") as results:
        for fasta in self.list_fasta:
            with open("Data/output_blast/Online_output_{}".format(
                    fasta)) as results_handle:
                blast_record = NCBIXML.read(results_handle)

            results.write("{}\n".format(fasta))
            # Only the 10 best alignments are saved
            for rank, alignment in enumerate(blast_record.alignments):
                hsp = alignment.hsps[0]
                if rank >= 10:
                    break
                identity = str(hsp.identities) + "/" + str(hsp.align_length)
                results.write("{}\t{}\n".format(alignment.title, identity))
            results.write("\n")
    print("Online parsing done")
def init(blast_output_path):
    """Insert stored BLAST results for every target sequence into the graph.

    Connects to the ``proteins`` keyspace on ``localhost:48555``, fetches the
    target sequences with ``query_target_sequences`` and, for each of them,
    parses the BLAST XML report at *blast_output_path* and hands it to
    ``insert_new_proteins_n_alignments``.

    The live ``NCBIWWW.qblast`` request is currently disabled, so the same
    stored report is read for every sequence.

    Args:
        blast_output_path: Path of the BLAST XML report to load.
    """
    separator = "- - - - - - - - - - - - - - - - - - - - -"
    with GraknClient(uri="localhost:48555") as client:
        with client.session(keyspace="proteins") as session:
            print("Connected to the proteins knowledge graph.")
            print(separator)
            target_sequences = query_target_sequences(session)
            for sequence in target_sequences:
                print("BLASTing for: ", sequence)
                print(separator)
                print(
                    "Waiting for BLAST search to complete. This can take a few minutes."
                )
                # Close the report once parsed; one handle per sequence used
                # to be left open.
                with open(blast_output_path) as blast_output:
                    blast_record = NCBIXML.read(blast_output)
                print(
                    "Inserting BLAST results into the proteins knowledge graph."
                )
                print(separator)
                insert_new_proteins_n_alignments(session, sequence,
                                                 blast_record)
def searchBlast(fastafile, e):
    """Submit a FASTA DNA sequence file to the NCBI BLAST web service.

    Parameters:
        fastafile : Path of a DNA sequence file in FASTA format, a string.
        e : E-value threshold; only HSPs whose e-value is strictly below
            this value are reported, a float.

    Returns:
        A list with one item per qualifying HSP, each item being the list
        ``[title, length, expect, query, match, sbjct]``.
    """
    # Read the query through a context manager so the file is closed even if
    # the web request fails afterwards.
    with open(fastafile) as fasta_handle:
        fasta_string = fasta_handle.read()

    result_handle = NCBIWWW.qblast("blastn", "nt", fasta_string)
    blast_record = NCBIXML.read(result_handle)

    results = []
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < e:
                results.append([
                    alignment.title,
                    alignment.length,
                    hsp.expect,
                    hsp.query,
                    hsp.match,
                    hsp.sbjct,
                ])
    return results
def run(self, input_seq):
    """Run the configured BLAST executable on ``input_seq``.

    The command ``<self.blast_path> -db <self.blastdb> -outfmt 5`` is
    started and the sequence is supplied on its standard input.  Anything
    written to stderr is logged at debug level.

    Args:
        input_seq: The query; it is converted with ``str()`` before being
            sent to the process.

    Returns:
        A list of ``(hit_id, alignment)`` tuples for at most
        ``self.top_results`` alignments; empty when the process printed
        only a newline.
    """
    command = '%s -db %s -outfmt 5' % (self.blast_path, self.blastdb)
    # Windows has problems with Popen and PIPE, so stdin is a real file.
    if sys.platform == 'win32':
        # The context manager closes (and thereby deletes) the temporary
        # file once the process has finished reading it.
        with tempfile.NamedTemporaryFile() as tmp:
            logger.debug("Running Blast with sequence: {}".format(input_seq))
            tmp.write(bytes(str(input_seq) + '\n', 'latin1'))
            tmp.seek(0)
            blast = Popen(command,
                          universal_newlines=True,
                          stdin=tmp,
                          stdout=PIPE,
                          stderr=PIPE)
            (blast_out, blast_err) = blast.communicate()
    else:
        # Rest of the world:
        blast = Popen(command,
                      universal_newlines=True,
                      shell=True,
                      stdin=PIPE,
                      stdout=PIPE,
                      stderr=PIPE)
        (blast_out, blast_err) = blast.communicate(input=str(input_seq))
    if blast_err:
        logger.debug(blast_err)
    output = []
    if blast_out != '\n':
        result = NCBIXML.read(StringIO(blast_out))
        for aln in result.alignments[:self.top_results]:
            logger.debug("Looping over alignments, current hit: {}".format(
                aln.hit_id))
            output.append((aln.hit_id, aln))
    return output
def blast_gene(seq, database):
    """BLAST ``seq`` against a local nucleotide database.

    The sequence is written to ``temp.fasta``, ``blastn`` is run with XML
    output into ``temp.xml``, and the parsed result is flattened.

    Args:
        seq: Record(s) accepted by ``SeqIO.write``.
        database: Value passed as the ``db`` option of the blastn command.

    Returns:
        A flat list of strings: for every description the first
        space-separated word of its title followed by its e value, then for
        every HSP ``frame[1]``, the query string, the match string and
        ``sbjct_start``.
    """
    with open('temp.fasta', 'w') as tempfasta:
        SeqIO.write(seq, tempfasta, 'fasta')
    run = blastn(query='temp.fasta',
                 db=database,
                 num_descriptions=1,
                 num_threads=6,
                 outfmt=5,
                 word_size=4,
                 evalue=0.01,
                 task="megablast",
                 out='temp.xml')
    run()
    # Close the XML handle as soon as the record is parsed.
    with open('temp.xml') as result_handle:
        result = NCBIXML.read(result_handle)
    rets = []
    for description in result.descriptions:
        species = description.title.split(' ')[0]
        rets.extend((species, str(description.e)))
    for alignment in result.alignments:
        for hsp in alignment.hsps:
            rets.extend((
                str(hsp.frame[1]),
                str(hsp.query),
                str(hsp.match),
                str(hsp.sbjct_start),
            ))
    return rets
def draw_blast(path="C:/Users/arvid/Documents/arbeit/Blast.xml",
               max_entry=10,
               yMax=1000,
               xMax=1000):
    """Plot the HSPs of a BLAST XML result as horizontal bars.

    Every alignment gets its own y position (stepping down by half of
    ``yMax / max_entry``).  HSPs with an e value below 0.01 are drawn as a
    green segment from ``query_start`` to ``query_end`` and labelled with
    the alignment index and 20 characters of its title, as long as the
    alignment index is below ``max_entry``.

    Args:
        path: BLAST XML file to read.
        max_entry: Alignments whose 1-based index reaches this value are
            not drawn.
        yMax: Upper limit of the y axis.
        xMax: Right end of the red reference line.
    """
    with open(path) as handle:
        blast_record = NCBIXML.read(handle)
    E_VALUE_THRESH = 0.01
    dy = yMax / max_entry
    y = yMax - dy
    index = 0
    for alignment in blast_record.alignments:
        y = y - dy / 2
        index = index + 1
        for hsp in alignment.hsps:
            if hsp.expect < E_VALUE_THRESH and index < max_entry:
                # Label with the 20 characters that follow "PREDICTED: ".
                label_start = alignment.title.find("PREDICTED: ") + 11
                plt.text(
                    hsp.query_end, y, " " + str(index) + " " +
                    alignment.title[label_start:label_start + 20])
                plt.plot([hsp.query_start, hsp.query_end], [y, y], 'g-')
    plt.plot([0, xMax], [yMax - dy, yMax - dy], 'r-')
    plt.ylim((0, yMax))
    cur_axes = plt.gca()
    cur_axes.axes.get_yaxis().set_visible(False)
    plt.show()
def findOffTargets(refSeq, sgRNAseq):
    """Return the BLAST HSPs of ``sgRNAseq`` that pass the seed/PAM filters.

    The guide sequence is written to ``temp.fasta`` and searched with
    ``blastn-short`` against the ``testdb`` database (XML output in
    ``temp.xml``).  Only the HSPs of the first alignment are inspected.

    Args:
        refSeq: Reference sequence, indexed with the subject coordinates of
            each HSP.
        sgRNAseq: Guide sequence used as the query.

    Returns:
        The list of HSP objects that end at query position 20 or later,
        have a perfect match over their last 5 columns and are followed by
        ``GG`` (subject end > subject start) or preceded by ``CC``
        (otherwise) in ``refSeq``.  Empty when there is no alignment.
    """
    candidates = []  # Return this list of candidates
    # Text mode: the sequence is a str, which a binary handle rejects on
    # Python 3.
    with open('temp.fasta', 'w') as f:
        f.write(sgRNAseq + '\n')
    cline = NcbiblastnCommandline(query="temp.fasta",
                                  db="testdb",
                                  outfmt=5,
                                  out="temp.xml",
                                  task='blastn-short')
    cline()
    with open('temp.xml', 'r') as result:
        records = NCBIXML.read(result)
    if len(records.alignments) == 0:
        return candidates
    for record in records.alignments[0].hsps:
        if record.query_end < 20:  # Require ends at the seed
            continue
        if record.match[-5:] != '|' * 5:  # Require 5 bp of seed is perfect match
            continue
        end = record.sbjct_end
        if record.sbjct_end > record.sbjct_start:
            # On the + strand, sequence is from [start, end]
            if refSeq[end + 2:end + 4] == 'GG':
                candidates.append(record)
        else:
            # On the - strand
            if refSeq[end - 3:end - 1] == 'CC':
                candidates.append(record)
    return candidates
def blast_pdb(target_sequence, num_hits=1000):
    """
    Query the PDB using NCBI blast and return MSMSeeds initialized with
    the results

    Parameters
    ----------
    target_sequence : String
        The sequence of the target to use to query blast
    num_hits : int, optional
        The maximum number of hits returned by BLAST. Default: 1000

    Returns
    -------
    msmseeds : list of MSMSeed objects
        One MSMSeed per alignment, built from the target sequence, the
        template sequence and structure returned by ``_retrieve_chain`` for
        the alignment's accession, and the e-value of the alignment's
        first HSP.
    """
    from Bio.Blast import NCBIWWW, NCBIXML

    def _seed_from(hit):
        # Same order as before: e-value first, then the template lookup.
        evalue = hit.hsps[0].expect
        fasta, structure = _retrieve_chain(hit.accession)
        return MSMSeed(target_sequence, fasta, structure, evalue)

    handle = NCBIWWW.qblast("blastp", "pdb", target_sequence,
                            hitlist_size=num_hits)
    record = NCBIXML.read(handle)
    return [_seed_from(hit) for hit in record.alignments]
def get_blast_alignments(seq, query):
    """Run an online blastn search and keep the well-covered alignments.

    Args:
        seq: Query passed to ``NCBIWWW.qblast``; its ``len()`` is used as
            the query length.
        query: Entrez query restricting the search.

    Returns:
        The record's ``alignments`` list, filtered in place so that only
        alignments whose summed HSP ``align_length`` is at least 80% of the
        query length remain.
    """
    ncbi = NCBIWWW.qblast(program="blastn",
                          database="nr",
                          sequence=seq,
                          entrez_query=query,
                          format_type="XML",
                          hitlist_size=500,
                          expect=100.0)
    blast = NCBIXML.read(ncbi)
    query_length = len(seq)
    # One pass with slice assignment: the same list object is updated, but
    # without the quadratic cost of repeated list.remove() calls.
    blast.alignments[:] = [
        alignment for alignment in blast.alignments
        if not (sum(hsp.align_length for hsp in alignment.hsps)
                / float(query_length)) < 0.8
    ]
    return blast.alignments
def execute_blast(self, id_seq, limit, evalue):
    """BLAST a stored sequence online and print the hits.

    The sequence is looked up with ``self.db.getSeq(id_seq)``.  The raw XML
    answer of the blastn search against "nr" is saved to
    ``.blast_result.xml`` and then parsed from that file.

    Args:
        id_seq: Identifier handed to ``self.db.getSeq``.
        limit: Passed to qblast as ``hitlist_size``.
        evalue: HSPs with an e value strictly below this are printed in
            detail; for every other HSP a one-line notice is printed.

    Returns:
        None.  When the sequence does not exist, the message from
        ``Messages.nonexistent_sequence()`` is printed and nothing else
        happens.
    """
    seq = self.db.getSeq(id_seq)
    if seq is None:
        print(Messages.nonexistent_sequence())
        return
    blast_result = NCBIWWW.qblast("blastn", "nr", seq.getSeq(),
                                  hitlist_size=limit)
    try:
        xml_text = blast_result.read()
    finally:
        blast_result.close()
    with open(".blast_result.xml", "w") as file_blast:
        file_blast.write(xml_text)
    with open(".blast_result.xml") as file_blast:
        blast_record = NCBIXML.read(file_blast)
    for alignment in blast_record.alignments:
        for hsps in alignment.hsps:
            if hsps.expect < evalue:
                print(f"sequence: {alignment.title}")
                print(f"accession: {alignment.accession}")
                print(f"length: {alignment.length}")
                print(f"e value: {hsps.expect}")
                print(f"score: {hsps.score}")
                print(f"identities: {hsps.identities}")
            else:
                print("Inferior to the provided evalue!")
def parse(infolder, outfolder):
    """Convert BLAST XML files into JSON lists of PDB matches.

    Both folders are relative to ``global_settings.temp_folder``; the
    output folder is created when missing.  For every file whose name
    contains ``.xml`` a JSON file (``.xml`` replaced by ``.json``) is
    written containing one dict per HSP with a score above 100.  The PDB
    code and chain are taken from the 4th and 5th ``|``-separated fields of
    the alignment title.

    Args:
        infolder: Name of the folder holding the BLAST XML files.
        outfolder: Name of the folder receiving the JSON files.
    """
    in_dir = os.path.join(global_settings.temp_folder, infolder)
    out_dir = os.path.join(global_settings.temp_folder, outfolder)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    for filename in os.listdir(in_dir):
        if '.xml' not in filename:
            continue
        try:
            # Context manager so the XML handle is not left open.
            with open(os.path.join(in_dir, filename)) as handle:
                blast_record = NCBIXML.read(handle)
            matches = []
            for align in blast_record.alignments:
                for hsp in align.hsps:
                    if hsp.score > 100:
                        title_fields = align.title.split('|')
                        pdb = title_fields[3]
                        chain = title_fields[4][0]
                        d = {
                            'x': int(hsp.query_start),
                            'y': int(hsp.align_length + hsp.query_start),
                            'description': align.hit_def.split('>')[0],
                            'id': 'blastpdb_{p}_{x}_{y}_{c}'.format(
                                p=pdb,
                                c=chain,
                                x=hsp.query_start,
                                y=hsp.align_length + hsp.query_start),
                            'chain': chain,
                            'url': pdb,
                            'offset': hsp.sbjct_start - hsp.query_start,
                            'extra': {
                                'match': align.title[0:50],
                                'match_score': hsp.score,
                                'match_start': hsp.query_start,
                                'match_length': hsp.align_length,
                                'match_identity':
                                    hsp.identities / hsp.align_length
                            }
                        }
                        matches.append(d)
            with open(
                    os.path.join(out_dir,
                                 filename.replace('.xml', '.json')),
                    'w') as w:
                json.dump(matches, w)
        except ValueError as err:
            # Raised e.g. for an empty XML file ("why art thou so empty?").
            warn(f'Value error for {filename}: {err}')
def BlastGenome(queryFile, genome, debug,
                outputFile='Files/extras/temp_blast.xml'):
    """BLAST a query file against a local genome database.

    Args:
        queryFile: Path of the query passed to the BLAST command.
        genome: Genome name; spaces are stripped and the database path
            becomes ``Files/genome/<genome>/<genome>``.
        debug: When true, a trace line is printed; also forwarded to
            ``parseRecord``.
        outputFile: Where BLAST writes its XML output.

    Returns:
        The value of ``parseRecord(result, genomePath, debug)`` when the
        search produced alignments, otherwise the tuple
        ``(0, 0, 'Ambiguous', 'No Match', 'N/A')``.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    if debug:
        print("In BLASTing.BlastGenome")
    # Modify the genome filename to reflect the path to the genome
    genome = genome.replace(' ', '')
    genomePath = 'Files/genome/' + genome + '/' + genome
    # Call blast+ from python
    cline = ncl(query=queryFile, db=genomePath, out=outputFile, outfmt=5)
    ret_code = subprocess.call(str(cline), shell=True)
    if ret_code:
        print('BLASTing file "%s" returned error code %s' % (queryFile,
                                                              ret_code))
    with open(outputFile) as handle:
        result = nxml.read(handle)
    # If the blast returns no results, it will be treated as a gene
    # in the ambiguous region and oligos will be made from both strands
    if result.alignments:
        return parseRecord(result, genomePath, debug)
    return 0, 0, 'Ambiguous', 'No Match', 'N/A'
def __init__(self, ID, raw_blast_result, blast_object=None):
    """Collect the HSPs of one BLAST result and classify them.

    Args:
        ID: Stored as ``self.db_index`` and as first field of every hit.
        raw_blast_result: Path of a BLAST XML file; only read when
            ``blast_object`` is falsy.
        blast_object: An already parsed BLAST record, optional.

    Every entry of ``self.hits`` is the tuple ``(db_index, accession,
    hit_def, expect, query_start, query_end, sbjct_start, sbjct_end,
    identities / align_length)``.  ``self.pursue`` becomes 1 when no hit
    falls in ``NON_COLONIZER``.
    """
    self.db_index = ID
    self.blast_result = blast_object
    self.pursue = 0
    self.hits = []
    if not self.blast_result:
        with open(raw_blast_result, 'r') as record:
            self.blast_result = NCBIXML.read(record)
    self.clone = self.blast_result.query

    for align in self.blast_result.alignments:
        self.hits.extend(
            (self.db_index, align.accession, align.hit_def, hsp.expect,
             hsp.query_start, hsp.query_end, hsp.sbjct_start, hsp.sbjct_end,
             float(Decimal(hsp.identities) / Decimal(hsp.align_length)))
            for hsp in align.hsps)

    def _hits_in(group):
        # NOTE(review): item[2].split(' ') is a list, so the second test
        # checks a list for membership in `group` - confirm this is the
        # intended comparison.
        return [
            item for item in self.hits
            if (item[1] in group or item[2].split(' ') in group)
        ]

    self.colonizer_matches = _hits_in(COLONIZER)
    self.non_colonizer_matches = _hits_in(NON_COLONIZER)
    if not self.non_colonizer_matches:
        self.pursue = 1
def runBlast(runtype, sequence):
    """Run an online BLAST search and format the hits for display.

    Args:
        runtype: ``"n"`` for a nucleotide search (blastn against "nt") or
            ``"p"`` for a protein search (blastp against "nr").
        sequence: Raw sequence; it is wrapped in a FASTA record named
            ``Test`` before submission.

    Returns:
        One text block per HSP (title, length, e value and the first 75
        characters of query, match and subject), concatenated.

    Raises:
        ValueError: If ``runtype`` is neither ``"n"`` nor ``"p"``.  The
            type cannot be changed by the user, so this signals a
            programming error.
    """
    # Format sequence using FASTA standard
    fastaFormat = ">Test\n%s\n" % sequence
    # Set correct type of BLAST search to be performed
    if runtype == "n":  # Nucleotide
        blastType, db = "blastn", "nt"
    elif runtype == "p":  # Amino Acid / Protein
        blastType, db = "blastp", "nr"
    else:
        raise ValueError("INVALID BLAST TYPE")
    # Run BLAST query and read the result into a BLAST object
    result_handle = NCBIWWW.qblast(blastType, db, fastaFormat)
    try:
        blast_record = NCBIXML.read(result_handle)
    finally:
        result_handle.close()
    # Format result for display to User; joined once instead of growing a
    # string inside the loop.
    return "".join(
        "*****Alignment*****\n sequence: %s\n length: %s\n e value: %s\n %s...\n %s...\n %s... "
        % (alignment.title, alignment.length, hsp.expect, hsp.query[0:75],
           hsp.match[0:75], hsp.sbjct[0:75])
        for alignment in blast_record.alignments
        for hsp in alignment.hsps)
def blastdemo(genbankID):
    """Run blastp against swissprot for ``genbankID`` and print the hits.

    Every HSP with an e value below ``E_VALUE_THRESH`` is printed (title,
    length, e value and the first 75 characters of query, match and
    subject).  Afterwards the length of every alignment is printed.

    NB: to scale this up the search must be run locally on the cluster.

    Args:
        genbankID: Query handed to ``NCBIWWW.qblast``.
    """
    # run blastp on the swissprot database
    result_handle = NCBIWWW.qblast("blastp", "swissprot", genbankID)
    # read the results as XML
    blast_record = NCBIXML.read(result_handle)
    # Set this value to ridiculously low
    E_VALUE_THRESH = 0.00000000000000001
    # Every print below takes a single pre-formatted string, so the output
    # is the same on Python 2 and 3 (a two-argument print(...) would show a
    # tuple under Python 2).
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < E_VALUE_THRESH:
                print("****Alignment****")
                print("sequence: %s" % alignment.title)
                print("length: %s" % alignment.length)
                print("e value: %s" % hsp.expect)
                print(hsp.query[0:75] + "...")
                print(hsp.match[0:75] + "...")
                print(hsp.sbjct[0:75] + "...")
                print("\n")
    for a in blast_record.alignments:
        print(a.length)
def downloadNCBISequences(blastFile, sequencesFile, cutoff, verbose):
    """Download the sequences of sufficiently identical BLAST hits.

    The BLAST XML file is parsed, the 4th ``|``-separated field of every
    alignment title is taken as sequence ID, and the IDs of all HSPs whose
    identity percentage reaches ``cutoff`` are fetched from the NCBI
    "nuccore" database and written to ``sequencesFile`` in FASTA format.
    Any error during download or writing is reported as a warning.

    Args:
        blastFile: Path of the BLAST XML result.
        sequencesFile: Path of the FASTA file to write.
        cutoff: Minimum identity in percent (converted with ``float``).
        verbose: When true, print the decision taken for every HSP.
    """
    print('Obtaining sequences from NCBI...')
    with open(blastFile, 'r') as f:
        records = NCBIXML.read(f)
    if verbose:
        print('Found ' + str(len(records.alignments)) + ' matches')
    with open(sequencesFile, 'w') as f:
        sequences = []
        for alignment in records.alignments:
            for hsp in alignment.hsps:
                seqID = alignment.title.split('|')[3]
                identityPercent = 100.0 * float(hsp.identities) / float(
                    hsp.align_length)
                if identityPercent >= float(cutoff):
                    sequences.append(seqID)
                    if verbose:
                        print(seqID + " (identity " + str(identityPercent) +
                              "% >= cutoff " + str(cutoff) + "%) - adding")
                elif verbose:
                    print(seqID + " (identity " + str(identityPercent) +
                          "% < cutoff " + str(cutoff) + "%) - skipping")
        try:
            handle = Entrez.efetch(db="nuccore",
                                   id=",".join(sequences),
                                   rettype="fasta",
                                   retmode="xml")
            try:
                DNAsequences = [
                    SeqRecord(Seq.Seq(record['TSeq_sequence'],
                                      IUPAC.unambiguous_dna),
                              id=record['TSeq_accver'],
                              description=record['TSeq_defline'])
                    for record in Entrez.parse(handle)
                ]
                SeqIO.write(DNAsequences, f, "fasta")
            finally:
                # Release the connection even if parsing or writing fails.
                handle.close()
        except Exception as e:
            # Best effort: a failed download must not abort the caller.
            print("WARNING: unable to download this entry: " + str(e))
    print('OK')
def read_blast_xml(filename, **kwargs):
    """Read BLAST XML format.

    Args:
        filename: Path of a BLAST XML file holding a single record.
        **kwargs: Accepted for interface compatibility; not used.

    Returns:
        A pandas DataFrame with one row per alignment and the columns
        ``accession``, ``hit_def``, ``hit_id``, ``title``, ``length``,
        ``e_value`` and ``sequence`` (the last two from the alignment's
        first HSP).
    """
    # Read file.
    with open(filename, 'r') as f:
        blast_record = NCBIXML.read(f)
    # Prepare DataFrame fields.
    data = {
        'accession': [],
        'hit_def': [],
        'hit_id': [],
        'title': [],
        'length': [],
        'e_value': [],
        'sequence': []
    }
    # Get alignments from blast result.  Values are appended: assigning
    # them would replace each column list with the last scalar seen.
    for s in blast_record.alignments:
        data['accession'].append(s.accession)
        data['hit_def'].append(s.hit_def)
        data['hit_id'].append(s.hit_id)
        data['title'].append(s.title)
        data['length'].append(s.length)
        data['e_value'].append(s.hsps[0].expect)
        data['sequence'].append(s.hsps[0].sbjct)
    # Port to DataFrame.
    return pd.DataFrame(data)
def blast_test():
    '''BLAST result interpretation

    Load several BLAST result xml files into BlastRecord objects, once from
    a path and once from an already parsed record, and compare what
    ``match()`` returns with manually entered results.
    '''
    fluorescens_hit = (
        55, 'HM991502',
        'Pseudomonas fluorescens strain Q8r1-96 type III secretion gene cluster, complete sequence',
        0.0, 20, 992, 11854, 10866, 0.98, 1)
    # Everything after the record ID is shared by both single_blast2 checks.
    brassicacearum_fields = (
        'CP002585',
        'Pseudomonas brassicacearum subsp. brassicacearum NFM421, complete genome',
        0.0, 19, 525, 636636, 636111, 0.96, 0)

    from_path_1 = BlastRecord(55, 'tests/test_data/blast/single_blast1.xml')
    assert_equal(from_path_1.match(), fluorescens_hit)

    from_path_2 = BlastRecord(46, 'tests/test_data/blast/single_blast2.xml')
    assert_equal(from_path_2.match(), (46,) + brassicacearum_fields)

    with open('tests/test_data/blast/single_blast2.xml', 'r') as handle:
        parsed = NCBIXML.read(handle)
    from_object = BlastRecord(65, 'dummy_place_holder', parsed)
    assert_equal(from_object.match(), (65,) + brassicacearum_fields)
def blast(sequence, db):
    """BLAST a (circular) DNA sequence against a local feature database.

    The cleaned sequence is written twice in a row to a temporary FASTA
    file, so hits may span its end, and searched with blastn.  HSPs with
    less than 85% identity are skipped, and only HSPs starting within the
    first copy are reported; their end is wrapped back with a modulo.

    Args:
        sequence: Raw DNA sequence, cleaned with ``clean_dna_sequence``.
        db: Database name below ``settings.NCBI_DATA_DIR``.

    Returns:
        A list of ``Aligned_Feature`` objects.

    Raises:
        Exception: If the blastn process exits with a non-zero status.
    """
    query = clean_dna_sequence(sequence)
    query_len = len(query)
    feature_list = []
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmp:
        infile = tmp.name
        tmp.write(">Query\n%s\n" % (query + query,))
    outfile = "%s.out.xml" % (infile,)
    try:
        blast_cl = NcbiblastnCommandline(
            query=infile,
            db="%s/%s" % (settings.NCBI_DATA_DIR, db),
            evalue=0.001,
            word_size=6,
            outfmt=5,
            out=outfile)
        cl = "%s/%s" % (settings.NCBI_BIN_DIR, str(blast_cl))
        r = subprocess.call(cl.split(" "))
        if r != 0:
            raise Exception("Blast failed: %s" % (cl,))
        with open(outfile, "r") as xml_handle:
            blast_record = NCBIXML.read(xml_handle)
        for alignment in blast_record.alignments:
            accession = Blast_Accession(alignment.accession)
            for hsp in alignment.hsps:
                percent = 100.0 * hsp.identities / (1.0 * len(hsp.sbjct))
                if percent < 85:  # this is somewhat arbitrary...
                    continue
                start = hsp.query_start
                end = hsp.query_end
                if hsp.sbjct_end > hsp.sbjct_start:
                    clockwise = True
                    hit_start = hsp.sbjct_start
                    hit_end = hsp.sbjct_end
                else:
                    clockwise = False
                    hit_end = hsp.sbjct_start
                    hit_start = hsp.sbjct_end
                feature = alignment.hit_def
                if hit_start != 1 or hit_end != accession.feature_length:
                    feature = '%s (%s-%s/%s)' % (feature, hit_start, hit_end,
                                                 accession.feature_length)
                if start <= query_len:
                    end = end % query_len
                    feature_list.append(
                        Aligned_Feature(feature, alignment.hit_def, start,
                                        end, clockwise, accession.type,
                                        hsp.query, hsp.match, hsp.sbjct))
    finally:
        # Remove the temporary files on failure too; the output file does
        # not exist when blastn could not run.
        for path in (outfile, infile):
            if os.path.exists(path):
                os.unlink(path)
    return feature_list
def get_blast_records(self):
    """Return the parsed BLAST record, searching online only when needed.

    When ``self.search_output_file`` does not exist yet, an online search
    is run for ``self.record.seq`` and its raw output is stored in that
    file.  The file is then parsed with ``NCBIXML.read``.

    Returns:
        The BLAST record parsed from ``self.search_output_file``.
    """
    if not os.path.isfile(self.search_output_file):
        # Keyword arguments: after the sequence, qblast's positional
        # parameters are not entrez_query / hitlist_size / expect, so
        # passing these three by position sets unrelated options.
        blast_output = NCBIWWW.qblast(self.program,
                                      self.database,
                                      self.record.seq,
                                      entrez_query=self.entrez_query,
                                      hitlist_size=500,
                                      expect=100.0)
        with open(self.search_output_file, "w") as tempFile:
            tempFile.write(blast_output.read())
    with open(self.search_output_file) as blast_file:
        return NCBIXML.read(blast_file)
def run(self, input_seq):
    """BLAST ``input_seq`` online and return the leading alignments.

    Uses ``self.blast_program`` and ``self.db`` for the search.

    Returns:
        A list of ``(hit_id, alignment)`` tuples for at most
        ``self.top_results`` alignments, in result order.
    """
    handle = NCBIWWW.qblast(self.blast_program, self.db, input_seq,
                            auto_format='xml')
    record = NCBIXML.read(handle)

    def _logged_pairs(hits):
        # Log each hit right before it is emitted, as the plain loop did.
        for hit in hits:
            logger.debug(
                "Looping over alignments, current hit: {}".format(hit.hit_id))
            yield (hit.hit_id, hit)

    return list(_logged_pairs(record.alignments[:self.top_results]))
def blast_test():
    """BLAST result interpretation

    Load several BLAST result xml files into BlastRecord objects and
    compare the output of ``match()`` with manually entered results.  The
    last check feeds an already parsed record instead of a file path.
    """
    brassicacearum_fields = (
        "CP002585",
        "Pseudomonas brassicacearum subsp. brassicacearum NFM421, complete genome",
        0.0,
        19,
        525,
        636636,
        636111,
        0.96,
        0,
    )
    path_cases = [
        (
            55,
            "tests/test_data/blast/single_blast1.xml",
            (
                55,
                "HM991502",
                "Pseudomonas fluorescens strain Q8r1-96 type III secretion gene cluster, complete sequence",
                0.0,
                20,
                992,
                11854,
                10866,
                0.98,
                1,
            ),
        ),
        (
            46,
            "tests/test_data/blast/single_blast2.xml",
            (46,) + brassicacearum_fields,
        ),
    ]
    for db_index, xml_path, expected in path_cases:
        assert_equal(BlastRecord(db_index, xml_path).match(), expected)

    with open("tests/test_data/blast/single_blast2.xml", "r") as handle:
        blast = NCBIXML.read(handle)
    multi_test = BlastRecord(65, "dummy_place_holder", blast)
    assert_equal(multi_test.match(), (65,) + brassicacearum_fields)
def get_BLAST(taxid, queryseq):
    ''' BLAST queryseq (megablast against "nt") restricted to one taxon.

    Args:
        taxid: NCBI taxonomy ID, as a string or an integer; it is embedded
            in the Entrez query ``txid<taxid> [ORGN]``.
        queryseq: Query handed to ``NCBIWWW.qblast``.

    Returns:
        The BLAST record parsed by ``NCBIXML.read``.
    '''
    # format() instead of "+" so an integer taxid works as well.
    e_query = "txid{} [ORGN]".format(taxid)
    blast_result = NCBIWWW.qblast("blastn",
                                  "nt",
                                  queryseq,
                                  megablast=True,
                                  entrez_query=e_query,
                                  word_size='11',
                                  other_advanced='-G 5 -E 2')
    try:
        return NCBIXML.read(blast_result)
    finally:
        blast_result.close()
def seqdist(seq1, seq2):
    """Return the e value of the best blastp hit of ``seq1`` on ``seq2``.

    ``seq1`` is passed as the ``query`` option and ``seq2`` as the
    ``subject`` option of the blastp command line.

    Returns:
        The ``expect`` value of the first HSP of the first alignment, or
        -1 when blastp reports no alignment.
    """
    blastp = NcbiblastpCommandline(query=seq1, subject=seq2, outfmt=5,
                                   evalue=100000000)
    stdout_text, _stderr_text = blastp()
    hits = NCBIXML.read(StringIO.StringIO(stdout_text)).alignments
    if hits:
        return hits[0].hsps[0].expect
    return -1
def adaptor_blast(query, dbpatch="adaptor.fasta"):
    """BLAST ``query`` against a database built from an adaptor FASTA file.

    The nucleotide database is (re)built with ``makeblastdb`` on every
    call; checking whether it already exists would avoid that.

    Args:
        query: Text sent to blastn on standard input.
        dbpatch: Path of the adaptor FASTA file.  The database name is
            this path without its extension.

    Returns:
        The single BLAST record parsed by ``NCBIXML.read`` (not a
        generator).
    """
    import os

    # splitext only drops the final extension, so dots in directory names
    # or in the file stem no longer truncate the database name.
    db = os.path.splitext(dbpatch)[0]
    print(myexe("makeblastdb -in %s -dbtype nucl -input_type fasta -out %s"
                % (dbpatch, db)))
    blastn_cline = NcbiblastnCommandline(db=db, outfmt=5)
    out, err = blastn_cline(stdin=query)
    blast_records = NCBIXML.read(StringIO(out))
    return blast_records
def blast(self, input="search_output.xml"):
    """Run the configured online BLAST search and parse its result.

    The raw answer is saved to the file ``input`` and then parsed from
    there.

    Args:
        input: Path of the file used to store the raw BLAST output.

    Returns:
        The BLAST record parsed by ``NCBIXML.read``.
    """
    blastOutput = NCBIWWW.qblast(self.program,
                                 self.database,
                                 self.record.seq,
                                 entrez_query=self.entrezQuery,
                                 format_type=self.formatType)
    # The file must be closed (flushed) before it is reopened for reading,
    # otherwise the parser may see truncated XML.
    with open(input, "w") as outputFile:
        outputFile.write(blastOutput.read())
    with open(input) as result:
        return NCBIXML.read(result)
def blast_seqs(each_seq):
    '''
    Takes a sequence and runs an online blastn search against "nt".

    Returns the title of the first alignment that has at least one HSP,
    or None when there is no such alignment.
    '''
    # Search parameters chosen for short primers.
    search_options = dict(expect=0.04, hitlist_size=1000, word_size=7)
    blast_handle = NCBIWWW.qblast("blastn", "nt", each_seq, **search_options)
    blast_result = NCBIXML.read(blast_handle)
    blast_handle.close()
    return next(
        (hit.title for hit in blast_result.alignments if hit.hsps), None)
def parseBlastFile(xmlfil):
    """
    Input
    -----
    BLAST output XML file, named ``<Uniprot>_...xml``

    Usage
    -----
    parseBlastFile(xmlfil)
    e.g. parseBlastFile('O00238_blast.xml')

    Description
    -----------
    Parses the following information out of the blast output xml file,
    one row per HSP:
    Uniprot | PDBid | chain | query_to | query_from | Iter Query Len |
    e-value | Query Coverage | Sequence Identity

    The Uniprot ID is the part of ``xmlfil`` before the first underscore.
    PDB ID and chain come from the 4th and 5th ``|``-separated fields of
    the alignment title.

    Output
    ------
    <Uniprot>.csv
    """
    NP_id = xmlfil.split('_')[0]
    with open(xmlfil) as result_handle:
        blast_record = NCBIXML.read(result_handle)
    outfilname = NP_id + '.csv'
    with open(outfilname, "w") as out_file:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Writing output to %s" % (outfilname))
        out_file.write('Uniprot,PDBid,chain,query_to,query_from,IterQueryLen,e-value,QueryCov,SeqId\n')
        sequencequeryLength = blast_record.query_length
        for alignment in blast_record.alignments:
            title_fields = alignment.title.split('|')
            pdbid = str(title_fields[3])
            chain = str(title_fields[4].split()[0])
            for hsp in alignment.hsps:
                first = float(hsp.identities)
                second = len(hsp.query)
                identity = int(round(100 * float(first / second), 0))
                coverage = round(
                    100 * float(hsp.query_end - hsp.query_start)
                    / sequencequeryLength, 0)
                # One write per row instead of nine partial writes.
                out_file.write(",".join([
                    NP_id,
                    pdbid,
                    chain,
                    str(hsp.query_end),
                    str(hsp.query_start),
                    str(sequencequeryLength),
                    str(hsp.expect),
                    "%f" % coverage,
                    str(identity),
                ]) + "\n")
def runBlast(cline, bOutFile, locus_sbjct):
    """Run a BLAST command line and return its parsed result.

    Args:
        cline: Command; its ``str()`` is executed with ``os.system``.
        bOutFile: Path of the XML file the command writes; it is deleted
            after parsing.
        locus_sbjct: Path of a subject file that is deleted when it exists.

    Returns:
        The BLAST record parsed by ``NCBIXML.read``.
    """
    os.system(str(cline))
    # Close the handle before deleting the file: removing a file that is
    # still open fails on Windows.
    with open(bOutFile) as rec:
        blast_record = NCBIXML.read(rec)
    if os.path.isfile(locus_sbjct):
        os.remove(locus_sbjct)
    os.remove(bOutFile)
    return blast_record