def read_write_and_compare(self, source_file, source_format, out_file, out_format, **kwargs):
    """Round-trip a single QueryResult through a file and assert it is unchanged.

    The QueryResult read from ``source_file`` is written to ``out_file`` in
    ``out_format`` and read back; the two objects are then compared with
    ``compare_search_obj``. ``kwargs`` is forwarded unchanged to every
    ``SearchIO.read`` / ``SearchIO.write`` call.
    """
    original = SearchIO.read(source_file, source_format, **kwargs)
    SearchIO.write(original, out_file, out_format, **kwargs)
    round_tripped = SearchIO.read(out_file, out_format, **kwargs)
    objects_match = compare_search_obj(original, round_tripped)
    self.assertTrue(objects_match)
def parse_write_and_compare(self, source_file, source_format, out_file, out_format, **kwargs):
    """Check that parsed QueryResults survive a write/parse round trip.

    Every QueryResult parsed from ``source_file`` is written to ``out_file``
    in ``out_format`` and parsed back. The two sequences must contain the
    same number of QueryResults and compare equal pairwise (via
    ``compare_search_obj``).

    ``kwargs`` is forwarded unchanged to every ``SearchIO.parse`` and
    ``SearchIO.write`` call.
    """
    source_qresults = list(SearchIO.parse(source_file, source_format, **kwargs))
    SearchIO.write(source_qresults, out_file, out_format, **kwargs)
    out_qresults = list(SearchIO.parse(out_file, out_format, **kwargs))
    # zip() stops at the shorter input, so without this check a round trip
    # that loses (or gains) QueryResults would still pass.
    self.assertEqual(len(source_qresults), len(out_qresults))
    for source, out in zip(source_qresults, out_qresults):
        self.assertTrue(compare_search_obj(source, out))
def parse_write_and_compare(self, source_file, source_format, out_file, out_format, **kwargs):
    """Check that parsed QueryResults survive a write/parse round trip.

    Every QueryResult parsed from ``source_file`` is written to ``out_file``
    in ``out_format`` and parsed back. The two sequences must contain the
    same number of QueryResults and compare equal pairwise (via
    ``compare_search_obj``).

    ``kwargs`` is forwarded unchanged to every ``SearchIO.parse`` and
    ``SearchIO.write`` call.
    """
    source_qresults = list(
        SearchIO.parse(source_file, source_format, **kwargs))
    SearchIO.write(source_qresults, out_file, out_format, **kwargs)
    out_qresults = list(SearchIO.parse(out_file, out_format, **kwargs))
    # zip() stops at the shorter input, so without this check a round trip
    # that loses (or gains) QueryResults would still pass.
    self.assertEqual(len(source_qresults), len(out_qresults))
    for source, out in zip(source_qresults, out_qresults):
        self.assertTrue(compare_search_obj(source, out))
def start_queryResult_generator(inFile, fDic, work_sheet):
    """Split a multi-query BLAST XML file into per-hit files and log each query.

    ``SearchIO.parse`` is used so the records are streamed one QueryResult at
    a time rather than held in memory all at once (see
    http://biopython.org/DIST/docs/api/Bio.SearchIO.BlastIO-module.html).

    For every query, numbered from 1:
      * no hits: the whole QueryResult is written to
        ``fDic['topDir'] + fDic['noHit'] + 'Query_<n>_H_none.xml'``;
      * otherwise: each hit is sliced out into its own QueryResult and
        written to ``fDic['topDir'] + outputFileName(n, hit, best_hsp)``,
        where ``best_hsp`` is that hit's HSP with the lowest e-value (the
        first one wins on ties).
    In both cases the query is then recorded with ``write_qr_to_ws``. Once
    every query has been processed, ``build_ws_header`` is called with the
    largest hit count seen.

    Args:
        inFile: BLAST XML input, passed straight to ``SearchIO.parse``.
        fDic: mapping that supplies the ``'topDir'`` and ``'noHit'`` path
            pieces.
        work_sheet: worksheet object handed to ``write_qr_to_ws`` and
            ``build_ws_header``.

    Returns:
        The generator obtained from ``SearchIO.parse``. This function has
        already iterated it to the end, so it yields nothing further.
    """
    qGenerator = SearchIO.parse(inFile, 'blast-xml')
    max_hits = 0
    for query_count, query_result in enumerate(qGenerator, 1):
        print('Processing Query BLAST return ' + str(query_count))
        hits = query_result.hits
        number_hits = len(hits)
        # Remember the largest hit count so the header can be made wide enough.
        max_hits = max(max_hits, number_hits)
        if number_hits == 0:
            # Queries without hits go to the "no hit" sub-folder.
            filename = str(fDic['topDir'] + fDic['noHit'] + 'Query_'
                           + str(query_count) + '_H_none.xml')
            SearchIO.write(query_result, filename, 'blast-xml')
        else:
            for hit_index, hit in enumerate(hits):
                best_hsp = min(hit.hsps, key=lambda hsp: hsp.evalue)
                filename = str(fDic['topDir']
                               + outputFileName(query_count, hit, best_hsp))
                # A one-element slice yields a QueryResult holding only this hit.
                SearchIO.write(query_result[hit_index:hit_index + 1],
                               filename, 'blast-xml')
        # Record the query in the worksheet whether or not it had hits.
        write_qr_to_ws(query_count, query_result, work_sheet)
    build_ws_header(work_sheet, max_hits)
    return qGenerator
def MergeBLASTsAndWrite(results):
    """Combine individual BLASTp outputs and write them as one tabular file.

    Each non-empty entry of ``results`` is a commented BLAST tabular report
    held as a string. The last two elements of its newline split (the
    "# BLAST processed x queries" trailer) are dropped so that the
    concatenation parses as one continuous report. The merged text is parsed
    with ``comments=True`` and written, without comments, to
    ``panoct.blast`` in the current directory.
    """
    logging.info(
        "BLASTAll: Merging all-vs.-all results together and parsing into tabular format."
    )
    trimmed_reports = []
    for raw_report in results:
        if not raw_report:
            continue
        kept_lines = raw_report.split("\n")[:-2]
        trimmed_reports.append("\n".join(kept_lines))
    merged_text = "\n".join(trimmed_reports)
    merged_handle = cStringIO.StringIO(merged_text)
    query_results = SearchIO.parse(merged_handle, "blast-tab", comments=True)

    # Output consumed by PanOCT.
    logging.info("BLASTAll: Writing BLASTp results to file panoct.blast.")
    SearchIO.write(query_results, "panoct.blast", "blast-tab")
def setUp(self):
    """Create the temporary input and output files the tests work with.

    Sets:
        self.blast_xml: BLAST XML holding two QueryResults, both derived from
            ``blast_in`` and filtered to HSPs with raw bitscore >= 95 and
            strictly between 80 and 95 respectively.
        self.query_double: text file with the contents of ``blast_query``
            written twice.
        self.csv, self.html, self.pandas_dump, self.json: paths of empty
            temporary files.
    """
    xml_fd, xml_path = tempfile.mkstemp(prefix='rba_test_xml', suffix='_t21')
    parsed = SearchIO.read(blast_in, 'blast-xml')
    high_band = deepcopy(parsed).hsp_filter(lambda hsp: hsp.bitscore_raw >= 95)
    mid_band = deepcopy(parsed).hsp_filter(lambda hsp: 80 < hsp.bitscore_raw < 95)
    with os.fdopen(xml_fd, 'w') as xml_handle:
        SearchIO.write([high_band, mid_band], xml_handle, 'blast-xml')
    self.blast_xml = xml_path

    fasta_fd, fasta_path = tempfile.mkstemp(prefix='rba_test_double_fasta', suffix='_t22')
    with os.fdopen(fasta_fd, 'w') as doubled, open(blast_query, 'r') as single:
        query_text = single.read()
        doubled.write('{}\n{}\n'.format(query_text, query_text))
    self.query_double = fasta_path

    # Only the paths are needed here, so each descriptor is closed right away.
    placeholders = (
        ('csv', '_t7.csv'),
        ('html', '_t8.html'),
        ('pandas_dump', '_t9.pandas_dump'),
        ('json', '_t10.json'),
    )
    for attr_name, suffix in placeholders:
        placeholder_fd, placeholder_path = tempfile.mkstemp(prefix='rba_', suffix=suffix)
        os.close(placeholder_fd)
        setattr(self, attr_name, placeholder_path)
def deleteFromAlignResult(self, projectPath, db, file, alignResult):
    """Remove one sequence from the stored BLAST results of a database.

    Walks ``projectPath/alignResult/db`` recursively. A result file whose
    name matches the target according to ``self.contains`` is deleted. Any
    other file is read as BLAST XML, every hit whose id equals the target is
    dropped, and the file is rewritten. Files without such a hit are left
    untouched.

    Args:
        projectPath: root directory of the project.
        db: name of the database sub-directory to clean.
        file: name of the sequence file being removed. Its last three
            characters are stripped to obtain the target name (presumably a
            short extension such as ".fa"; confirm with callers).
        alignResult: name of the alignment-results directory under
            ``projectPath``.
    """
    target = file[:-3]
    blastResultPath = projectPath + "/" + alignResult + "/" + db
    # Take the file names from os.walk itself: os.listdir() would also return
    # sub-directories, which cannot be opened and parsed as result files.
    for base, _dirs, file_names in os.walk(blastResultPath):
        for name in file_names:
            filePath = os.path.join(base, name)
            # The result file belongs to the sequence being removed: delete it.
            if self.contains(name, target):
                os.remove(filePath)
                continue
            # Otherwise the sequence may still appear as a hit inside the file.
            with open(filePath) as originalXml:
                result = SearchIO.read(originalXml, "blast-xml")
            stale_indices = [
                index for index, hit in enumerate(result)
                if hit[0].hit.id == target
            ]
            if not stale_indices:
                continue
            # Pop from the end so the remaining indices stay valid.
            for index in reversed(stale_indices):
                result.pop(index)
            SearchIO.write(result, filePath, 'blast-xml')
def write_filtered_xml(self):
    """Write ``self.filtered_list`` to ``self.output_file`` as BLAST XML."""
    print('Writing output file...')
    qresults, destination = self.filtered_list, self.output_file
    SearchIO.write(qresults, destination, format='blast-xml')
def iterator(blast_dict): for entry in blast_dict: entry_hits = [] for hit in blast_dict[entry].hits: if hit.id not in black_list: # filter hits entry_hits.append(hit) if entry_hits: yield QueryResult(hits=entry_hits, id=entry) elif args.mode == "both": def iterator(blast_dict): for entry in blast_dict: if entry not in black_list: entry_hits = [] for hit in blast_dict[entry].hits: if hit.id not in black_list: # filter hits entry_hits.append(hit) if entry_hits: yield QueryResult(hits=entry_hits, id=entry) blast_results = SearchIO.index(args.input, args.format) SearchIO.write(iterator(blast_results), args.output, args.format) if args.output != "output": out_fd.close() if args.input != "stdin": in_fd.close()
def __main__():
    """Filter a BLAST text report and write the surviving queries as blast-tab.

    Command line:
        blast_file  BLAST text output to process (required).
        -o          name of the output file (default ``filtered_blast.txt``).

    Query results without hits are skipped. Of the rest, only those whose
    hits sum to a length of exactly one (``len(hit)`` over all hits, i.e. one
    HSP in total in Bio.SearchIO's model) are kept; the others are counted as
    filtered out. A summary is printed before and after writing.

    Exits via ``sys.exit`` with an error message when ``blast_file`` is not
    an existing file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('blast_file', type=str, help="A blast file to process")
    parser.add_argument('-o', type=str, help="A name for the output file")
    args = parser.parse_args()

    if not os.path.isfile(args.blast_file):
        sys.exit("\n%s is not a valid file!\n" % args.blast_file)

    output_file = "filtered_blast.txt" if args.o is None else args.o

    # Each item from SearchIO.parse is a QueryResult:
    # http://biopython.org/DIST/docs/api/Bio.SearchIO._model.query.QueryResult-class.html
    blast_data = []
    num_total_qresults = 0
    num_filtered_qresults = 0
    for qresult in SearchIO.parse(args.blast_file, 'blast-text'):
        hits = qresult.hits
        if not hits:
            continue
        num_total_qresults += 1
        if sum(len(hit) for hit in hits) == 1:
            blast_data.append(qresult)
        else:
            num_filtered_qresults += 1
    # NOTE: an earlier variant that kept the max-bitscore hit
    # (filter_max_bitscore) is disabled; only the rule above applies.

    print("Filted out %d positions from %d total query results"
          % (num_filtered_qresults, num_total_qresults))

    with open(output_file, "w") as output_handle:
        counts = SearchIO.write(blast_data, output_handle, 'blast-tab')
    # SearchIO.write returns (QueryResults, Hits, HSPs, HSPFragments); report
    # the number of QueryResults rather than printing the whole tuple.
    print("Wrote out %s blast records to '%s'" % (counts[0], output_file))
# Script: run a blastn search through Biopython's command-line wrapper and
# convert the XML report it produces into BLAST tabular format.
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SearchIO

# BLAST database to search against (hard-coded path).
humdb="/mithril/Data/Pacbio/Aligned/151019_proc/blast/humiso_blast"

# Build the blastn command: query temp.fasta against humdb and send the report
# to try.xml. outfmt=5 is requested because the report is parsed as
# 'blast-xml' below.
blastn_cline=NcbiblastnCommandline(query="temp.fasta", db=humdb, gapopen=1, gapextend=2, word_size=9, reward=1, evalue=10, outfmt=5, out="try.xml")

# Invoke the command; the call's two return values are bound to stdout and
# stderr.
stdout, stderr=blastn_cline()

# Read the XML report back and write it out again as BLAST tabular text.
bres=SearchIO.read("try.xml", 'blast-xml')
SearchIO.write(bres, 'try.tsv', 'blast-tab')
# OK - this was nice, but it can't output what we need because BLAST is
# pairwise; I think we actually want a MAF.
# Script: run a blastn search through Biopython's command-line wrapper and
# convert the XML report it produces into BLAST tabular format.
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SearchIO

# BLAST database to search against (hard-coded path).
humdb = "/mithril/Data/Pacbio/Aligned/151019_proc/blast/humiso_blast"

# Build the blastn command: query temp.fasta against humdb and send the report
# to try.xml. outfmt=5 is requested because the report is parsed as
# 'blast-xml' below.
blastn_cline = NcbiblastnCommandline(query="temp.fasta", db=humdb, gapopen=1, gapextend=2, word_size=9, reward=1, evalue=10, outfmt=5, out="try.xml")

# Invoke the command; the call's two return values are bound to stdout and
# stderr.
stdout, stderr = blastn_cline()

# Read the XML report back and write it out again as BLAST tabular text.
bres = SearchIO.read("try.xml", 'blast-xml')
SearchIO.write(bres, 'try.tsv', 'blast-tab')
# OK - this was nice, but it can't output what we need because BLAST is
# pairwise; I think we actually want a MAF.