def check_raw(self, filename, id, raw, **kwargs):
    """Check that get_raw(id) on an index of filename gives back raw.

    The file is indexed with SearchIO.index and, when the module-level
    sqlite3 name is truthy, again with SearchIO.index_db; the extra keyword
    arguments are forwarded to both. If a file named filename + ".bgz"
    exists, the whole check is repeated on it.
    """

    def unix_newlines(data):
        # The raw string and the file may use different newline
        # characters, so both sides are compared with CRLF folded to LF.
        return data.replace(b'\r\n', b'\n')

    # First pass: plain index.
    plain_index = SearchIO.index(filename, self.fmt, **kwargs)
    raw = _as_bytes(raw)
    fetched = plain_index.get_raw(id)
    self.assertTrue(
        isinstance(fetched, bytes),
        "Didn't get bytes from %s get_raw" % self.fmt,
    )
    self.assertEqual(unix_newlines(raw), unix_newlines(fetched))
    plain_index.close()

    # Second pass: the SQLite backed index, if sqlite3 is usable.
    if sqlite3:
        db_index = SearchIO.index_db(":memory:", filename, self.fmt, **kwargs)
        fetched = db_index.get_raw(id)
        self.assertTrue(
            isinstance(fetched, bytes),
            "Didn't get bytes from %s get_raw" % self.fmt,
        )
        self.assertEqual(unix_newlines(raw), unix_newlines(fetched))
        db_index.close()

    # Bonus pass: repeat everything on the BGZF compressed copy.
    bgzf_name = filename + ".bgz"
    if os.path.isfile(bgzf_name):
        print("[BONUS %s.bgz]" % filename)
        self.check_raw(bgzf_name, id, raw, **kwargs)
def handle_input(filename):
    """Write the first hit of every query in filename to a top-hits table.

    The file is indexed with SearchIO.index_db (format ``args.format``) into
    a temporary ``<prefix>.tmp.idx`` file, where prefix is
    ``FileRoutines.split_filename(filename)[1]``. The index is closed and its
    file removed before returning, also when processing fails.

    For every query that has hits, the first hit is written as a
    tab-separated ``query, hit id, evalue, bitscore`` line if its
    ``is_included`` flag is set; otherwise the query id is collected as not
    significant. Queries without hits are collected as not found.

    Output goes to ``sys.stdout`` when ``args.output == "stdout"``, else to
    ``<args.top_hits_dir><prefix>.top_hits``.

    Returns:
        A ``(not_significant_ids, not_found_ids)`` tuple of IdList objects.
    """
    sys.stdout.write("Handling %s\n" % filename)
    not_significant_ids = IdList()
    not_found_ids = IdList()

    prefix = FileRoutines.split_filename(filename)[1]
    index_file = "%s.tmp.idx" % prefix
    hmm_dict = SearchIO.index_db(index_file, filename, args.format)
    try:
        write_to_stdout = args.output == "stdout"
        if write_to_stdout:
            out_fd = sys.stdout
        else:
            out_fd = open("%s%s.top_hits" % (args.top_hits_dir, prefix), "w")
        try:
            out_fd.write("#query\thit\tevalue\tbitscore\n")
            for query in hmm_dict:
                # Look the record up once; every subscript on the index
                # fetches the query result again.
                query_result = hmm_dict[query]
                if not query_result.hits:
                    not_found_ids.append(query)
                    continue
                top_hit = query_result[0]
                if top_hit.is_included:
                    out_fd.write("%s\t%s\t%s\t%s\n" % (query, top_hit.id,
                                                       top_hit.evalue,
                                                       top_hit.bitscore))
                else:
                    not_significant_ids.append(query)
        finally:
            # Never close sys.stdout, only a file opened here.
            if not write_to_stdout:
                out_fd.close()
    finally:
        # Close the index before deleting the file that backs it.
        hmm_dict.close()
        os.remove(index_file)
    return not_significant_ids, not_found_ids
def extract_top_hits(hmmer_hits, top_hits_file, top_hits_ids_file=None,
                     not_significant_ids_file=None, not_found_ids_file=None):
    """Write the first hit of every query in hmmer_hits to top_hits_file.

    hmmer_hits is indexed as "hmmer3-text" with SearchIO.index_db into the
    temporary file "hmmer_hits.tmp.idx", which is closed and removed once
    the table has been written (also when writing fails).

    For every query that has hits, the first hit is written as a
    tab-separated ``query, hit id, evalue, bitscore`` line if its
    ``is_included`` flag is set, and the query id is collected as a top hit;
    otherwise the query id is collected as not significant. Queries without
    hits are collected as not found.

    Args:
        hmmer_hits: input passed to SearchIO.index_db.
        top_hits_file: path of the table to write.
        top_hits_ids_file: if truthy, the collected top-hit query ids are
            written there.
        not_significant_ids_file: if truthy, the not-significant query ids
            are written there.
        not_found_ids_file: if truthy, the ids of queries without hits are
            written there.
    """
    top_hits_ids = IdList()
    not_significant_ids = IdList()
    not_found_ids = IdList()

    index_file = "hmmer_hits.tmp.idx"
    hmm_dict = SearchIO.index_db(index_file, hmmer_hits, "hmmer3-text")
    try:
        # The context manager guarantees the table is flushed and closed.
        with open(top_hits_file, "w") as out_fd:
            out_fd.write("#query\thit\tevalue\tbitscore\n")
            for query in hmm_dict:
                # Look the record up once; every subscript on the index
                # fetches the query result again.
                query_result = hmm_dict[query]
                if not query_result.hits:
                    not_found_ids.append(query)
                    continue
                top_hit = query_result[0]
                if top_hit.is_included:
                    out_fd.write("%s\t%s\t%s\t%s\n" % (query, top_hit.id,
                                                       top_hit.evalue,
                                                       top_hit.bitscore))
                    top_hits_ids.append(query)
                else:
                    not_significant_ids.append(query)
    finally:
        # Close the index before deleting the file that backs it.
        hmm_dict.close()
        os.remove(index_file)

    if not_significant_ids_file:
        not_significant_ids.write(not_significant_ids_file)

    if not_found_ids_file:
        not_found_ids.write(not_found_ids_file)

    if top_hits_ids_file:
        top_hits_ids.write(top_hits_ids_file)
def check_index(self, filename, format, **kwargs):
    """Check that indexed access to filename matches sequential parsing.

    The file is parsed with SearchIO.parse and indexed with SearchIO.index
    and, if sqlite3 can be imported, SearchIO.index_db. The number of
    records must agree, and every parsed query result must be a distinct
    object from, yet compare equal (via compare_search_obj) to, the one
    retrieved by id from each index. Extra keyword arguments are forwarded
    to all three SearchIO calls.
    """
    # check if Python3 installation has sqlite3
    try:
        import sqlite3
    except ImportError:
        sqlite3 = None

    parsed = list(SearchIO.parse(filename, format, **kwargs))
    # compare values by index
    indexed = SearchIO.index(filename, format, **kwargs)
    self.assertEqual(len(parsed), len(indexed.keys()))
    # compare values by index_db, only if sqlite3 is present
    if sqlite3 is not None:
        db_indexed = SearchIO.index_db(':memory:', [filename], format,
                                       **kwargs)
        self.assertEqual(len(parsed), len(db_indexed.keys()))

    for qres in parsed:
        idx_qres = indexed[qres.id]
        # parsed and indexed qresult are different objects!
        self.assertNotEqual(id(qres), id(idx_qres))
        # but they should have the same attribute values
        self.assertTrue(compare_search_obj(qres, idx_qres))
        # sqlite3 comparison, only if it's present
        if sqlite3 is not None:
            dbidx_qres = db_indexed[qres.id]
            self.assertNotEqual(id(qres), id(dbidx_qres))
            self.assertTrue(compare_search_obj(qres, dbidx_qres))

    # Close through the index's own close() method instead of reaching
    # into its private proxy handle.
    indexed.close()
    if sqlite3 is not None:
        db_indexed.close()
        db_indexed._con.close()
def parse_search_file(input_file, mode, format="hmmer3-text", index_file=None):
    """Load search results from one or several files.

    Args:
        input_file: a single path (str) or a sequence of paths.
        mode: "index_db", "index" or "parse". A sequence holding more than
            one path is always handled with SearchIO.index_db, whatever the
            mode; "index" and "parse" otherwise read only the first path.
        format: format name handed to SearchIO.
        index_file: index file used by SearchIO.index_db; "tmp.idx" when
            not given.

    Returns:
        The object returned by SearchIO.index_db or SearchIO.index, or, for
        "parse", an OrderedDict mapping record id to record in file order.

    Raises:
        ValueError: if mode is not one of the supported values and the input
            does not force the "index_db" path.
    """
    single_path = isinstance(input_file, str)

    if mode == "index_db" or (not single_path and len(input_file) > 1):
        index = index_file if index_file else "tmp.idx"
        return SearchIO.index_db(index,
                                 [input_file] if single_path else input_file,
                                 format=format)

    # Reject unknown modes explicitly instead of failing later on an
    # unassigned result variable.
    if mode not in ("index", "parse"):
        raise ValueError("Unknown mode %r: expected 'index_db', 'index' or 'parse'"
                         % (mode,))

    first_file = input_file if single_path else input_file[0]
    if mode == "index":
        return SearchIO.index(first_file, format=format)

    seq_dict = OrderedDict()
    for record in SearchIO.parse(first_file, format=format):
        seq_dict[record.id] = record
    return seq_dict
def check_index(self, filename, format, **kwargs):
    """Check that indexed access to filename matches sequential parsing.

    The file is parsed with SearchIO.parse (through gzip.open when the name
    ends in ".bgz") and indexed with SearchIO.index and, if sqlite3 can be
    imported, SearchIO.index_db. The record counts must agree, and every
    parsed query result must be a distinct object from, yet compare equal
    (via compare_search_obj) to, the one retrieved by id from each index.
    Extra keyword arguments are forwarded to all SearchIO calls.

    If a file named filename + ".bgz" exists, the whole check is repeated
    on it.
    """
    # check if Python3 installation has sqlite3
    try:
        import sqlite3
    except ImportError:
        sqlite3 = None

    if filename.endswith(".bgz"):
        # The context manager closes the handle even if parsing fails.
        with gzip.open(filename) as handle:
            parsed = list(SearchIO.parse(handle, format, **kwargs))
    else:
        parsed = list(SearchIO.parse(filename, format, **kwargs))
    # compare values by index
    indexed = SearchIO.index(filename, format, **kwargs)
    self.assertEqual(
        len(parsed), len(indexed),
        "Should be %i records in %s, index says %i"
        % (len(parsed), filename, len(indexed)))
    # compare values by index_db, only if sqlite3 is present
    if sqlite3 is not None:
        db_indexed = SearchIO.index_db(':memory:', [filename], format,
                                       **kwargs)
        self.assertEqual(
            len(parsed), len(db_indexed),
            "Should be %i records in %s, index_db says %i"
            % (len(parsed), filename, len(db_indexed)))

    for qres in parsed:
        idx_qres = indexed[qres.id]
        # parsed and indexed qresult are different objects!
        self.assertNotEqual(id(qres), id(idx_qres))
        # but they should have the same attribute values
        self.assertTrue(compare_search_obj(qres, idx_qres))
        # sqlite3 comparison, only if it's present
        if sqlite3 is not None:
            dbidx_qres = db_indexed[qres.id]
            self.assertNotEqual(id(qres), id(dbidx_qres))
            self.assertTrue(compare_search_obj(qres, dbidx_qres))

    indexed.close()
    if sqlite3 is not None:
        db_indexed.close()
        db_indexed._con.close()

    if os.path.isfile(filename + ".bgz"):
        # Do the tests again with the BGZF compressed file
        print("[BONUS %s.bgz]" % filename)
        self.check_index(filename + ".bgz", format, **kwargs)
def check_index(self, filename, format, **kwargs):
    """Compare SearchIO.parse results for filename with its indexes.

    Steps performed:

    1. Parse the file sequentially; names ending in ".bgz" are read through
       gzip.open.
    2. Build a SearchIO.index and, when sqlite3 is importable, a
       SearchIO.index_db index, and check both report the parsed record
       count.
    3. For each parsed query result, fetch the same id from each index and
       check it is a different object that compare_search_obj accepts as
       equal.
    4. Close the indexes, then repeat everything on filename + ".bgz" if
       such a file exists.

    Extra keyword arguments are forwarded to all SearchIO calls.
    """
    # check if Python3 installation has sqlite3
    try:
        import sqlite3
    except ImportError:
        sqlite3 = None

    if filename.endswith(".bgz"):
        # Use the handle as a context manager so it is closed even when
        # the parser raises part-way through the file.
        with gzip.open(filename) as handle:
            parsed = list(SearchIO.parse(handle, format, **kwargs))
    else:
        parsed = list(SearchIO.parse(filename, format, **kwargs))

    # compare values by index
    indexed = SearchIO.index(filename, format, **kwargs)
    self.assertEqual(len(parsed), len(indexed),
                     "Should be %i records in %s, index says %i"
                     % (len(parsed), filename, len(indexed)))

    # compare values by index_db, only if sqlite3 is present
    if sqlite3 is not None:
        db_indexed = SearchIO.index_db(':memory:', [filename], format,
                                       **kwargs)
        self.assertEqual(len(parsed), len(db_indexed),
                         "Should be %i records in %s, index_db says %i"
                         % (len(parsed), filename, len(db_indexed)))

    for qres in parsed:
        idx_qres = indexed[qres.id]
        # parsed and indexed qresult are different objects!
        self.assertNotEqual(id(qres), id(idx_qres))
        # but they should have the same attribute values
        self.assertTrue(compare_search_obj(qres, idx_qres))
        # sqlite3 comparison, only if it's present
        if sqlite3 is not None:
            dbidx_qres = db_indexed[qres.id]
            self.assertNotEqual(id(qres), id(dbidx_qres))
            self.assertTrue(compare_search_obj(qres, dbidx_qres))

    indexed.close()
    if sqlite3 is not None:
        db_indexed.close()
        db_indexed._con.close()

    if os.path.isfile(filename + ".bgz"):
        # Do the tests again with the BGZF compressed file
        print("[BONUS %s.bgz]" % filename)
        self.check_index(filename + ".bgz", format, **kwargs)
def check_raw(self, filename, id, raw, **kwargs):
    """Index filename using **kwargs, check get_raw(id)==raw.

    The check is made on a SearchIO.index of the file and, when the
    module-level sqlite3 name is truthy, on a SearchIO.index_db index as
    well. If a file named filename + ".bgz" exists, the whole check is
    repeated on it.
    """
    idx = SearchIO.index(filename, self.fmt, **kwargs)
    raw = _as_bytes(raw)
    self.assertEqual(raw, idx.get_raw(id))
    idx.close()

    # Now again, but using SQLite backend
    if sqlite3:
        idx = SearchIO.index_db(":memory:", filename, self.fmt, **kwargs)
        self.assertEqual(raw, idx.get_raw(id))
        idx.close()

    if os.path.isfile(filename + ".bgz"):
        # Do the tests again with the BGZF compressed file.
        # The parenthesised single-argument form prints the same text as
        # a print statement and as a call to the print function.
        print("[BONUS %s.bgz]" % filename)
        self.check_raw(filename + ".bgz", id, raw, **kwargs)
def check_index(self, filename, format, **kwargs):
    """Verify that indexes of filename agree with a sequential parse.

    Query results parsed with SearchIO.parse (via gzip.open for names ending
    in ".bgz") are compared against SearchIO.index and, when the
    module-level sqlite3 name is not None, SearchIO.index_db: record counts
    must match, and each indexed query result must be a separate object that
    passes self.compare_search_obj against the parsed one. Extra keyword
    arguments are forwarded to all SearchIO calls.

    If filename + ".bgz" exists, the check is repeated on that file.
    """
    use_sqlite = sqlite3 is not None

    # Sequential parse, the reference for everything below.
    if not filename.endswith(".bgz"):
        parsed = list(SearchIO.parse(filename, format, **kwargs))
    else:
        with gzip.open(filename) as handle:
            parsed = list(SearchIO.parse(handle, format, **kwargs))

    # Record count according to the plain index.
    indexed = SearchIO.index(filename, format, **kwargs)
    count_message = "Should be %i records in %s, index says %i" % (
        len(parsed),
        filename,
        len(indexed),
    )
    self.assertEqual(len(parsed), len(indexed), count_message)

    # Record count according to the SQLite backed index, when available.
    if use_sqlite:
        db_indexed = SearchIO.index_db(":memory:", [filename], format, **kwargs)
        db_count_message = "Should be %i records in %s, index_db says %i" % (
            len(parsed),
            filename,
            len(db_indexed),
        )
        self.assertEqual(len(parsed), len(db_indexed), db_count_message)

    for qres in parsed:
        # Distinct objects, same attribute values.
        from_index = indexed[qres.id]
        self.assertNotEqual(id(qres), id(from_index))
        self.compare_search_obj(qres, from_index)
        if not use_sqlite:
            continue
        from_db = db_indexed[qres.id]
        self.assertNotEqual(id(qres), id(from_db))
        self.compare_search_obj(qres, from_db)

    indexed.close()
    if use_sqlite:
        db_indexed.close()
        db_indexed._con.close()

    # Bonus round on the BGZF compressed copy, if there is one.
    bgzf_name = filename + ".bgz"
    if os.path.isfile(bgzf_name):
        print(f"[BONUS {filename}.bgz]")
        self.check_index(bgzf_name, format, **kwargs)
def handle_input(filename):
    """Write every included hit of every query in filename to a table.

    The file is indexed with SearchIO.index_db (format ``args.format``) into
    a temporary ``<prefix>.tmp.idx`` file, where prefix is
    ``FileRoutines.split_filename(filename)[1]``. The index is closed and its
    file removed before returning, also when processing fails.

    Each hit whose ``is_included`` flag is set is written as a tab-separated
    ``query, hit id, evalue, bitscore`` line. Output goes to ``sys.stdout``
    when ``args.output == "stdout"``, else to
    ``<args.top_hits_dir><prefix>.top_hits``.
    """
    sys.stdout.write("Handling %s\n" % filename)
    prefix = FileRoutines.split_filename(filename)[1]
    index_file = "%s.tmp.idx" % prefix
    hmm_dict = SearchIO.index_db(index_file, filename, args.format)
    try:
        write_to_stdout = args.output == "stdout"
        if write_to_stdout:
            out_fd = sys.stdout
        else:
            out_fd = open("%s%s.top_hits" % (args.top_hits_dir, prefix), "w")
        try:
            out_fd.write("#query\thit\tevalue\tbitscore\n")
            for family in hmm_dict:
                for hit in hmm_dict[family]:
                    if hit.is_included:
                        out_fd.write("%s\t%s\t%s\t%s\n" % (family, hit.id,
                                                           hit.evalue,
                                                           hit.bitscore))
        finally:
            # Never close sys.stdout, only a file opened here.
            if not write_to_stdout:
                out_fd.close()
    finally:
        # Close the index before deleting the file that backs it.
        hmm_dict.close()
        os.remove(index_file)