import os
from os import listdir
from os.path import isfile as pisfile
from os.path import join as pjoin


def batch_rename(folderpath, old_substring, new_substring):
    '''Replace every occurrence of a substring in the file names of a folder.'''
    for f in listdir(folderpath):
        fp = pjoin(folderpath, f)
        if pisfile(fp):
            new_name = f.replace(old_substring, new_substring)
            os.rename(fp, pjoin(folderpath, new_name))
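# Minimal usage sketch for batch_rename (the folder path and substrings are
# hypothetical). Note that str.replace substitutes every occurrence of the
# substring anywhere in the file name, including the extension.
if __name__ == '__main__':
    batch_rename('/tmp/pictures', 'IMG_', 'photo_')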
def search(self, in_file, seed_orthologs_file, hits_file):
    if not MMSEQS2:
        raise EmapperException("%s command not found in path" % MMSEQS2)

    self.in_file = in_file
    cmds = None

    # 1) Either resume from previous hits or run MMseqs2 to generate them
    if self.resume:
        if not pisfile(hits_file):
            raise EmapperException(
                f"Couldn't find hits file {hits_file} to resume.")
    else:
        # Temporary MMseqs2 databases, named with random UUIDs to avoid clashes
        querydb = pjoin(self.temp_dir, uuid.uuid4().hex)
        resultdb = pjoin(self.temp_dir, uuid.uuid4().hex)
        bestresultdb = pjoin(self.temp_dir, uuid.uuid4().hex)

        alignmentsdb, cmds = self.run_mmseqs(in_file, querydb, self.targetdb,
                                             resultdb, bestresultdb)
        shutil.copyfile(f'{alignmentsdb}.m8', hits_file)

    # 2) Parse search hits into seed orthologs
    if self.itype in (ITYPE_CDS, ITYPE_PROTS):
        hits_generator = self._parse_mmseqs(hits_file)
    else:  # ITYPE_GENOME or ITYPE_META: parse gene predictions, keeping coordinates
        hits_generator = self._parse_genepred(hits_file)

    # 3) Output seeds. For genome/metagenome input, seed coordinates are made
    # relative to the predicted ORF rather than the contig before being
    # written to the .seed_orthologs file.
    change_seeds_coords = self.itype not in (ITYPE_CDS, ITYPE_PROTS)

    hits_generator = output_seeds(cmds, hits_generator, seed_orthologs_file,
                                  self.no_file_comments, False,
                                  change_seeds_coords)

    return hits_generator
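# For reference, the '{alignmentsdb}.m8' file copied to hits_file above is
# BLAST-like tabular output. Conventionally an m8 row carries these
# tab-separated columns, though the exact set depends on how run_mmseqs
# configures the conversion, so treat this as a sketch:
#
#   query  target  pident  alnlen  mismatch  gapopen  qstart  qend  tstart  tend  evalue  bitscore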
def search(self, in_file, seed_orthologs_file, hits_file):
    if not DIAMOND:
        raise EmapperException("%s command not found in path" % DIAMOND)

    self.in_file = in_file
    cmds = None

    # 1) Either resume from previous hits or run DIAMOND to generate them
    if self.resume:
        if not pisfile(hits_file):
            raise EmapperException(
                f"Couldn't find hits file {hits_file} to resume.")
    else:
        cmds = self.run_diamond(in_file, hits_file)

    # 2) Parse search hits into seed orthologs
    if self.itype in (ITYPE_CDS, ITYPE_PROTS):
        hits_generator = self._parse_diamond(hits_file)
    else:  # ITYPE_GENOME or ITYPE_META: parse gene predictions, keeping coordinates
        hits_generator = self._parse_genepred(hits_file)

    # 3) Output seeds. For genome/metagenome input, seed coordinates are made
    # relative to the predicted ORF rather than the contig before being
    # written to the .seed_orthologs file.
    change_seeds_coords = self.itype not in (ITYPE_CDS, ITYPE_PROTS)

    hits_generator = output_seeds(cmds, hits_generator, seed_orthologs_file,
                                  self.no_file_comments, self.outfmt_short,
                                  change_seeds_coords)

    return hits_generator
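# Hedged usage sketch: the MMseqs2 and DIAMOND searchers above expose the same
# search() interface. The class name and constructor below are illustrative,
# not the real ones from the surrounding module:
#
#   searcher = DiamondSearcher(...)  # hypothetical setup
#   hits = searcher.search('queries.faa',
#                          'out.emapper.seed_orthologs',
#                          'out.emapper.hits')
#   for hit in hits:  # search() returns a generator of seed-ortholog records
#       ...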
def refine_matches(self, dbname, in_file, refine_file, hits_file):
    refine_header = ['query_name', 'best_hit_eggNOG_ortholog',
                     'best_hit_evalue', 'best_hit_score']

    print(colorify("Hit refinement starts now", 'green'))
    start_time = time.time()

    # Cache previous results if resuming is enabled
    last_resumed_query = None
    if self.resume:
        if pisfile(hits_file):
            if pisfile(refine_file):
                # Re-yield hits already present in the refine file and remember
                # the last query seen, so processing can pick up from there
                for hit in parse_seeds(refine_file):
                    yield hit
                    last_resumed_query = hit[0]
        else:
            raise EmapperException(
                f"Couldn't find hits file {hits_file} to resume.")
        OUT = open(refine_file, 'a')
    else:
        OUT = open(refine_file, 'w')

    if not self.no_file_comments:
        print(self.get_call_info(), file=OUT)
    if not self.resume:
        print('# ' + '\t'.join(refine_header), file=OUT)

    qn = -1  # in case there are no hits in the loop below
    sequences = {name: seq for name, seq in
                 iter_fasta_seqs(in_file, translate=self.translate,
                                 trans_table=self.trans_table)}
    self.queries = set(sequences.keys())

    for qn, r in enumerate(self.process_nog_hits_file(
            dbname, hits_file, sequences, last_resumed_query,
            cpu=self.cpu, excluded_taxa=self.excluded_taxa)):

        # Progress report every 25 queries
        if qn and (qn % 25 == 0):
            total_time = time.time() - start_time
            print(str(qn + 1) + " " + str(total_time) +
                  " %0.2f q/s (refinement)" % (float(qn + 1) / total_time),
                  file=sys.stderr)
            sys.stderr.flush()

        query_name = r[0]
        best_hit_name = r[1]
        if best_hit_name == '-' or best_hit_name == 'ERROR':
            continue

        best_hit_evalue = float(r[2])
        best_hit_score = float(r[3])
        print('\t'.join(map(str, (query_name, best_hit_name,
                                  best_hit_evalue, best_hit_score))),
              file=OUT)

        yield [query_name, best_hit_name, best_hit_evalue, best_hit_score]

    elapsed_time = time.time() - start_time
    if not self.no_file_comments:
        print('## %d queries scanned' % (qn + 1), file=OUT)
        print('## Total time (seconds): ' + str(elapsed_time), file=OUT)
        print('## Rate: ' + "%0.2f q/s" % (float(qn + 1) / elapsed_time),
              file=OUT)
    OUT.close()

    print(colorify(" Processed queries:%s total_time:%s rate:%s" %
                   (qn + 1, elapsed_time,
                    "%0.2f q/s" % (float(qn + 1) / elapsed_time)), 'lblue'))
    return
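# Hedged usage sketch: refine_matches is a generator, so refine_file is only
# written as the caller iterates over it. Each yielded item is a
# [query_name, best_hit_name, best_hit_evalue, best_hit_score] list
# (the receiver object and arguments below are illustrative):
#
#   for query_name, best_hit, evalue, score in searcher.refine_matches(
#           dbname, in_file, refine_file, hits_file):
#       ...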
def dump_hmm_matches(self, in_file, hits_file, dbpath, port, servers,
                     idmap_file, silent=False):
    hits_header = ("query_name", "hit", "evalue", "sum_score", "query_length",
                   "hmmfrom", "hmmto", "seqfrom", "seqto", "query_coverage")

    CLANS_FILE = get_pfam_clans_file()

    # Cache previous results if resuming is enabled. The decision is taken
    # before opening the file, since open(..., 'w') would create it.
    VISITED = set()
    resuming = self.resume and pisfile(hits_file)
    if resuming:
        print(colorify("Resuming previous run. Reading computed output from %s"
                       % hits_file, 'yellow'))
        VISITED = set(line.split('\t')[0].strip()
                      for line in open(hits_file)
                      if not line.startswith('#'))
        print(str(len(VISITED)) + ' queries skipped')
        OUT = open(hits_file, 'a')
    else:
        OUT = open(hits_file, 'w')

    if not self.no_file_comments:
        print(self.get_call_info(), file=OUT)
    if not resuming:
        print('# ' + '\t'.join(hits_header), file=OUT)

    total_time = 0
    last_time = time.time()
    start_time = time.time()

    # Loading the DB identifiers is also counted towards total_time
    idmap_idx = None
    if idmap_file:
        idmap_idx = load_idmap_idx(idmap_file)

    if not silent:
        print(colorify("Sequence mapping starts now!", 'green'))

    # hmmsearch-style overlap cleaning needs all hits collected before
    # postprocessing, so those hits are buffered in namedhits
    hmmsearch_cleaning = self.clean_overlaps is not None and self.clean_overlaps in [
        CLEAN_OVERLAPS_HMMSEARCH_ALL, CLEAN_OVERLAPS_HMMSEARCH_CLANS]
    if hmmsearch_cleaning:
        namedhits = []

    qn = -1  # in case there is nothing to loop over below
    for name, elapsed, hits, querylen, seq in iter_hits(
            in_file, self.translate, self.qtype, self.dbtype, self.scantype,
            dbpath, port, servers,
            evalue_thr=self.evalue, score_thr=self.score, qcov_thr=self.qcov,
            fixed_Z=self.Z, max_hits=self.maxhits, skip=VISITED,
            maxseqlen=self.maxseqlen, cut_ga=self.cut_ga, cpus=self.cpu,
            base_tempdir=self.hmmcmd_temp_dir, silent=silent,
            trans_table=self.trans_table):

        if elapsed == -1:
            # An error occurred: hits contains a single element with the
            # error message, e.g. hits = ["ERROR_MSG"]
            print('\t'.join([name] + hits * (len(hits_header) - 1)),
                  file=sys.stderr)
            print('\t'.join([name] + ['-'] * (len(hits_header) - 1)), file=OUT)
        elif not hits and self.report_no_hits:
            print('\t'.join([name] + ['-'] * (len(hits_header) - 1)), file=OUT)
        else:
            if self.clean_overlaps is not None and self.clean_overlaps in [
                    CLEAN_OVERLAPS_ALL, CLEAN_OVERLAPS_CLANS]:
                hits = process_overlaps(hits, self.clean_overlaps,
                                        CLANS_FILE, idmap_idx)
            elif hmmsearch_cleaning:
                namedhits.append((name, querylen, hits))

            # output (hmmsearch-cleaned hits are written after the loop instead)
            if not hmmsearch_cleaning:
                self.output_hits(name, querylen, hits, OUT, idmap_idx)

        OUT.flush()
        qn += 1

        # monitoring
        total_time += time.time() - last_time
        last_time = time.time()
        if qn and (qn % 25 == 0):
            if not silent:
                print(qn + 1, total_time,
                      "%0.2f q/s" % (float(qn + 1) / total_time),
                      file=sys.stderr)
                sys.stderr.flush()

    if hmmsearch_cleaning:
        if not silent:
            sys.stderr.write("Postprocessing overlapping hits...\n")
        namedhits = process_overlaps(namedhits, self.clean_overlaps,
                                     CLANS_FILE, idmap_idx)
        for (name, querylen, hits) in namedhits:
            self.output_hits(name, querylen, hits, OUT, idmap_idx)

    # Write final stats
    elapsed_time = time.time() - start_time
    if not self.no_file_comments:
        print('## %d queries scanned' % (qn + 1), file=OUT)
        print('## Total time (seconds): ' + str(elapsed_time), file=OUT)
        print('## Rate:', "%0.2f q/s" % (float(qn + 1) / elapsed_time), file=OUT)
    OUT.close()

    if not silent:
        print(colorify(" Processed queries:%s total_time:%s rate:%s" %
                       (qn + 1, elapsed_time,
                        "%0.2f q/s" % (float(qn + 1) / elapsed_time)),
                       'lblue'))
    return
# Open the train, dev and test JSON files
with open(pjoin(args.dataset_folder, 'train.json'), 'r') as f:
    encoder_json = jload(f)  # the encoder is always built from train.json
with open(pjoin(args.dataset_folder, args.train_file), 'r') as f:
    train_json = jload(f)
with open(pjoin(args.dataset_folder, args.dev_file), 'r') as f:
    dev_json = jload(f)
with open(pjoin(args.dataset_folder, args.test_file), 'r') as f:
    test_json = jload(f)

# If a saved encoder already exists, load it rather than rebuilding it (time saver)
if not pisfile(pjoin(args.saving_folder, 'encoder')):
    encoder = imsitu_encoder.imsitu_encoder(encoder_json)
    torch.save(encoder, pjoin(args.saving_folder, 'encoder'))
else:
    print("Loading encoder file")
    encoder = torch.load(pjoin(args.saving_folder, 'encoder'))

# Create the training dataloader
train_set = imsitu_loader.imsitu_loader(args.imgset_dir, train_json, encoder,
                                        encoder.train_transform)
train_loader = torch.utils.data.DataLoader(train_set,
                                           pin_memory=True,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           num_workers=args.num_workers)
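# Hedged sketch of how train_loader is typically consumed. The exact batch
# structure depends on what imsitu_loader.__getitem__ returns, so the loop
# body and args.epochs are illustrative assumptions:
#
#   for epoch in range(args.epochs):
#       for batch in train_loader:
#           ...  # forward pass, loss, backward, optimizer step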