Example #1
import os
from os import listdir
from os.path import join as pjoin, isfile as pisfile


def batch_rename(folderpath, old_character, new_character):
    '''Batch-replace a character (or substring) in every file name in a folder.'''
    for f in listdir(folderpath):
        fp = pjoin(folderpath, f)
        if pisfile(fp):
            nf = f.replace(old_character, new_character)
            os.rename(fp, pjoin(folderpath, nf))
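A minimal usage sketch (the folder path and substrings below are placeholders, not from the original):

# Renames e.g. 'IMG_old_001.jpg' -> 'IMG_new_001.jpg' for every file in the folder.
batch_rename('/path/to/folder', '_old_', '_new_')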
Example #2
    def search(self, in_file, seed_orthologs_file, hits_file):
        if not MMSEQS2:
            raise EmapperException("mmseqs2 command not found in path")

        self.in_file = in_file

        cmds = None

        # 1) either resume from previous hits or run mmseqs to generate the hits
        if self.resume:
            if not pisfile(hits_file):
                raise EmapperException(
                    f"Couldn't find hits file {hits_file} to resume.")
        else:
            # unique temporary DB paths for this run
            querydb = pjoin(self.temp_dir, uuid.uuid4().hex)
            resultdb = pjoin(self.temp_dir, uuid.uuid4().hex)
            bestresultdb = pjoin(self.temp_dir, uuid.uuid4().hex)

            alignmentsdb, cmds = self.run_mmseqs(in_file, querydb,
                                                 self.targetdb, resultdb,
                                                 bestresultdb)
            shutil.copyfile(f'{alignmentsdb}.m8', hits_file)

        # 2) parse search hits into seed orthologs
        if self.itype in (ITYPE_CDS, ITYPE_PROTS):
            hits_generator = self._parse_mmseqs(hits_file)
        else:  # ITYPE_GENOME or ITYPE_META: parse_genepred (no coordinate change)
            hits_generator = self._parse_genepred(hits_file)

        # 3) output seeds; for genome/metagenome inputs, seed coordinates are
        # rewritten relative to the ORF rather than the contig, so they can be
        # used in the .seed_orthologs file
        change_seeds_coords = self.itype not in (ITYPE_CDS, ITYPE_PROTS)

        hits_generator = output_seeds(cmds, hits_generator,
                                      seed_orthologs_file,
                                      self.no_file_comments, False,
                                      change_seeds_coords)

        return hits_generator
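The resume-or-recompute guard above recurs in the following examples. A standalone sketch of the same pattern (ensure_hits and compute are hypothetical names, not from the original code):

from os.path import isfile as pisfile

def ensure_hits(hits_file, resume, compute):
    '''Reuse an existing hits file when resuming; otherwise (re)compute it.'''
    if resume:
        if not pisfile(hits_file):
            raise FileNotFoundError(
                f"Couldn't find hits file {hits_file} to resume.")
        return None  # nothing was executed
    return compute(hits_file)  # returns the commands that produced the file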
Example #3
    def search(self, in_file, seed_orthologs_file, hits_file):
        if not DIAMOND:
            raise EmapperException("diamond command not found in path")

        self.in_file = in_file

        cmds = None

        # 1) either resume from previous hits or run diamond to generate the hits
        if self.resume:
            if not pisfile(hits_file):
                raise EmapperException(
                    f"Couldn't find hits file {hits_file} to resume.")
        else:
            cmds = self.run_diamond(in_file, hits_file)

        # 2) parse search hits into seed orthologs
        if self.itype in (ITYPE_CDS, ITYPE_PROTS):
            hits_generator = self._parse_diamond(hits_file)
        else:  # ITYPE_GENOME or ITYPE_META: parse_genepred (no coordinate change)
            hits_generator = self._parse_genepred(hits_file)

        # 3) output seeds; for genome/metagenome inputs, seed coordinates are
        # rewritten relative to the ORF rather than the contig, so they can be
        # used in the .seed_orthologs file
        change_seeds_coords = self.itype not in (ITYPE_CDS, ITYPE_PROTS)

        hits_generator = output_seeds(cmds, hits_generator,
                                      seed_orthologs_file,
                                      self.no_file_comments,
                                      self.outfmt_short,
                                      change_seeds_coords)

        return hits_generator
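Both search() variants return a generator, so parsing and seed output only happen while it is iterated. A hypothetical driver, assuming the searcher object is configured elsewhere:

def run_search(searcher, fasta, seeds_out, hits_out):
    '''Drain the generator so that seed orthologs are actually written.'''
    for hit in searcher.search(fasta, seeds_out, hits_out):
        pass  # hits are also streamed to seeds_out as a side effect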
Example #4
    def refine_matches(self, dbname, in_file, refine_file, hits_file):
        refine_header = ('query_name', 'best_hit_eggNOG_ortholog',
                         'best_hit_evalue', 'best_hit_score')

        print(colorify("Hit refinement starts now", 'green'))
        start_time = time.time()

        # Cache previous results if resuming is enabled
        last_resumed_query = None
        if self.resume:
            if not pisfile(hits_file):
                raise EmapperException(
                    f"Couldn't find hits file {hits_file} to resume.")
            if pisfile(refine_file):
                # replay previously refined hits before appending new ones
                for hit in parse_seeds(refine_file):
                    yield hit
                    last_resumed_query = hit[0]
            OUT = open(refine_file, 'a')
        else:
            OUT = open(refine_file, 'w')

        if not self.no_file_comments:
            print(self.get_call_info(), file=OUT)
            if not self.resume:
                print('# ' + '\t'.join(refine_header), file=OUT)

        qn = -1  # in case there are no hits in the loop below
        sequences = {
            name: seq
            for name, seq in iter_fasta_seqs(in_file,
                                             translate=self.translate,
                                             trans_table=self.trans_table)
        }
        self.queries = set(sequences.keys())
        for qn, r in enumerate(
                self.process_nog_hits_file(dbname,
                                           hits_file,
                                           sequences,
                                           last_resumed_query,
                                           cpu=self.cpu,
                                           excluded_taxa=self.excluded_taxa)):
            if qn and (qn % 25 == 0):
                total_time = time.time() - start_time
                print(str(qn + 1) + " " + str(total_time) +
                      " %0.2f q/s (refinement)" %
                      ((float(qn + 1) / total_time)),
                      file=sys.stderr)
                sys.stderr.flush()
            query_name = r[0]
            best_hit_name = r[1]
            if best_hit_name == '-' or best_hit_name == 'ERROR':
                continue
            best_hit_evalue = float(r[2])
            best_hit_score = float(r[3])
            print('\t'.join(
                map(str, (query_name, best_hit_name, best_hit_evalue,
                          best_hit_score))),
                  file=OUT)

            yield [query_name, best_hit_name, best_hit_evalue, best_hit_score]

        elapsed_time = time.time() - start_time
        if not self.no_file_comments:
            print('## %d queries scanned' % (qn + 1), file=OUT)
            print('## Total time (seconds): ' + str(elapsed_time), file=OUT)
            print('## Rate: ' + "%0.2f q/s" % ((float(qn + 1) / elapsed_time)),
                  file=OUT)
        OUT.close()
        print(colorify(" Processed queries:%s total_time:%s rate:%s" %\
                       (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue'))
        return
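refine_matches() writes refine_file while yielding each refined row, so consuming the generator is what drives the I/O. A self-contained sketch of that write-while-yield pattern (names and the sample row are illustrative, not from the original):

def write_while_yield(rows, out_path):
    '''Stream rows to out_path and to the caller at the same time.'''
    qn = -1  # in case rows is empty
    with open(out_path, 'w') as out:
        for qn, row in enumerate(rows):
            print('\t'.join(map(str, row)), file=out)
            yield row
        print('## %d queries scanned' % (qn + 1), file=out)

# Iterating is what triggers the writes:
for row in write_while_yield([('q1', 'hitA', 1e-5, 80.0)], 'refined.tsv'):
    pass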
Example #5
    def dump_hmm_matches(self,
                         in_file,
                         hits_file,
                         dbpath,
                         port,
                         servers,
                         idmap_file,
                         silent=False):
        hits_header = ("query_name", "hit", "evalue", "sum_score",
                       "query_length", "hmmfrom", "hmmto", "seqfrom", "seqto",
                       "query_coverage")

        CLANS_FILE = get_pfam_clans_file()

        # Cache previous results if resuming is enabled
        VISITED = set()
        resuming = self.resume and pisfile(hits_file)
        if resuming:
            print(
                colorify(
                    "Resuming previous run. Reading computed output from %s" %
                    hits_file, 'yellow'))
            VISITED = {
                line.split('\t')[0].strip() for line in open(hits_file)
                if not line.startswith('#')
            }
            print(str(len(VISITED)) + ' queries skipped')
            OUT = open(hits_file, 'a')
        else:
            OUT = open(hits_file, 'w')

        if not self.no_file_comments:
            print(self.get_call_info(), file=OUT)
            if not resuming:
                # write the header only when not appending to previous output
                print('# ' + '\t'.join(hits_header), file=OUT)

        total_time = 0
        last_time = time.time()
        start_time = time.time()

        # Loading the DB identifiers is also counted towards total_time
        idmap_idx = None
        if idmap_file:
            idmap_idx = load_idmap_idx(idmap_file)

        if not silent:
            print(colorify("Sequence mapping starts now!", 'green'))

        # hits are buffered here when overlap cleaning runs after the search
        namedhits = []

        qn = -1  # in case there is nothing to loop over below
        for name, elapsed, hits, querylen, seq in iter_hits(
                in_file,
                self.translate,
                self.qtype,
                self.dbtype,
                self.scantype,
                dbpath,
                port,
                servers,
                evalue_thr=self.evalue,
                score_thr=self.score,
                qcov_thr=self.qcov,
                fixed_Z=self.Z,
                max_hits=self.maxhits,
                skip=VISITED,
                maxseqlen=self.maxseqlen,
                cut_ga=self.cut_ga,
                cpus=self.cpu,
                base_tempdir=self.hmmcmd_temp_dir,
                silent=silent,
                trans_table=self.trans_table):

            if elapsed == -1:
                # error occurred. hits should contain a single element with the error msg. e.g. hits = ["ERROR_MSG"]
                print('\t'.join([name] + hits * (len(hits_header) - 1)),
                      file=sys.stderr)
                print('\t'.join([name] + ['-'] * (len(hits_header) - 1)),
                      file=OUT)
            elif not hits and self.report_no_hits:
                print('\t'.join([name] + ['-'] * (len(hits_header) - 1)),
                      file=OUT)
            else:

                if self.clean_overlaps in (CLEAN_OVERLAPS_ALL,
                                           CLEAN_OVERLAPS_CLANS):
                    hits = process_overlaps(hits, self.clean_overlaps,
                                            CLANS_FILE, idmap_idx)

                elif self.clean_overlaps in (CLEAN_OVERLAPS_HMMSEARCH_ALL,
                                             CLEAN_OVERLAPS_HMMSEARCH_CLANS):
                    # defer output until all hits have been collected
                    namedhits.append((name, querylen, hits))

                # output (unless deferred for post-search overlap cleaning)
                if self.clean_overlaps not in (CLEAN_OVERLAPS_HMMSEARCH_ALL,
                                               CLEAN_OVERLAPS_HMMSEARCH_CLANS):
                    self.output_hits(name, querylen, hits, OUT, idmap_idx)
                    self.output_hits(name, querylen, hits, OUT, idmap_idx)

            OUT.flush()

            qn += 1

            # monitoring
            total_time += time.time() - last_time
            last_time = time.time()
            if qn and qn % 25 == 0 and not silent:
                print(qn + 1,
                      total_time,
                      "%0.2f q/s" % (float(qn + 1) / total_time),
                      file=sys.stderr)
                sys.stderr.flush()

        if self.clean_overlaps in (CLEAN_OVERLAPS_HMMSEARCH_ALL,
                                   CLEAN_OVERLAPS_HMMSEARCH_CLANS):
            if not silent:
                sys.stderr.write("Postprocessing overlapping hits...\n")
            namedhits = process_overlaps(namedhits, self.clean_overlaps,
                                         CLANS_FILE, idmap_idx)
            for (name, querylen, hits) in namedhits:
                self.output_hits(name, querylen, hits, OUT, idmap_idx)

        # Writes final stats
        elapsed_time = time.time() - start_time
        if not self.no_file_comments:
            print('## %d queries scanned' % (qn + 1), file=OUT)
            print('## Total time (seconds): ' + str(elapsed_time), file=OUT)
            print('## Rate:',
                  "%0.2f q/s" % ((float(qn + 1) / elapsed_time)),
                  file=OUT)
        OUT.close()
        if not silent:
            print(colorify(" Processed queries:%s total_time:%s rate:%s" %
                           (qn + 1, elapsed_time,
                            "%0.2f q/s" % (float(qn + 1) / elapsed_time)), 'lblue'))

        return
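The VISITED set is what makes resuming cheap here: previously answered queries are read back from the existing output and skipped on rerun. A minimal sketch of that part (load_visited is a hypothetical helper name):

from os.path import isfile as pisfile

def load_visited(hits_file):
    '''Collect query names already present in a previous hits file.'''
    if not pisfile(hits_file):
        return set()
    with open(hits_file) as fh:
        return {line.split('\t')[0].strip()
                for line in fh
                if not line.startswith('#')}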
Example #6
    # Load the train, dev and test JSON files
    with open(pjoin(args.dataset_folder, 'train.json'), 'r') as f:
        encoder_json = jload(f)  # the encoder is always built from train.json

    with open(pjoin(args.dataset_folder, args.train_file), 'r') as f:
        train_json = jload(f)

    with open(pjoin(args.dataset_folder, args.dev_file), 'r') as f:
        dev_json = jload(f)

    with open(pjoin(args.dataset_folder, args.test_file), 'r') as f:
        test_json = jload(f)

    # If the encoder file already exists, load it instead of rebuilding it (saves time)
    if not pisfile(pjoin(args.saving_folder, 'encoder')):
        encoder = imsitu_encoder.imsitu_encoder(encoder_json)
        torch.save(encoder, pjoin(args.saving_folder, 'encoder'))
    else:
        print("Loading encoder file")
        encoder = torch.load(pjoin(args.saving_folder, 'encoder'))

    # Create the training dataset and dataloader
    train_set = imsitu_loader.imsitu_loader(args.imgset_dir, train_json,
                                            encoder, encoder.train_transform)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               pin_memory=True,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers)
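The encoder caching above generalizes to a small load-or-build helper. A sketch assuming torch is available (load_or_build and the builder callable are hypothetical, not from the original script):

import torch
from os.path import isfile as pisfile

def load_or_build(path, builder):
    '''Return a cached object from path, building and saving it on a miss.'''
    if pisfile(path):
        print('Loading cached object from', path)
        return torch.load(path)
    obj = builder()
    torch.save(obj, path)
    return obj

# e.g.: encoder = load_or_build(pjoin(args.saving_folder, 'encoder'),
#                               lambda: imsitu_encoder.imsitu_encoder(encoder_json))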