Example #1
def get_genome_in_bit(chr_fa_folder):
    ''' Encode each chromosome FASTA sequence into a bitarray,
        and store them in a dictionary keyed by chromosome number.
        chr_fa_folder is the folder containing all gzipped FASTA files:

        fasta files can be downloaded from NCBI FTP site:

        ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p13/Primary_Assembly/assembled_chromosomes/FASTA/
        chr<i>.fa.gz  (e.g. chr1.fa.gz)

    '''
    chr_bit_d = {}
    chr_range = [str(i) for i in range(1, 23)] + ['X', 'Y', 'MT']
    t0 = time.time()
    for i in chr_range:
        t1 = time.time()
        #file_name = 'hs_ref_GRCh37.p5_chr{}.fa.gz'.format(i)
        file_name = 'chr{}.fa.gz'.format(i)
        print("Loading {}...".format(file_name), end='')
        file_name = os.path.join(chr_fa_folder, file_name)
        with open_anyfile(file_name) as seq_f:
            seq_f.readline()  # skip header
            seq_bit = bitarray()
            for line in seq_f:
                line = line.rstrip('\n')
                line_bit = nuc_to_bit(line)
                seq_bit += line_bit
            chr_bit_d.update({i: seq_bit})
        print("done.[{}]".format(timesofar(t1)))
    print('=' * 20)
    print("Finished. [{}]".format(timesofar(t0)))

    return chr_bit_d
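
All the examples on this page use the timesofar helper (imported from biothings.utils.common) to report elapsed time. A minimal re-implementation for illustration, assuming it only needs to format the seconds elapsed since t0 as a short human-readable string (the real helper accepts extra parameters and may format its output differently):

import time

def timesofar(t0, t1=None):
    # illustrative sketch only, not the actual biothings implementation
    elapsed = (t1 if t1 is not None else time.time()) - t0
    h, rem = divmod(elapsed, 3600)
    m, s = divmod(rem, 60)
    parts = []
    if h >= 1:
        parts.append("%dh" % h)
    if m >= 1:
        parts.append("%dm" % m)
    parts.append("%.1fs" % s)
    return "".join(parts)

With this sketch, timesofar(t0) for a run started about 133 seconds earlier would return something like "2m13.4s".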
Example #2
    def process(self, iterable, batch_size):
        self.logger.info("Uploading to the DB...")
        t0 = time.time()
        tinner = time.time()
        total = 0
        for doc_li in self.doc_iterator(iterable,
                                        batch=True,
                                        batch_size=batch_size):
            try:
                bob = self.temp_collection.initialize_unordered_bulk_op()
                for d in doc_li:
                    bob.insert(d)
                res = bob.execute()
                total += res['nInserted']
                self.logger.info("Inserted %s records [%s]" %
                                 (res['nInserted'], timesofar(tinner)))
            except BulkWriteError as e:
                self.logger.info(
                    "Inserted %s records, ignoring %d [%s]" %
                    (e.details['nInserted'], len(
                        e.details["writeErrors"]), timesofar(tinner)))
            except Exception as e:
                raise
            tinner = time.time()
        self.logger.info('Done[%s]' % timesofar(t0))

        return total
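
Note that initialize_unordered_bulk_op() has been removed in newer PyMongo releases. A roughly equivalent sketch of the try block above using the current bulk_write() API (self, doc_li and total are the same names as in the example; this is an illustration, not the project's actual code):

from pymongo import InsertOne
from pymongo.errors import BulkWriteError

try:
    res = self.temp_collection.bulk_write(
        [InsertOne(d) for d in doc_li], ordered=False)
    total += res.inserted_count
except BulkWriteError as e:
    total += e.details['nInserted']   # count what still made it in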
Example #3
    def process(self, doc_d, batch_size):
        self.logger.info("Uploading to the DB...")
        t0 = time.time()
        tinner = time.time()
        # force step = 1
        cnt = 0
        total = 0
        dups = 0
        for doc_li in self.doc_iterator(doc_d, batch=True, batch_size=1):
            try:
                res = self.temp_collection.insert(doc_li,
                                                  manipulate=False,
                                                  check_keys=False)
                cnt += 1
                total += 1
                if (cnt + dups) % batch_size == 0:
                    # we insert one by one but display progress on a "batch_size" basis
                    self.logger.info("Inserted %s records, ignoring %s [%s]" %
                                     (cnt, dups, timesofar(tinner)))
                    cnt = 0
                    dups = 0
                    tinner = time.time()
            except DuplicateKeyError:
                dups += 1
                pass
        self.logger.info('Done[%s]' % timesofar(t0))

        return total
Example #4
def redo_parse_gbff(path):
    '''Call this function manually to re-start the parsing step and set src_dump.
       This is used when main() breaks at the parsing step and parsing needs to be
       re-started after the fix.
    '''
    #mark the download starts
    src_dump = get_src_dump()

    t0 = time.time()
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(path)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True  # a flag to trigger data uploading
    }

    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
Example #5
 def refresh_commands(cls):
     for num, info in sorted(cls.launched_commands.items()):
         # already processed, this command is now history
         # Note: with millions of commands here this could take quite a while,
         # but in practice we only have a few
         if info.get("is_done") == True:
             continue
         # is_done = set([j.done() for j in info["jobs"]]) == set([True])   # TODO: remove this line
         is_done = {j.done() for j in info["jobs"]} == {True}
         has_err = is_done and [
             True for j in info["jobs"] if j.exception()
         ] or None
         localoutputs = is_done and (
             [str(j.exception()) for j in info["jobs"] if j.exception()]
             or [j.result() for j in info["jobs"]]) or None
         if is_done:
             cls.launched_commands[num]["is_done"] = True
             cls.launched_commands[num][
                 "failed"] = has_err and has_err[0] or False
             cls.launched_commands[num]["results"] = localoutputs
             cls.launched_commands[num]["finished_at"] = time.time()
             cls.launched_commands[num]["duration"] = timesofar(
                 t0=cls.launched_commands[num]["started_at"],
                 t1=cls.launched_commands[num]["finished_at"])
             cls.save_cmd(num, cls.launched_commands[num])
             if not has_err and localoutputs and set(map(
                     type, localoutputs)) == {str}:
                 localoutputs = "\n" + "".join(localoutputs)
             cls.pending_outputs[num] = "[%s] %s {%s} %s: finished %s " % \
                 (num, has_err and "ERR" or "OK", timesofar(info["started_at"]), info["cmd"], localoutputs)
         else:
             cls.pending_outputs[num] = "[%s] RUN {%s} %s" % (
                 num, timesofar(info["started_at"]), info["cmd"])
Example #6
def redo_parse_gbff(path):
    '''Call this function manually to re-start the parsing step and set src_dump.
       This is used when main() breaks at the parsing step and parsing needs to be
       re-started after the fix.
    '''
    #mark the download starts
    src_dump = get_src_dump()

    t0 = time.time()
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(path)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }

    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
Example #7
def get_genome_in_bit(chr_fa_folder):
    ''' Encode each chromosome FASTA sequence into a bitarray,
        and store them in a dictionary keyed by chromosome number.
        chr_fa_folder is the folder containing all gzipped FASTA files:

        fasta files can be downloaded from NCBI FTP site:

        ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p13/Primary_Assembly/assembled_chromosomes/FASTA/
        chr<i>.fa.gz  (e.g. chr1.fa.gz)

    '''
    chr_bit_d = {}
    chr_range = [str(i) for i in range(1, 23)] + ['X', 'Y', 'MT']
    t0 = time.time()
    for i in chr_range:
        t1 = time.time()
        #file_name = 'hs_ref_GRCh37.p5_chr{}.fa.gz'.format(i)
        file_name = 'chr{}.fa.gz'.format(i)
        print("Loading {}...".format(file_name), end='')
        file_name = os.path.join(chr_fa_folder, file_name)
        with open_anyfile(file_name) as seq_f:
            seq_f.readline()   # skip header
            seq_bit = bitarray()
            for line in seq_f:
                line = line.rstrip('\n')
                line_bit = nuc_to_bit(line)
                seq_bit += line_bit
            chr_bit_d.update({i: seq_bit})
        print("done.[{}]".format(timesofar(t1)))
    print('='*20)
    print("Finished. [{}]".format(timesofar(t0)))

    return chr_bit_d
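
This loader (same as Example #1) relies on a nuc_to_bit() helper that is not shown. A hypothetical sketch, assuming a fixed-width bit code per base; the real encoding used by the source project may differ:

from bitarray import bitarray

# hypothetical 3-bit code covering the common FASTA symbols;
# soft-masked lowercase bases are folded into uppercase
_NUC_CODE = {
    'A': bitarray('000'), 'C': bitarray('001'),
    'G': bitarray('010'), 'T': bitarray('011'),
    'N': bitarray('100'),
}

def nuc_to_bit(seq):
    bits = bitarray()
    bits.encode(_NUC_CODE, seq.upper())   # appends one code per base
    return bits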
Example #8
    def process(self, doc_d, batch_size):
        self.logger.info("Uploading to the DB...")
        t0 = time.time()
        tinner = time.time()
        aslistofdict = None
        total = 0
        for doc_li in self.doc_iterator(doc_d,
                                        batch=True,
                                        batch_size=batch_size):
            toinsert = len(doc_li)
            nbinsert = 0
            self.logger.info("Inserting %s records ... " % toinsert)
            try:
                bob = self.temp_collection.initialize_unordered_bulk_op()
                for d in doc_li:
                    aslistofdict = d.pop("__aslistofdict__", None)
                    bob.insert(d)
                res = bob.execute()
                nbinsert += res["nInserted"]
                self.logger.info("OK [%s]" % timesofar(tinner))
            except BulkWriteError as e:
                inserted = e.details["nInserted"]
                nbinsert += inserted
                self.logger.info("Fixing %d records " %
                                 len(e.details["writeErrors"]))
                ids = [d["op"]["_id"] for d in e.details["writeErrors"]]
                # build hash of existing docs
                docs = self.temp_collection.find({"_id": {"$in": ids}})
                hdocs = {}
                for doc in docs:
                    hdocs[doc["_id"]] = doc
                bob2 = self.temp_collection.initialize_unordered_bulk_op()
                for err in e.details["writeErrors"]:
                    errdoc = err["op"]
                    existing = hdocs[errdoc["_id"]]
                    assert "_id" in existing
                    _id = errdoc.pop("_id")
                    merged = merge_struct(errdoc,
                                          existing,
                                          aslistofdict=aslistofdict)
                    bob2.find({"_id": _id}).update_one({"$set": merged})
                    # update the previously fetched doc: if several errors concern the same doc id,
                    # we wouldn't merge things properly without an updated document
                    assert "_id" in merged
                    hdocs[_id] = merged
                    nbinsert += 1

                res = bob2.execute()
                self.logger.info("OK [%s]" % timesofar(tinner))
            assert nbinsert == toinsert, "nb %s to %s" % (nbinsert, toinsert)
            # end of loop so it counts the time spent in doc_iterator
            tinner = time.time()
            total += nbinsert

        self.logger.info('Done[%s]' % timesofar(t0))
        self.switch_collection()
        self.post_update_data()

        return total
Example #9
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']

    print("\t{}\trecords will be added.".format(len(changes['add'])))
    print("\t{}\trecords will be deleted.".format(len(changes['delete'])))
    print("\t{}\trecords will be updated.".format(len(changes['update'])))

    print()
    print('\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name,
                                         sync_src.name))
    print('\tsync_target\t{:<45}{}'.format(
        sync_target.target_esidxer.ES_INDEX_NAME, sync_target.name))

    if noconfirm or ask("Continue?") == 'Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer

        if len(changes['add']) > 0:
            print("Adding {} new records...".format(len(changes['add'])))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection,
                                   step=1000,
                                   inbatch=True,
                                   query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['delete']) > 0:
            print("Deleting {} old records...".format(len(changes['delete'])))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['update']) > 0:
            print("Updating {} existing records...".format(
                len(changes['update'])))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection,
                                   step=1000,
                                   inbatch=True,
                                   query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))
        print('=' * 20)
        print('Finished. [{}]'.format(timesofar(t00)))
Example #10
    def load(self, genedoc_d=None, update_data=True, update_master=True, test=False, step=10000):
        if not self.temp_collection:
            self.make_temp_collection()

        self.temp_collection.drop()       # drop all existing records just in case.

        if update_data:
            genedoc_d = genedoc_d or self.load_genedoc()
            print("genedoc_d mem: %s" % sys.getsizeof(genedoc_d))

            print("Uploading to the DB...", end='')
            t0 = time.time()
            # for doc in self.doc_iterator(genedoc_d, batch=False):
            #     if not test:
            #         doc.save()
            for doc_li in self.doc_iterator(genedoc_d, batch=True, step=step):
                if not test:
                    self.temp_collection.insert(doc_li, manipulate=False, check_keys=False)
            print('Done[%s]' % timesofar(t0))
            self.switch_collection()

            if getattr(self, 'ENTREZ_GENEDOC_ROOT', False):
                print('Uploading "geneid_d" to GridFS...', end='')
                t0 = time.time()
                geneid_d = self.get_geneid_d()
                dump2gridfs(geneid_d, self.__collection__ + '__geneid_d.pyobj', self.db)
                print('Done[%s]' % timesofar(t0))
            if getattr(self, 'ENSEMBL_GENEDOC_ROOT', False):
                print('Uploading "mapping2entrezgene" to GridFS...', end='')
                t0 = time.time()
                x2entrezgene_list = self.get_mapping_to_entrez()
                dump2gridfs(x2entrezgene_list, self.__collection__ + '__2entrezgene_list.pyobj', self.db)
                print('Done[%s]' % timesofar(t0))

        if update_master:
            # update src_master collection
            if not test:
                _doc = {"_id": str(self.__collection__),
                        "name": str(self.__collection__),
                        "timestamp": datetime.datetime.now()}
                for attr in ['ENTREZ_GENEDOC_ROOT', 'ENSEMBL_GENEDOC_ROOT', 'id_type']:
                    if hasattr(self, attr):
                        _doc[attr] = getattr(self, attr)
                if hasattr(self, 'get_mapping'):
                    _doc['mapping'] = getattr(self, 'get_mapping')()

                coll = conn[GeneDocSourceMaster.__database__][GeneDocSourceMaster.__collection__]
                dkey = {"_id": _doc["_id"]}
                prev = coll.find_one(dkey)
                if prev:
                    coll.replace_one(dkey, _doc)
                else:
                    coll.insert_one(_doc)
Example #11
def doc_feeder(collection, step=1000, s=None, e=None, inbatch=False, query=None, batch_callback=None, fields=None):
    '''An iterator for returning docs in a collection, with batch query.
       An additional filter query can be passed via "query", e.g.,
       doc_feeder(collection, query={'taxid': {'$in': [9606, 10090, 10116]}})
       batch_callback is a callback function fn(cnt, t), called after every batch.
       fields is an optional parameter passed to find() to restrict the fields returned.
    '''
    cur = collection.find(query, no_cursor_timeout=False, projection=fields)
    n = cur.count()
    s = s or 0
    e = e or n
    print('Retrieving %d documents from database "%s".' % (n, collection.name))
    t0 = time.time()
    if inbatch:
        doc_li = []
    cnt = 0
    t1 = time.time()
    try:
        if s:
            cur.skip(s)
            cnt = s
            print("Skipping %d documents." % s)
        if e:
            cur.limit(e - (s or 0))
        cur.batch_size(step)
        print("Processing %d-%d documents..." % (cnt + 1, min(cnt + step, e)), end='')
        for doc in cur:
            if inbatch:
                doc_li.append(doc)
            else:
                yield doc
            cnt += 1
            if cnt % step == 0:
                if inbatch:
                    yield doc_li
                    doc_li = []
                print('Done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
                if batch_callback:
                    batch_callback(cnt, time.time()-t1)
                if cnt < e:
                    t1 = time.time()
                    print("Processing %d-%d documents..." % (cnt + 1, min(cnt + step, e)), end='')
        if inbatch and doc_li:
            #Important: need to yield the last batch here
            yield doc_li

        #print 'Done.[%s]' % timesofar(t1)
        print('Done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
        print("=" * 20)
        print('Finished.[total time: %s]' % timesofar(t0))
    finally:
        cur.close()
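
A hypothetical call matching the docstring above, assuming collection is a PyMongo collection and progress should be reported after every 1000-document batch:

def on_batch(cnt, secs):
    # called after every batch with the running count and the batch duration
    print("processed %d docs so far (last batch took %.1fs)" % (cnt, secs))

for batch in doc_feeder(collection, step=1000, inbatch=True,
                        query={'taxid': {'$in': [9606, 10090, 10116]}},
                        batch_callback=on_batch):
    for doc in batch:
        pass   # handle each document here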
Example #12
    def apply_changes(self, changes, verify=True, noconfirm=False):
        if verify:
            self.pre_verify_changes(changes)

        if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
            print("Aborted.")
            return -1
        step = self.step
        _db = get_target_db()
        source_col = _db[changes['source']]
        src = GeneDocMongoDBBackend(source_col)
        target = GeneDocESBackend(self)
        _timestamp = changes['timestamp']

        def _add_docs(ids):
            i = 0
            for _ids in iter_n(ids, step):
                t1 = time.time()
                _doc_li = src.mget_from_ids(_ids)
                for _doc in _doc_li:
                    _doc['_timestamp'] = _timestamp
                    i += 1
                target.insert(_doc_li)
                print('\t{}\t{}'.format(i, timesofar(t1)))

        t0 = time.time()
        if changes['add']:
            print("Adding {} new docs...".format(len(changes['add'])))
            t00 = time.time()
            _add_docs(changes['add'])
            print("done. [{}]".format(timesofar(t00)))
        if changes['delete']:
            print("Deleting {} discontinued docs...".format(
                len(changes['delete'])),
                  end='')
            t00 = time.time()
            target.remove_from_ids(changes['delete'], step=step)
            print("done. [{}]".format(timesofar(t00)))
        if changes['update']:
            print("Updating {} existing docs...".format(len(
                changes['update'])))
            t00 = time.time()
            ids = [x['_id'] for x in changes['update']]
            _add_docs(ids)
            print("done. [{}]".format(timesofar(t00)))

        target.finalize()

        print("\n")
        print("Finished.", timesofar(t0))
Example #13
def two_docs_iterator(b1, b2, id_list, step=10000):
    t0 = time.time()
    n = len(id_list)
    for i in range(0, n, step):
        t1 = time.time()
        print("Processing %d-%d documents..." % (i + 1, min(i + step, n)), end='')
        _ids = id_list[i:i+step]
        iter1 = b1.mget_from_ids(_ids, asiter=True)
        iter2 = b2.mget_from_ids(_ids, asiter=True)
        for doc1, doc2 in zip(iter1, iter2):
            yield doc1, doc2
        print('Done.[%.1f%%,%s]' % (i*100./n, timesofar(t1)))
    print("="*20)
    print('Finished.[total time: %s]' % timesofar(t0))
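
A hypothetical use of the iterator above to flag documents that differ between two backends (b1 and b2 are assumed to expose mget_from_ids(), as in the other examples):

changed_ids = []
for doc_old, doc_new in two_docs_iterator(b1, b2, id_list, step=10000):
    if doc_old != doc_new:
        changed_ids.append(doc_old['_id'])
print("%d documents differ" % len(changed_ids))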
Example #14
    def check_src_upload(self):
        running_processes = self.running_processes_upload
        jobs_finished = []
        if running_processes:
            self.idle = True
            print('Dispatcher:  {} active job(s)'.format(
                len(running_processes)))
            print(get_process_info(running_processes))

        for src in running_processes:
            p = running_processes[src]
            returncode = p.poll()
            if returncode is None:
                p.log_f.flush()
            else:
                t1 = round(time.time() - p.t0, 0)
                d = {
                    'upload.returncode': returncode,
                    'upload.timestamp': datetime.now(),
                    'upload.time_in_s': t1,
                    'upload.time': timesofar(p.t0),
                    'upload.logfile': p.logfile,
                    'upload.status': "success" if returncode == 0 else "failed"
                }
                mark_upload_done(src, d)
                jobs_finished.append(src)
                p.log_f.close()

                if returncode == 0:
                    msg = 'Dispatcher:  "{}" uploader finished successfully with code {} (time: {})'.format(
                        src, returncode, timesofar(p.t0, t1=t1))
                    print(msg)
                    if hipchat_msg:
                        msg += '<a href="{}/log/dump/{}">dump log</a>'.format(
                            DATA_WWW_ROOT_URL, src)
                        msg += '<a href="{}/log/upload/{}">upload log</a>'.format(
                            DATA_WWW_ROOT_URL, src)
                        hipchat_msg(msg, message_format='html', color="green")
                    source_upload_success.send(self, src_name=src)
                else:
                    msg = 'Dispatcher:  "{}" uploader failed with code {} (time: {}s)'.format(
                        src, returncode, t1)
                    print(msg)
                    if hipchat_msg:
                        hipchat_msg(msg, color="red")
                    source_upload_failed.send(self, src_name=src)

        for src in jobs_finished:
            del running_processes[src]
Example #15
 def checkmem(self, pinfo=None):
     mem_req = pinfo and pinfo.get("__reqs__", {}).get("mem") or 0
     t0 = time.time()
     waited = False
     sleep_time = 5
     if mem_req:
         logger.info("Job {cat:%s,source:%s,step:%s} requires %s memory, checking if available" % \
                 (pinfo.get("category"), pinfo.get("source"), pinfo.get("step"), sizeof_fmt(mem_req)))
     if self.max_memory_usage:
         hub_mem = self.hub_memory
         while hub_mem >= self.max_memory_usage:
             logger.info("Hub is using too much memory to launch job {cat:%s,source:%s,step:%s} (%s used, more than max allowed %s), wait a little (job's already been postponed for %s)" % \
                     (pinfo.get("category"), pinfo.get("source"), pinfo.get("step"), sizeof_fmt(hub_mem),
                      sizeof_fmt(self.max_memory_usage),timesofar(t0)))
             yield from asyncio.sleep(sleep_time)
             waited = True
             hub_mem = self.hub_memory
     if mem_req:
         # max allowed mem is either the limit we set or the OS limit
         max_mem = self.max_memory_usage and self.max_memory_usage or self.avail_memory
         # TODO: check projected memory (jobs with mem requirements currently running
         # as those jobs may not have reached their max mem usage yet)
         hub_mem = self.hub_memory
         while mem_req >= (max_mem - hub_mem):
             logger.info("Job {cat:%s,source:%s,step:%s} needs %s to run, not enough to launch it (hub consumes %s while max allowed is %s), wait a little  (job's already been postponed for %s)" % \
                     (pinfo.get("category"), pinfo.get("source"), pinfo.get("step"), sizeof_fmt(mem_req), sizeof_fmt(hub_mem),
                      sizeof_fmt(max_mem), timesofar(t0)))
             yield from asyncio.sleep(sleep_time)
             waited = True
             # refresh limits and usage (the manager can be modified from the hub,
             # thus memory usage can change on-the-fly)
             hub_mem = self.hub_memory
             max_mem = self.max_memory_usage and self.max_memory_usage or self.avail_memory
     pendings = len(self.process_queue._pending_work_items.keys()
                    ) - config.HUB_MAX_WORKERS
     while pendings >= config.MAX_QUEUED_JOBS:
         if not waited:
             logger.info("Can't run job {cat:%s,source:%s,step:%s} right now, too much pending jobs in the queue (max: %s), will retry until possible" % \
                     (pinfo.get("category"), pinfo.get("source"), pinfo.get("step"), config.MAX_QUEUED_JOBS))
         yield from asyncio.sleep(sleep_time)
         pendings = len(self.process_queue._pending_work_items.keys()
                        ) - config.HUB_MAX_WORKERS
         waited = True
     if waited:
         logger.info(
             "Job {cat:%s,source:%s,step:%s} now can be launched (total waiting time: %s)"
             % (pinfo.get("category"), pinfo.get("source"),
                pinfo.get("step"), timesofar(t0)))
Example #16
 def register_status(self, status, transient=False, **extra):
     try:
         # if status is "failed", depending on where it failed,
         # we may not be able to get the new_data_folder (e.g. if the dumper didn't
         # reach the release information). Default to the current one if so.
         data_folder = self.new_data_folder
     except DumperException:
         data_folder = self.current_data_folder
     self.src_doc = {
         '_id': self.src_name,
         'data_folder': data_folder,
         'release': getattr(self, self.__class__.SUFFIX_ATTR),
         'download': {
             'logfile': self.logfile,
             'started_at': datetime.now(),
             'status': status
         }
     }
     # only register time when it's a final state
     if transient:
         self.src_doc["download"]["pid"] = os.getpid()
     else:
         self.src_doc["download"]["time"] = timesofar(self.t0)
     if "download" in extra:
         self.src_doc["download"].update(extra["download"])
     else:
         self.src_doc.update(extra)
     self.src_dump.save(self.src_doc)
Example #17
def load_broadinstitute_exac():
    print('DATA_FOLDER: ' + DATA_FOLDER)
    t0 = time.time()
    exacs = load_broadinstitute_exac_all()
    for k, v in load_broadinstitute_exac_nontcga().items():
        try:
            exacs[k]["exac"]["nontcga"] = v["exac"]["nontcga"]
        except KeyError:
            exacs[k] = v
    for k, v in load_broadinstitute_exac_nonpsych().items():
        try:
            exacs[k]["exac"]["nonpsych"] = v["exac"]["nonpsych"]
        except KeyError:
            exacs[k] = v

    logging.info("Convert transcript ID to EntrezID")
    import dataload.sources.ensembl.ensembl_base as ensembl_base
    ensembl_parser = ensembl_base.EnsemblParser()
    ensembl_parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li,
                               0,
                               alwayslist=True)
    ensembl_dir = get_data_folder("ensembl")
    for line in tabfile_feeder(
            os.path.join(ensembl_dir, "gene_ensembl__translation__main.txt")):
        _, ensid, transid, _ = line
        if transid in exacs:
            data = exacs.pop(
                transid)  # pop so no-match means no data in the end
            for entrezid in ensembl2entrez.get(ensid, [ensid]):
                exacs[entrezid] = data

    load_done('[%d, %s]' % (len(exacs), timesofar(t0)))

    return exacs
Example #18
def main(no_confirm=True):

    src_dump = get_src_dump()
    download_list = get_file_list_for_download()
    if len(download_list) == 0:
        logging.info("No newer file found. Abort now.")
        sys.exit(0)

    doc = src_dump.find_one({'_id': 'ucsc'})
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)

    logfile = os.path.join(DATA_FOLDER, 'ucsc_dump.log')
    setup_logfile(logfile)

    # mark the download starts
    doc = {'_id': 'ucsc',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'lastmodified': latest_lastmodified,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(download_list, no_confirm)
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ucsc'}, {'$set': _updates})
Example #19
def load_x(idx, fieldname, cvt_fn=None):
    '''idx is 0-based column number'''
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()
    xli = []
    for ld in tabfile_feeder(DATAFILE, header=1, assert_column_no=VALID_COLUMN_NO):
        ld = listitems(ld, *(2, 19, idx))    # GeneID Ensembl(Gene) target_value
        for value in dupline_seperator(dupline=ld,
                                       dup_sep='; '):
            xli.append(value)

    ensembl2geneid = list2dict(list_nondup([(x[1], x[0]) for x in xli if x[0] != '' and x[1] != '']), 0, alwayslist=True)
    xli2 = []
    for entrez_id, ensembl_id, x_value in xli:
        if x_value:
            if cvt_fn:
                x_value = cvt_fn(x_value)
            if entrez_id:
                xli2.append((entrez_id, x_value))
            elif ensembl_id:
                entrez_id = ensembl2geneid.get(ensembl_id, None)
                if entrez_id:
                    for _eid in entrez_id:
                        xli2.append((_eid, x_value))
                else:
                    xli2.append((ensembl_id, x_value))

    gene2x = list2dict(list_nondup(xli2), 0)
    fn = lambda value: {fieldname: sorted(value) if isinstance(value, list) else value}
    gene2x = value_convert(gene2x, fn, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))

    return gene2x
Example #20
 def extract_worker_info(self, worker):
     info = OrderedDict()
     proc = worker.get("process")
     err = worker.get("err") and " !" or ""
     info["pid"] = str(worker["info"]["id"]) + err
     info["source"] = norm(worker["info"].get("source") or "", 25)
     info["category"] = norm(worker["info"].get("category") or "", 10)
     info["step"] = norm(worker["info"].get("step") or "", 20)
     info["description"] = norm(worker["info"].get("description") or "", 30)
     info["mem"] = proc and sizeof_fmt(proc.memory_info().rss)
     info["cpu"] = proc and "%.1f%%" % proc.cpu_percent()
     info["started_at"] = worker.get("started_at") or ""
     if worker.get("duration"):
         info["duration"] = worker["duration"]
     else:
         info["duration"] = timesofar(worker.get("started_at", 0))
     info["files"] = []
     if proc:
         for pfile in proc.open_files():
             # skip files opened in append mode ('a'), e.g. the logger
             if pfile.mode == 'r':
                 finfo = OrderedDict()
                 finfo["path"] = pfile.path
                 finfo["read"] = sizeof_fmt(pfile.position)
                 size = os.path.getsize(pfile.path)
                 finfo["size"] = sizeof_fmt(size)
                 info["files"].append(finfo)
     return info
Example #21
def main(no_confirm=True):

    src_dump = get_src_dump()
    download_list = get_file_list_for_download()
    if len(download_list) == 0:
        logging.info("No newer file found. Abort now.")
        sys.exit(0)

    doc = src_dump.find_one({'_id': 'ucsc'})
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)

    logfile = os.path.join(DATA_FOLDER, 'ucsc_dump.log')
    setup_logfile(logfile)

    # mark the download starts
    doc = {
        '_id': 'ucsc',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'lastmodified': latest_lastmodified,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(download_list, no_confirm)
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ucsc'}, {'$set': _updates})
Example #22
    def register_status(self,status,**extra):
        """
        Register step status, ie. status for a sub-resource
        """
        upload_info = {"status" : status}
        upload_info.update(extra)
        job_key = "upload.jobs.%s" % self.name

        if status == "uploading":
            # record some "in-progress" information
            upload_info['step'] = self.name # this is the actual collection name
            upload_info['temp_collection'] = self.temp_collection_name
            upload_info['pid'] = os.getpid()
            upload_info['logfile'] = self.logfile
            upload_info['started_at'] = datetime.datetime.now()
            self.src_dump.update_one({"_id":self.main_source},{"$set" : {job_key : upload_info}})
        else:
            # only register time when it's a final state
            # also, keep previous uploading information
            upd = {}
            for k,v in upload_info.items():
                upd["%s.%s" % (job_key,k)] = v
            t1 = round(time.time() - self.t0, 0)
            upd["%s.status" % job_key] = status
            upd["%s.time" % job_key] = timesofar(self.t0)
            upd["%s.time_in_s" % job_key] = t1
            upd["%s.step" % job_key] = self.name # collection name
            self.src_dump.update_one({"_id" : self.main_source},{"$set" : upd})
Example #23
def load_broadinstitute_exac():
    print('DATA_FOLDER: ' + DATA_FOLDER)
    t0 = time.time()
    exacs = load_broadinstitute_exac_all()
    for k,v in load_broadinstitute_exac_nontcga().items():
        try:
            exacs[k]["exac"]["nontcga"] = v["exac"]["nontcga"]
        except KeyError:
            exacs[k] = v
    for k,v in load_broadinstitute_exac_nonpsych().items():
        try:
            exacs[k]["exac"]["nonpsych"] = v["exac"]["nonpsych"]
        except KeyError:
            exacs[k] = v

    logging.info("Convert transcript ID to EntrezID")
    import dataload.sources.ensembl.ensembl_base as ensembl_base
    ensembl_parser = ensembl_base.EnsemblParser()
    ensembl_parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0, alwayslist=True)
    ensembl_dir = get_data_folder("ensembl")  
    for line in tabfile_feeder(os.path.join(ensembl_dir,"gene_ensembl__translation__main.txt")):
        _,ensid,transid,_ = line
        if transid in exacs:
            data = exacs.pop(transid) # pop so no-match means no data in the end
            for entrezid in ensembl2entrez.get(ensid,[ensid]):
                exacs[entrezid] = data

    load_done('[%d, %s]' % (len(exacs), timesofar(t0)))

    return exacs
Example #24
 def extract_worker_info(self, worker):
     info = OrderedDict()
     proc = worker.get("process", worker)
     err = worker.get("err") and " !" or ""
     info["pid"] = str(worker["job"]["id"]) + err
     info["source"] = norm(worker["job"].get("source") or "", 25)
     info["category"] = norm(worker["job"].get("category") or "", 10)
     info["step"] = norm(worker["job"].get("step") or "", 20)
     info["description"] = norm(worker["job"].get("description") or "", 30)
     info["mem"] = sizeof_fmt(proc.get("memory", {}).get("size", 0.0))
     info["cpu"] = "%.1f%%" % proc.get("cpu", {}).get("percent", 0.0)
     info["started_at"] = worker["job"]["started_at"]
     if worker.get("duration"):
         info["duration"] = worker["duration"]
     else:
         info["duration"] = timesofar(worker["job"]["started_at"])
     # for now, don't display files used by the process
     info["files"] = []
     #if proc:
     #    for pfile in proc.open_files():
     #        # skip 'a' (logger)
     #        if pfile.mode == 'r':
     #            finfo = OrderedDict()
     #            finfo["path"] = pfile.path
     #            finfo["read"] = sizeof_fmt(pfile.position)
     #            size = os.path.getsize(pfile.path)
     #            finfo["size"] = sizeof_fmt(size)
     #            #info["files"].append(finfo)
     return info
Example #25
def run_jobs_on_ipythoncluster(worker, task_list, shutdown_ipengines_after_done=False):

    t0 = time.time()
    rc = Client(CLUSTER_CLIENT_JSON)
    lview = rc.load_balanced_view()
    cnt_nodes = len(lview.targets or rc.ids)
    print("\t# nodes in use: {}".format(cnt_nodes))
    lview.block = False

    print("\t# of tasks: {}".format(len(task_list)))
    print("\tsubmitting...", end='')
    job = lview.map_async(worker,task_list)
    print("done.")
    try:
        job.wait_interactive()
    except KeyboardInterrupt:
        #handle "Ctrl-C"
        if ask("\nAbort all submitted jobs?") == 'Y':
            lview.abort()
            print("Aborted, all submitted jobs are cancelled.")
        else:
            print("Aborted, but your jobs are still running on the cluster.")
        return

    if len(job.result()) != len(task_list):
        print("WARNING:\t# of results returned ({}) != # of tasks ({}).".format(len(job.result()), len(task_list)))
    print("\ttotal time: {}".format(timesofar(t0)))

    if shutdown_ipengines_after_done:
        print("\tshutting down all ipengine nodes...", end='')
        lview.shutdown()
        print('Done.')
    return job.result()
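
A hypothetical invocation, assuming an ipyparallel cluster is already running and CLUSTER_CLIENT_JSON points at its client profile; the worker function must be picklable on the engines:

def square(task):
    # trivial placeholder worker; a real worker would parse or load data
    return task * task

results = run_jobs_on_ipythoncluster(square, list(range(1000)))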
Example #26
    def handle_src_build(self):

        #cleanup src and target collections
        src_clean_archives(noconfirm=True)
        target_clean_collections(noconfirm=True)

        for config in ('mygene', 'mygene_allspecies'):
            t0 = time.time()
            p = Popen(['python', '-m', 'databuild.builder', config],
                      cwd=src_path)
            returncode = p.wait()
            t = timesofar(t0)
            if returncode == 0:
                msg = 'Dispatcher:  "{}" builder finished successfully with code {} (time: {})'.format(
                    config, returncode, t)
                color = "green"
            else:
                msg = 'Dispatcher:  "{}" builder failed with code {} (time: {})'.format(
                    config, returncode, t)
                color = "red"
            print(msg)
            if hipchat_msg:
                msg += '<a href="{}/log/build/{}">build log</a>'.format(
                    DATA_WWW_ROOT_URL, config)
                hipchat_msg(msg, message_format='html', color=color)

            assert returncode == 0, "Subprocess failed. Check error above."
        genedoc_merged.send(self)
Example #27
    def get_thread_summary(self):
        running_tids = self.get_thread_files()
        tchildren = self.thread_queue._threads
        res = {}
        for child in tchildren:
            res[child.name] = {
                "is_alive": child.is_alive(),
                "is_daemon": child.daemon,
            }

            if child.name in running_tids:
                # something is running on that child process
                worker = running_tids[child.name]
                res[child.name]["job"] = {
                    "started_at": worker["job"]["started_at"],
                    "duration": timesofar(worker["job"]["started_at"], 0),
                    "func_name": worker["func_name"],
                    "category": worker["job"]["category"],
                    "description": worker["job"]["description"],
                    "source": worker["job"]["source"],
                    "step": worker["job"]["step"],
                    "id": worker["job"]["id"],
                }

        return res
Example #28
    def apply_changes(self, changes, verify=True, noconfirm=False):
        if verify:
            self.pre_verify_changes(changes)

        if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
            print("Aborted.")
            return -1
        step = self.step
        _db = get_target_db()
        source_col = _db[changes['source']]
        src = GeneDocMongoDBBackend(source_col)
        target = GeneDocESBackend(self)
        _timestamp = changes['timestamp']

        def _add_docs(ids):
            i = 0
            for _ids in iter_n(ids, step):
                t1 = time.time()
                _doc_li = src.mget_from_ids(_ids)
                for _doc in _doc_li:
                    _doc['_timestamp'] = _timestamp
                    i += 1
                target.insert(_doc_li)
                print('\t{}\t{}'.format(i, timesofar(t1)))

        t0 = time.time()
        if changes['add']:
            print("Adding {} new docs...".format(len(changes['add'])))
            t00 = time.time()
            _add_docs(changes['add'])
            print("done. [{}]".format(timesofar(t00)))
        if changes['delete']:
            print("Deleting {} discontinued docs...".format(len(changes['delete'])), end='')
            t00 = time.time()
            target.remove_from_ids(changes['delete'], step=step)
            print("done. [{}]".format(timesofar(t00)))
        if changes['update']:
            print("Updating {} existing docs...".format(len(changes['update'])))
            t00 = time.time()
            ids = [x['_id'] for x in changes['update']]
            _add_docs(ids)
            print("done. [{}]".format(timesofar(t00)))

        target.finalize()

        print("\n")
        print("Finished.", timesofar(t0))
Example #29
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']

    print("\t{}\trecords will be added.".format(len(changes['add'])))
    print("\t{}\trecords will be deleted.".format(len(changes['delete'])))
    print("\t{}\trecords will be updated.".format(len(changes['update'])))

    print()
    print('\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name,
                                         sync_src.name))
    print('\tsync_target\t{:<45}{}'.format(sync_target.target_esidxer.ES_INDEX_NAME,
                                           sync_target.name))

    if noconfirm or ask("Continue?") == 'Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer

        if len(changes['add']) > 0:
            print("Adding {} new records...".format(len(changes['add'])))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['delete']) > 0:
            print("Deleting {} old records...".format(len(changes['delete'])))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['update']) > 0:
            print("Updating {} existing records...".format(len(changes['update'])))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))
        print('=' * 20)
        print('Finished. [{}]'.format(timesofar(t00)))
Example #30
def main():
    no_confirm = True  # set it to True for running this script automatically without intervention.

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' %
                       DATA_FOLDER) == 'Y'):
            sys.exit()

    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    src_dump = get_src_dump()
    doc = {
        '_id': 'entrez',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True  # a flag to trigger data uploading
    }

    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
Example #31
    def _db_upload(self, doc_li, step=10000, verbose=True):
        import time
        from biothings.utils.common import timesofar
        from utils.dataload import list2dict, list_itemcnt, listsort

        output = []
        t0 = time.time()
        for i in range(0, len(doc_li), step):
            output.extend(self.target_db.update(doc_li[i:i+step]))
            if verbose:
                print('\t%d-%d Done [%s]...' % (i+1, min(i+step, len(doc_li)), timesofar(t0)))

        res = list2dict(list_itemcnt([x[0] for x in output]), 0)
        print("Done![%s, %d OK, %d Error]" % (timesofar(t0), res.get(True, 0), res.get(False, 0)))
        res = listsort(list_itemcnt([x[2].args[0] for x in output if x[0] is False]), 1, reverse=True)
        print('\n'.join(['\t%s\t%d' % x for x in res[:10]]))
        if len(res) > 10:
            print("\t%d lines omitted..." % (len(res)-10))
Example #32
def main(daemon=False):
    running_processes = {}
    while 1:
        src_to_update_li = check_mongo()
        if src_to_update_li:
            print('\nDispatcher:  found pending jobs ', src_to_update_li)
            for src_to_update in src_to_update_li:
                if src_to_update not in running_processes:
                    mark_upload_started(src_to_update)
                    p = dispatch(src_to_update)
                    src_dump.update({'_id': src_to_update}, {"$set": {"upload.pid": p.pid}})
                    p.t0 = time.time()
                    running_processes[src_to_update] = p

        jobs_finished = []
        if running_processes:
            print('Dispatcher:  {} active job(s)'.format(len(running_processes)))
            print(get_process_info(running_processes))

        for src in running_processes:
            p = running_processes[src]
            returncode = p.poll()
            if returncode is not None:
                t1 = round(time.time() - p.t0, 0)
                d = {'upload.returncode': returncode,
                     'upload.timestamp': datetime.now(),
                     'upload.time_in_s': t1,
                     'upload.time': timesofar(p.t0),
                     'upload.logfile': p.logfile,
                     }
                if returncode == 0:
                    print('Dispatcher:  {} finished successfully with code {} (time: {}s)'.format(src, returncode, t1))
                    d['upload.status'] = "success"
                else:
                    print('Dispatcher:  {} failed with code {} (time: {}s)'.format(src, returncode, t1))
                    d['upload.status'] = "failed"

                mark_upload_done(src, d)
                jobs_finished.append(src)
                p.log_f.close()
            else:
                p.log_f.flush()
        for src in jobs_finished:
            del running_processes[src]

        if running_processes:
            time.sleep(10)
        else:
            if daemon:
                # continue monitoring the src_dump collection
                print("{}".format('\b' * 50), end='')
                for i in range(100):
                    print('\b' * 2 + [chr(8212), '\\', '|', '/'][i % 4], end='')
                    time.sleep(0.1)
            else:
                break
Example #33
 def _add_docs(ids):
     i = 0
     for _ids in iter_n(ids, step):
         t1 = time.time()
         _doc_li = src.mget_from_ids(_ids)
         for _doc in _doc_li:
             _doc['_timestamp'] = _timestamp
             i += 1
         target.insert(_doc_li)
         print('\t{}\t{}'.format(i, timesofar(t1)))
Example #34
 def _add_docs(ids):
     i = 0
     for _ids in iter_n(ids, step):
         t1 = time.time()
         _doc_li = src.mget_from_ids(_ids)
         for _doc in _doc_li:
             _doc['_timestamp'] = _timestamp
             i += 1
         target.insert(_doc_li)
         print('\t{}\t{}'.format(i, timesofar(t1)))
Example #35
def main_cron(no_confirm=True):
    '''set no_confirm to True for running this script automatically
       without intervention.'''

    src_dump = get_src_dump()
    mart_version = chk_latest_mart_version()
    logging.info("Checking latest mart_version:\t%s" % mart_version)

    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version)
    setup_logfile(logfile)

    #mark the download starts
    doc = {'_id': 'ensembl',
           'release': mart_version,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))

        BM.get_profile(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})
Example #36
def main():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit()

    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    src_dump = get_src_dump()
    doc = {'_id': 'entrez',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }

    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
Example #37
    def get_process_summary(self):
        running_pids = self.get_pid_files()
        res = {}
        for child in self.pchildren:
            try:
                mem = child.memory_info().rss
                pio = child.io_counters()
                # TODO: cpu as reported here isn't reliable; the only way to get something
                # consistent is to call cpu_percent() with a waiting-time argument to integrate
                # CPU activity over that interval, but this is a blocking call and freezes the hub
                # (an async implementation might be possible though). Currently, pchildren is a list
                # set at init time where process objects are stored, so subsequent cpu_percent()
                # calls should report CPU activity since the last call (between /job_manager & top()
                # calls), but it consistently returns CPU > 100% even when no thread is running (that
                # could have been the explanation but it's not).
                cpu = child.cpu_percent()
                res[child.pid] = {
                    "memory": {
                        "size": child.memory_info().rss,
                        "percent": child.memory_percent(),
                    },
                    "cpu": {
                        # override status() when we have cpu activity to avoid
                        # reporting a "sleeping" process that's actually running something
                        # (probably due to the delay between status() and cpu_percent(), like a race condition)
                        "status": cpu > 0.0 and "running" or child.status(),
                        "percent": cpu
                    },
                    "io": {
                        "read_count": pio.read_count,
                        "write_count": pio.write_count,
                        "read_bytes": pio.read_bytes,
                        "write_bytes": pio.write_bytes
                    }
                }

                if child.pid in running_pids:
                    # something is running on that child process
                    worker = running_pids[child.pid]
                    res[child.pid]["job"] = {
                        "started_at": worker["job"]["started_at"],
                        "duration": timesofar(worker["job"]["started_at"], 0),
                        "func_name": worker["func_name"],
                        "category": worker["job"]["category"],
                        "description": worker["job"]["description"],
                        "source": worker["job"]["source"],
                        "step": worker["job"]["step"],
                        "id": worker["job"]["id"],
                    }
            except psutil.NoSuchProcess as e:
                print("child not found %s %s" % (child, e))
                continue

        return res
Example #38
    def apply_changes(self, changes):
        step = self.step
        target_col = self._target_col
        source_col = self._db[changes['source']]
        src = GeneDocMongoDBBackend(source_col)
        target = GeneDocMongoDBBackend(target_col)
        _timestamp = changes['timestamp']

        t0 = time.time()
        if changes['add']:
            logging.info("Adding {} new docs...".format(len(changes['add'])))
            t00 = time.time()
            for _ids in iter_n(changes['add'], step):
                _doc_li = src.mget_from_ids(_ids)
                for _doc in _doc_li:
                    _doc['_timestamp'] = _timestamp
                target.insert(_doc_li)
            logging.info("done. [{}]".format(timesofar(t00)))
        if changes['delete']:
            logging.info("Deleting {} discontinued docs...".format(
                len(changes['delete'])))
            t00 = time.time()
            target.remove_from_ids(changes['delete'], step=step)
            logging.info("done. [{}]".format(timesofar(t00)))

        if changes['update']:
            logging.info("Updating {} existing docs...".format(
                len(changes['update'])))
            t00 = time.time()
            i = 0
            t1 = time.time()
            for _diff in changes['update']:
                target.update_diff(_diff, extra={'_timestamp': _timestamp})
                i += 1
                if i > 1 and i % step == 0:
                    logging.info('\t{}\t{}'.format(i, timesofar(t1)))
                    t1 = time.time()
            logging.info("done. [{}]".format(timesofar(t00)))
        logging.info("\n")
        logging.info("Finished. %s" % timesofar(t0))
Exemplo n.º 39
0
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene_allspecies'
    use_parallel = '-p' in sys.argv
    noconfirm = '-b' in sys.argv
    if config == 'clean':
        clean_target_collection()
    else:
        t0 = time.time()
        build_index(config, use_parallel=use_parallel, noconfirm=noconfirm)
        print("Finished.", timesofar(t0))
Exemplo n.º 40
0
    def _finished(self, _doc, _job):
        doc = self._col.find_one({'_id': self._id})
        job = doc["jobs"][-1]

        t0 = job["step_started_at"].timestamp()
        job["time_in_s"] = round(time() - t0, 0)
        job["time"] = timesofar(t0)

        if self.regx:
            merge(doc, _doc)
        merge(job, _job)

        self._col.replace_one({"_id": self._id}, doc)
Exemplo n.º 41
0
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene_allspecies'
    use_parallel = '-p' in sys.argv
    noconfirm = '-b' in sys.argv
    if config == 'clean':
        clean_target_collection()
    else:
        t0 = time.time()
        build_index(config, use_parallel=use_parallel, noconfirm=noconfirm)
        print("Finished.", timesofar(t0))
Exemplo n.º 42
0
    def apply_changes(self, changes):
        step = self.step
        target_col = self._target_col
        source_col = self._db[changes['source']]
        src = GeneDocMongoDBBackend(source_col)
        target = GeneDocMongoDBBackend(target_col)
        _timestamp = changes['timestamp']

        t0 = time.time()
        if changes['add']:
            logging.info("Adding {} new docs...".format(len(changes['add'])))
            t00 = time.time()
            for _ids in iter_n(changes['add'], step):
                _doc_li = src.mget_from_ids(_ids)
                for _doc in _doc_li:
                    _doc['_timestamp'] = _timestamp
                target.insert(_doc_li)
            logging.info("done. [{}]".format(timesofar(t00)))
        if changes['delete']:
            logging.info("Deleting {} discontinued docs...".format(len(changes['delete'])))
            t00 = time.time()
            target.remove_from_ids(changes['delete'], step=step)
            logging.info("done. [{}]".format(timesofar(t00)))

        if changes['update']:
            logging.info("Updating {} existing docs...".format(len(changes['update'])))
            t00 = time.time()
            i = 0
            t1 = time.time()
            for _diff in changes['update']:
                target.update_diff(_diff, extra={'_timestamp': _timestamp})
                i += 1
                if i > 1 and i % step == 0:
                    logging.info('\t{}\t{}'.format(i, timesofar(t1)))
                    t1 = time.time()
            logging.info("done. [{}]".format(timesofar(t00)))
        logging.info("\n")
        logging.info("Finished. %s" % timesofar(t0))
Exemplo n.º 43
0
    def doc_feeder(self, step=10000, verbose=True, query=None, scroll='10m', **kwargs):
        q = query if query else {'query': {'match_all': {}}}
        _q_cnt = self.count(q=q, raw=True)
        n = _q_cnt['count']
        n_shards = _q_cnt['_shards']['total']
        assert n_shards == _q_cnt['_shards']['successful']
        _size = int(step / n_shards)
        assert _size * n_shards == step
        cnt = 0
        t0 = time.time()
        if verbose:
            print('\ttotal docs: {}'.format(n))
            t1 = time.time()

        res = self._es.search(self._index, self._doc_type, body=q,
                              size=_size, search_type='scan', scroll=scroll, **kwargs)
        # double check initial scroll request returns no hits
        assert len(res['hits']['hits']) == 0

        while 1:
            if verbose:
                t1 = time.time()
                if cnt < n:
                    print('\t{}-{}...'.format(cnt+1, min(cnt+step, n)), end='')
            res = self._es.scroll(res['_scroll_id'], scroll=scroll)
            if len(res['hits']['hits']) == 0:
                break
            else:
                for doc in res['hits']['hits']:
                    yield doc['_source']
                    cnt += 1
                if verbose:
                    print('done.[%.1f%%,%s]' % (min(cnt, n)*100./n, timesofar(t1)))

        if verbose:
            print("Finished! [{}]".format(timesofar(t0)))

        assert cnt == n, "Error: scroll query terminated early [{}, {}], please retry.\nLast response:\n{}".format(cnt, n, res)
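The scan/scroll pattern above relies on search_type='scan', which was removed in later Elasticsearch releases. A rough equivalent (not from the original code) can be sketched with the official elasticsearch.helpers.scan generator; the client URL and index name below are placeholders.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

def doc_feeder_scan(es_client, index, query=None, step=1000, scroll='10m'):
    # Stream the _source of every hit; the helper manages the scroll context.
    q = query or {'query': {'match_all': {}}}
    for hit in scan(es_client, query=q, index=index, size=step, scroll=scroll):
        yield hit['_source']

es = Elasticsearch("http://localhost:9200")   # assumes a local ES instance
# for doc in doc_feeder_scan(es, 'mygene_current'):   # index name is illustrative
#     ...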
Exemplo n.º 44
0
    def process(self, iterable, batch_size):
        self.logger.info("Uploading to the DB...")
        t0 = time.time()
        tinner = time.time()
        total = 0
        for doc_li in self.doc_iterator(iterable,
                                        batch=True,
                                        batch_size=batch_size):
            try:
                bob = self.temp_collection.initialize_unordered_bulk_op()
                for d in doc_li:
                    bob.find({"_id": d["_id"]}).upsert().replace_one(d)
                res = bob.execute()
                nb = res["nUpserted"] + res["nModified"]
                total += nb
                self.logger.info("Upserted %s records [%s]" %
                                 (nb, timesofar(tinner)))
            except Exception as e:
                raise
            tinner = time.time()
        self.logger.info('Done[%s]' % timesofar(t0))

        return total
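For reference, initialize_unordered_bulk_op() was deprecated in pymongo 3.x and removed in 4.x; the same upsert-by-_id loop could be written with bulk_write() and ReplaceOne, as in the untested sketch below (a variant, not the original uploader code).

from pymongo import ReplaceOne

def process_bulk_write(self, iterable, batch_size):
    # Same upsert logic as above, expressed with pymongo's bulk_write() API.
    total = 0
    for doc_li in self.doc_iterator(iterable, batch=True, batch_size=batch_size):
        ops = [ReplaceOne({"_id": d["_id"]}, d, upsert=True) for d in doc_li]
        res = self.temp_collection.bulk_write(ops, ordered=False)
        total += res.upserted_count + res.modified_count
    return total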
Exemplo n.º 45
0
def parse_vcf(assembly,
              vcf_infile,
              compressed=True,
              verbose=True,
              by_id=True,
              **tabix_params):
    t0 = time.time()
    compressed = vcf_infile.endswith('.gz')   # infer compression from the file extension
    vcf_r = Reader(filename=vcf_infile, compressed=compressed)
    vcf_r.fetch('1', 1)  # call a dummy fetch to initialize vcf_r._tabix
    if tabix_params:
        vcf_r.reader = vcf_r._tabix.fetch(**tabix_params)
    cnt_1, cnt_2, cnt_3 = 0, 0, 0
    for rec in vcf_r:
        doc = parse_one_rec(assembly, rec)
        if by_id:
            # one hgvs id, one doc
            if doc['_id']:
                if isinstance(doc['_id'], list):
                    for i, _id in enumerate(doc['_id']):
                        _doc = copy.copy(doc)
                        _doc['alt'] = doc['alt'][i]
                        _doc[assembly] = doc[assembly][i]
                        _doc['_id'] = _id
                        yield _doc
                        cnt_2 += 1
                        if verbose:
                            logging.info("%s\t%s" %
                                         (_doc['rsid'], _doc['_id']))

                else:
                    yield doc
                    cnt_2 += 1
                    if verbose:
                        logging.info("%s\t%s" % (doc['rsid'], doc['_id']))
            else:
                cnt_3 += 1
        else:
            # one rsid, one doc
            if doc['_id']:
                yield doc
                cnt_2 += 1
                if verbose:
                    logging.info("%s\t%s" % (doc['rsid'], doc['_id']))
            else:
                cnt_3 += 1
        cnt_1 += 1
    logging.info("Done. [{}]".format(timesofar(t0)))
    logging.info("Total rs: {}; total docs: {}; skipped rs: {}".format(
        cnt_1, cnt_2, cnt_3))
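A hypothetical usage sketch for the generator above, assuming a tabix-indexed dbSNP VCF; the file name and region are illustrative, and the keyword names (reference/start/end) follow pysam's fetch() signature, which is what the tabix_params are forwarded to.

from itertools import islice

docs = parse_vcf('hg19', 'dbsnp_chr1.vcf.gz', verbose=False,
                 reference='1', start=1000000, end=2000000)
for doc in islice(docs, 5):
    print(doc['_id'])   # print the first few HGVS ids produced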
Exemplo n.º 46
0
    def process(self, doc_d, batch_size):
        self.logger.info("Uploading to the DB...")
        t0 = time.time()
        total = 0
        for doc_li in self.doc_iterator(doc_d,
                                        batch=True,
                                        batch_size=batch_size):
            self.temp_collection.insert(doc_li,
                                        manipulate=False,
                                        check_keys=False)
            total += len(doc_li)
        self.logger.info('Done[%s]' % timesofar(t0))

        return total
Exemplo n.º 47
0
    def _db_upload(self, doc_li, step=10000, verbose=True):
        import time
        from biothings.utils.common import timesofar
        from utils.dataload import list2dict, list_itemcnt, listsort

        output = []
        t0 = time.time()
        for i in range(0, len(doc_li), step):
            output.extend(self.target_db.update(doc_li[i:i + step]))
            if verbose:
                print('\t%d-%d Done [%s]...' %
                      (i + 1, min(i + step, len(doc_li)), timesofar(t0)))

        res = list2dict(list_itemcnt([x[0] for x in output]), 0)
        print("Done![%s, %d OK, %d Error]" %
              (timesofar(t0), res.get(True, 0), res.get(False, 0)))
        res = listsort(list_itemcnt(
            [x[2].args[0] for x in output if x[0] is False]),
                       1,
                       reverse=True)
        print('\n'.join(['\t%s\t%d' % x for x in res[:10]]))
        if len(res) > 10:
            print("\t%d lines omitted..." % (len(res) - 10))
Exemplo n.º 48
0
def load_exons_for_species(species, exons_key='exons'):
    refflat_file = os.path.join(DATA_FOLDER, species,
                                'database/refFlat.txt.gz')

    load_start(refflat_file)
    t0 = time.time()

    ref2exons = {}
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chr = ld[2]
        if chr.startswith('chr'):
            chr = chr[3:]
        exons = list(
            zip([int(x) for x in ld[9].split(',') if x],
                [int(x) for x in ld[10].split(',') if x]))
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.setdefault(refseq, []).append({
            'transcript': refseq,
            'chr': chr,
            'strand': -1 if ld[3] == '-' else 1,
            'txstart': int(ld[4]),
            'txend': int(ld[5]),
            'cdsstart': int(ld[6]),
            'cdsend': int(ld[7]),
            'position': exons
        })

    gene2exons = {}
    for refseq in sorted(ref2exons.keys()):
        geneid = refseq2gene.get(refseq, None)
        if geneid and geneid != '0':
            if geneid not in gene2exons:
                gene2exons[geneid] = {exons_key: ref2exons[refseq]}
            else:
                gene2exons[geneid][exons_key].extend(ref2exons[refseq])

    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))

    return gene2exons
Exemplo n.º 49
0
    def backup_timestamp(self, outfile=None, compress=True):
        '''Backup "_id" and "_timestamp" fields into an output file.'''
        ts = time.strftime('%Y%m%d')
        outfile = outfile or self._target_col.name + '_tsbk_' + ts + '.txt'
        if compress:
            outfile += '.bz'
            import bz2
        logging.info('Backing up timestamps into "{}"...'.format(outfile))
        t0 = time.time()
        file_handler = bz2.BZ2File if compress else open
        with file_handler(outfile, 'wb') as out_f:
            for doc in doc_feeder(self._target_col, step=100000, fields=['_timestamp']):
                data = '%s\t%s\n' % (doc['_id'], doc['_timestamp'].strftime('%Y%m%d'))
                out_f.write(data.encode())
        logging.info("Done. %s" % timesofar(t0))
        return outfile
Exemplo n.º 50
0
def doc_feeder0(collection, step=1000, s=None, e=None, inbatch=False):
    '''An iterator for returning docs in a collection, using batched queries.'''
    n = collection.count()
    s = s or 1
    e = e or n
    print('Found %d documents in database "%s".' % (n, collection.name))
    for i in range(s - 1, e + 1, step):
        print("Processing %d-%d documents..." % (i + 1, i + step), end='')
        t0 = time.time()
        res = collection.find(skip=i, limit=step, timeout=False)
        if inbatch:
            yield res
        else:
            for doc in res:
                yield doc
        print('Done.[%s]' % timesofar(t0))
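A hypothetical usage sketch for doc_feeder0; the database and collection names are made up, and it assumes a pymongo version that still supports the legacy count()/timeout arguments used above.

from pymongo import MongoClient

client = MongoClient()                      # assumes a local MongoDB instance
genes = client['genedoc']['genes']          # hypothetical db/collection names
for batch in doc_feeder0(genes, step=5000, inbatch=True):
    ids = [doc['_id'] for doc in batch]     # consume each batch of documents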
Exemplo n.º 51
0
def main_cron():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    logging.info("Checking latest refseq release:\t", end='')
    refseq_release = get_refseq_release()
    logging.info(refseq_release)

    src_dump = get_src_dump()
    doc = src_dump.find_one({'_id': 'refseq'})
    if doc and 'release' in doc and refseq_release <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'complete.109.rna.gbff.gz')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(REFSEQ_FOLDER, str(refseq_release))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'refseq_dump.log')
    setup_logfile(logfile)

    # mark the start of the download
    doc = {'_id': 'refseq',
           'release': refseq_release,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        download(DATA_FOLDER, refseq_release, no_confirm=no_confirm)
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'refseq'}, {'$set': _updates})
Exemplo n.º 52
0
def load_cpdb(__metadata__):
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = __metadata__['pathway_sources_included']
    VALID_COLUMN_NO = 4

    t0 = time.time()
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATA_FILES = []
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_mouse.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_yeast.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_human.tab'))

    _out = []
    for DATA_FILE in DATA_FILES:
        load_start(DATA_FILE)
        for ld in tabfile_feeder(DATA_FILE, header=1, assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]
            if p_source in PATHWAY_SOURCES_INCLUDED:
                genes = ld[-1].split(",")
                for gene in genes:
                    _out.append((gene, p_name, p_id, p_source))
        load_done()
    _out = list2dict(_out, 0, alwayslist=True)

    def _inner_cvt(p):
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _cvt(pli):
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                _d[p_source].sort()
        return {'pathway': _d}

    _out = dict_convert(_out, valuefn=_cvt)
    load_done('[%d, %s]' % (len(_out), timesofar(t0)))

    return _out
Exemplo n.º 53
0
def load_ucsc_exons():
    print('DATA_FOLDER: ' + DATA_FOLDER)
    species_li = os.listdir(DATA_FOLDER)
    print("Found {} species folders.".format(len(species_li)))
    t0 = time.time()
    gene2exons = {}
    for species in species_li:
        print(species, end='...')
        if species == 'Homo_sapiens':
            gene2exons.update(load_exons_for_human())
        elif species == 'Mus_musculus':
            gene2exons.update(load_exons_for_mouse())
        else:
            gene2exons.update(load_exons_for_species(species))

    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))

    return gene2exons
Exemplo n.º 54
0
def parse_vcf(assembly, vcf_infile, compressed=True, verbose=True, by_id=True, **tabix_params):
    t0 = time.time()
    compressed = vcf_infile.endswith('.gz')   # infer compression from the file extension
    vcf_r = Reader(filename=vcf_infile, compressed=compressed)
    vcf_r.fetch('1', 1)   # call a dummy fetch to initialize vcf_r._tabix
    if tabix_params:
        vcf_r.reader = vcf_r._tabix.fetch(**tabix_params)
    cnt_1, cnt_2, cnt_3 = 0, 0, 0
    for rec in vcf_r:
        doc = parse_one_rec(assembly, rec)
        if by_id:
            # one hgvs id, one doc
            if doc['_id']:
                if isinstance(doc['_id'], list):
                    for i, _id in enumerate(doc['_id']):
                        _doc = copy.copy(doc)
                        _doc['alt'] = doc['alt'][i]
                        _doc[assembly] = doc[assembly][i]
                        _doc['_id'] = _id
                        yield _doc
                        cnt_2 += 1
                        if verbose:
                            logging.info("%s\t%s" % (_doc['rsid'], _doc['_id']))

                else:
                    yield doc
                    cnt_2 += 1
                    if verbose:
                        logging.info("%s\t%s" % (doc['rsid'], doc['_id']))
            else:
                cnt_3 += 1
        else:
            # one rsid, one doc
            if doc['_id']:
                yield doc
                cnt_2 += 1
                if verbose:
                    logging.info("%s\t%s" % (doc['rsid'], doc['_id']))
            else:
                cnt_3 += 1
        cnt_1 += 1
    logging.info("Done. [{}]".format(timesofar(t0)))
    logging.info("Total rs: {}; total docs: {}; skipped rs: {}".format(cnt_1, cnt_2, cnt_3))
Exemplo n.º 55
0
def update_from_temp_collections(config, no_confirm=False, use_parallel=False):
    t0 = time.time()
    sc = GeneDocSyncer(config)
    new_src_li = sc.get_new_source_list()
    if not new_src_li:
        logging.info("No new source collections need to update. Abort now.")
        return

    logging.info("Found {} new source collections need to update:".format(len(new_src_li)))
    logging.info("\n".join(['\t' + x for x in new_src_li]))

    if no_confirm or ask('Continue?') == 'Y':
        logfile = 'databuild_sync_{}_{}.log'.format(config, time.strftime('%Y%m%d'))
        logfile = os.path.join(LOG_FOLDER, logfile)
        setup_logfile(logfile)

        for src in new_src_li:
            t0 = time.time()
            logging.info("Current source collection: %s" % src)
            ts = _get_timestamp(src, as_str=True)
            logging.info("Calculating changes... ")
            changes = sc.get_changes(src, use_parallel=use_parallel)
            logging.info("Done")
            get_changes_stats(changes)
            if no_confirm or ask("Continue to save changes...") == 'Y':
                if config == 'genedoc_mygene':
                    dumpfile = 'changes_{}.pyobj'.format(ts)
                else:
                    dumpfile = 'changes_{}_allspecies.pyobj'.format(ts)
                dump(changes, dumpfile)
                dumpfile_key = 'genedoc_changes/' + dumpfile
                logging.info('Saving to S3: "{}"... '.format(dumpfile_key))
                send_s3_file(dumpfile, dumpfile_key)
                logging.info('Done.')
                #os.remove(dumpfile)

            if no_confirm or ask("Continue to apply changes...") == 'Y':
                sc.apply_changes(changes)
                sc.verify_changes(changes)
            logging.info('=' * 20)
            logging.info("Finished. %s" % timesofar(t0))
Exemplo n.º 56
0
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene_allspecies'
    use_parallel = '-p' in sys.argv
    sources = None  # will build all sources
    target = None   # will generate a new collection name
    # "target_col:src_col1,src_col2" will specifically merge src_col1
    # and src_col2 into existing target_col (instead of merging everything)
    if not use_parallel and len(sys.argv) > 2:
        target, tmp = sys.argv[2].split(":")
        sources = tmp.split(",")

    t0 = time.time()
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    bdr.using_ipython_cluster = use_parallel
    bdr.merge(sources=sources, target=target)

    logging.info("Finished. %s" % timesofar(t0))
Exemplo n.º 57
0
def load_exons_for_species(species, exons_key='exons'):
    refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz')

    load_start(refflat_file)
    t0 = time.time()

    ref2exons = {}
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chr = ld[2]
        if chr.startswith('chr'):
            chr = chr[3:]
        exons = list(zip([int(x) for x in ld[9].split(',') if x],
                     [int(x) for x in ld[10].split(',') if x]))
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.setdefault(refseq, []).append({
            'transcript': refseq,
            'chr': chr,
            'strand': -1 if ld[3] == '-' else 1,
            'txstart': int(ld[4]),
            'txend': int(ld[5]),
            'cdsstart': int(ld[6]),
            'cdsend': int(ld[7]),
            'position': exons
        })

    gene2exons = {}
    for refseq in sorted(ref2exons.keys()):
        geneid = refseq2gene.get(refseq, None)
        if geneid and geneid != '0':
            if geneid not in gene2exons:
                gene2exons[geneid] = {exons_key: ref2exons[refseq]}
            else:
                gene2exons[geneid][exons_key].extend(ref2exons[refseq])

    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))

    return gene2exons
Exemplo n.º 58
0
def main(no_confirm=True):

    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    doc = src_dump.find_one({'_id': 'uniprot'})
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        path, filename = os.path.split(DATAFILE_PATH)
        data_file = os.path.join(doc['data_folder'], filename)
        if os.path.exists(data_file):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'uniprot_dump.log')
    setup_logfile(logfile)

    # mark the start of the download
    doc = {'_id': 'uniprot',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'lastmodified': lastmodified,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(no_confirm)
    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'uniprot'}, {'$set': _updates})
Exemplo n.º 59
0
    def merge(self, step=100000, restart_at=0, sources=None, target=None):
        t0 = time.time()
        self.validate_src_collections(sources)
        self.prepare_target(target_name=target)
        self.log_building_start()
        try:
            if self.using_ipython_cluster:
                if sources:
                    raise NotImplemented("merge speficic sources not supported when using parallel")
                self._merge_ipython_cluster(step=step)
            else:
                self._merge_local(step=step, restart_at=restart_at, src_collection_list=sources)

            if self.target.name == 'es':
                logging.info("Updating metadata...")
                self.update_mapping_meta()

            t1 = round(time.time() - t0, 0)
            t = timesofar(t0)
            self.log_src_build({'status': 'success',
                                'time': t,
                                'time_in_s': t1,
                                'timestamp': datetime.now()})

        finally:
            #do a simple validation here
            if getattr(self, '_stats', None):
                logging.info("Validating...")
                target_cnt = self.target.count()
                if target_cnt == self._stats['total_genes']:
                    logging.info("OK [total count={}]".format(target_cnt))
                else:
                    logging.info("Warning: total count of gene documents does not match [{}, should be {}]".format(target_cnt, self._stats['total_genes']))

            if self.merge_logging:
                sys.stdout.close()