Example #1
 def load(self):
     if self.map is None:
         # this is a whole dict containing all entrez _ids, whether current or retired ones.
         # it means most of the data has the same _id as both key and value. It consumes memory
         # but it's a way to know the entrez perimeter (what entrez _ids exist and should be considered)
         self.map = loadobj(
             ("entrez_gene__geneid_d.pyobj", self.db_provider()),
             mode='gridfs')
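
All of the snippets in this collection revolve around loadobj and its counterpart dump from the BioThings utilities (biothings.utils.common). As a rough mental model only — a minimal sketch assuming a plain pickle round-trip, ignoring the GridFS-backed mode used in Example #1 — they behave roughly like this:

import pickle

def dump(obj, filename, protocol=pickle.HIGHEST_PROTOCOL):
    # Sketch: serialize obj to filename (the real helper may also compress).
    with open(filename, "wb") as fh:
        pickle.dump(obj, fh, protocol=protocol)

def loadobj(filename):
    # Sketch: load back an object previously written with dump().
    with open(filename, "rb") as fh:
        return pickle.load(fh)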
Example #2
 def load_chr_data(self):
     self.logger.info("\tLoading chromosome data from '%s'..." % self.genome)
     try:
         self._chr_data = loadobj(self.genome)
     except Exception as e:
         self.logger.info(e)
         raise
     self.logger.info("Done.")
Example #3
 def main(self, diff_filepath, merge_collection, field):
     diff = loadobj(diff_filepath)
     source_collection = diff['source']
     add_ids = diff['add']
     delete_ids = diff['delete']
     update_ids = [_doc['_id'] for _doc in diff['update']]
     self.add_update(source_collection, merge_collection, add_ids)
     self.add_update(source_collection, merge_collection, update_ids)
     self.delete(merge_collection, field, delete_ids)
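
This example, and the sync workers further down, all read the same diff-file layout: a dict with a 'source' collection name, 'add' and 'delete' lists of _ids, and an 'update' list of per-document JSON-Patch entries. A hypothetical object of that shape (field names taken from how the code reads it, values made up) looks like:

example_diff = {
    "source": "collection_new",    # hypothetical collection name
    "add": ["id1", "id2"],         # _ids only present in the new collection
    "delete": ["id3"],             # _ids only present in the old collection
    "update": [                    # JSON-Patch operations per updated document
        {"_id": "id4",
         "patch": [{"op": "replace", "path": "/field", "value": "new"}]},
    ],
}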
Example #4
 def update_mapping():
     diffm = os.path.join(diff_folder, diff_mapping_file)
     ops = loadobj(diffm)
     mapping = indexer.get_mapping()
     # we should have the same doc type declared in the mapping
     mapping[doc_type]["properties"] = jsonpatch.apply_patch(
         mapping[doc_type]["properties"], ops)
     res = indexer.update_mapping(mapping)
     return res
Example #5
def restore(db, archive, drop=False):
    """Restore database from given archive. If drop is True, then delete existing collections"""
    data = loadobj(archive)
    for colname in data:
        docs = data[colname]
        col = db[colname]
        if drop:
            # we don't have a drop command but we can remove all docs
            col.remove({})
        for doc in docs:
            col.save(doc)
Example #6
        def analyze(diff_file, detailed):
            data = loadobj(diff_file)
            sources[data["source"]] = 1
            if detailed:
                # TODO: if self-contained, no db connection needed
                new_col = create_backend(metadata["new"]["backend"])
                old_col = create_backend(metadata["old"]["backend"])
            if len(adds) < max_reported_ids:
                if detailed:
                    # look for which root keys were added in new collection
                    for _id in data["add"]:
                        # selfcontained = dict for whole doc (see TODO above)
                        if type(_id) == dict:
                            _id = _id["_id"]
                        doc = new_col.get_from_id(_id)
                        rkeys = sorted(doc.keys())
                        adds["ids"].append([_id, rkeys])
                else:
                    if data["add"] and type(data["add"][0]) == dict:
                        adds["ids"].extend([d["_id"] for d in data["add"]])
                    else:
                        adds["ids"].extend(data["add"])
            adds["count"] += len(data["add"])
            if len(dels) < max_reported_ids:
                if detailed:
                    # look for which root keys were deleted in old collection
                    for _id in data["delete"]:
                        doc = old_col.get_from_id(_id)
                        rkeys = sorted(doc.keys())
                        dels["ids"].append([_id, rkeys])
                else:
                    dels["ids"].extend(data["delete"])
            dels["count"] += len(data["delete"])
            for up in data["update"]:
                for patch in up["patch"]:
                    update_details[patch["op"]].setdefault(
                        patch["path"], {
                            "count": 0,
                            "ids": []
                        })
                    if len(update_details[patch["op"]][patch["path"]]
                           ["ids"]) < max_reported_ids:
                        update_details[patch["op"]][
                            patch["path"]]["ids"].append(up["_id"])
                    update_details[patch["op"]][patch["path"]]["count"] += 1
            update_details["count"] += len(data["update"])

            assert len(
                sources
            ) == 1, "Should have one datasource from diff files, got: %s" % [
                s for s in sources
            ]
Example #7
 def load(self):
     if self.map is None:
         self.retired2current.load()
         self.map = {}
         ensembl2entrez_li = loadobj(
             ("ensembl_gene__2entrezgene_list.pyobj", self.db_provider()),
             mode='gridfs')
         # filter out those deprecated entrez gene ids
         for ensembl_id, entrez_id in ensembl2entrez_li:
             entrez_id = int(entrez_id)
             if entrez_id in self.retired2current:
                 self.map[ensembl_id] = self.retired2current.translate(
                     entrez_id)
Example #8
def restore(archive, drop=False):
    """Restore database from given archive. If drop is True, then delete existing collections"""
    data = loadobj(archive)
    # use src_dump collection which always exists to get the database object
    db = get_src_dump().database
    for colname in data:
        docs = data[colname]
        col = db[colname]
        if drop:
            # we don't have a drop command but we can remove all docs
            col.remove({})
        for doc in docs:
            col.save(doc)
Example #9
    def load_data(self, data_folder):
        """
        Loads gene data from NCBI's gene2refseq.gz file.
        Parses it based on genomic position data and refseq status, using the
        list of taxids from get_ref_microbe_taxids() as a lookup table.
        :return:
        """

        taxids_file = os.path.join(data_folder, "../ref_microbe_taxids.pyobj")
        datafile = os.path.join(data_folder, 'gene2refseq.gz')

        taxids = loadobj(taxids_file)
        taxid_set = set(taxids)

        def _includefn(ld):
            return ld[0] in taxid_set  # match taxid from taxid_set

        cols_included = [0, 1, 7, 9, 10, 11]  # 0-based col idx
        gene2genomic_pos_li = tab2list(datafile,
                                       cols_included,
                                       header=1,
                                       includefn=_includefn)
        count = 0
        last_id = None
        for gene in gene2genomic_pos_li:
            count += 1
            strand = 1 if gene[5] == '+' else -1
            _id = gene[1]

            mgi_dict = {
                '_id': _id,
                'genomic_pos': {
                    'entrezgene': _id,
                    'start': int(gene[3]),
                    'end': int(gene[4]),
                    'chr': gene[2],
                    'strand': strand
                }
            }
            if _id != last_id:
                # rows with dup _id will be skipped
                yield mgi_dict
            last_id = _id
Example #10
 def mapping_diffed(f):
     res = f.result()
     if res.get("mapping_file"):
         nonlocal got_error
         # check mapping differences: only "add" ops are allowed, as any other action would be
         # ignored by ES once applied (you can't update/delete elements of an existing mapping)
         mf = os.path.join(diff_folder, res["mapping_file"]["name"])
         ops = loadobj(mf)
         for op in ops:
             if op["op"] != "add":
                 err = DifferException("Found diff operation '%s' in mapping file, " % op["op"] + \
                     " only 'add' operations are allowed. You can still produce the " + \
                     "diff by removing 'mapping' from 'steps' arguments. " + \
                     "Ex: steps=['count','content']. Diff operation was: %s" % op)
                 got_error = err
         metadata["diff"]["mapping_file"] = mf
         diff_stats["mapping_changed"] = True
     self.logger.info(
         "Diff file containing mapping differences generated: %s" %
         res.get("mapping_file"))
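
The mapping diff loaded here is just a list of JSON-Patch operations, so the add-only check above can be reproduced with the jsonpatch package directly; a small, hypothetical illustration (mapping fragment and paths made up):

import jsonpatch

properties = {"gene": {"type": "keyword"}}    # hypothetical mapping fragment
ops = [{"op": "add", "path": "/symbol", "value": {"type": "keyword"}}]

assert all(op["op"] == "add" for op in ops)   # mirrors the allowed-operations check
patched = jsonpatch.apply_patch(properties, ops)
# patched == {"gene": {"type": "keyword"}, "symbol": {"type": "keyword"}}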
Example #11
def load_genedoc(self):
    """
    Loads gene data from NCBI's gene2refseq.gz file.
    Parses it based on genomic position data and refseq status, using the
    list of taxids from get_ref_microbe_taxids() as a lookup table.
    :return:
    """
    taxids = loadobj(TAXIDS_FILE)
    taxid_set = set(taxids)
    load_start(DATAFILE)

    def _includefn(ld):
        return ld[0] in taxid_set  # match taxid from taxid_set

    cols_included = [0, 1, 7, 9, 10, 11]  # 0-based col idx
    gene2genomic_pos_li = tab2list(DATAFILE,
                                   cols_included,
                                   header=1,
                                   includefn=_includefn)
    count = 0
    last_id = None
    for gene in gene2genomic_pos_li:
        count += 1
        strand = 1 if gene[5] == '+' else -1
        _id = gene[1]

        mgi_dict = {
            '_id': _id,
            'genomic_pos': {
                'start': int(gene[3]),
                'end': int(gene[4]),
                'chr': gene[2],
                'strand': strand
            }
        }
        if _id != last_id:
            # rows with dup _id will be skipped
            yield mgi_dict
        last_id = _id

    load_done('[%d]' % count)
Example #12
 def reset_synced(self, diff_folder, backend=None):
     """
     Remove "synced" flag from any pyobj file in diff_folder
     """
     diff_files = glob.glob(os.path.join(diff_folder, "*.pyobj"))
     for diff in diff_files:
         pyobj = loadobj(diff)
         try:
             if pyobj.get("synced"):
                 if backend:
                     self.logger.info(
                         "Removing synced flag from '%s' for backend '%s'" %
                         (diff, backend))
                     pyobj["synced"].pop(backend, None)
                 else:
                     self.logger.info("Removing synced flag from '%s'" %
                                      diff)
                     pyobj.pop("synced")
                 dump(pyobj, diff)
         except AttributeError:
             # pyobj not a dict
             continue
Example #13
def sync_es_jsondiff_worker(diff_file,
                            es_config,
                            new_db_col_names,
                            batch_size,
                            cnt,
                            force=False,
                            selfcontained=False,
                            metadata={}):
    """Worker to sync data between a new mongo collection and an elasticsearch index"""
    new = create_backend(new_db_col_names)  # mongo collection to sync from
    indexer = create_backend(es_config).target_esidxer
    diff = loadobj(diff_file)
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if the diff file was already synced
    if not force and diff.get("synced", {}).get("es") == True:
        logging.info("Diff file '%s' already synced, skip it" % diff_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    assert new.target_collection.name == diff[
        "source"], "Source is different in diff file '%s': %s" % (
            diff_file, diff["source"])

    errors = []
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
    for docs in iter_n(cur, batch_size):
        try:
            res["added"] += indexer.index_bulk(docs,
                                               batch_size,
                                               action="create")[0]
        except BulkIndexError:
            for doc in docs:
                try:
                    # force action=create to spot docs already added
                    indexer.index(doc, doc["_id"], action="create")
                    res["added"] += 1
                except ConflictError:
                    # already added
                    res["skipped"] += 1
                    continue
                except Exception as e:
                    errors.append({
                        "_id": doc["_id"],
                        "file": diff_file,
                        "error": e
                    })
                    import pickle
                    pickle.dump(errors, open("errors", "wb"))
                    raise

    # update: get doc from indexer and apply diff
    batch = []
    ids = [p["_id"] for p in diff["update"]]
    for i, doc in enumerate(indexer.get_docs(ids)):
        try:
            patch_info = diff["update"][i]  # same order as what's returned by get_docs()...
            assert patch_info["_id"] == doc["_id"]  # ... but just make sure
            newdoc = jsonpatch.apply_patch(doc, patch_info["patch"])
            if newdoc == doc:
                # already applied
                res["skipped"] += 1
                continue
            batch.append(newdoc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += indexer.index_bulk(batch, batch_size)[0]
            batch = []
    if batch:
        res["updated"] += indexer.index_bulk(batch, batch_size)[0]

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]

    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    diff.setdefault("synced", {}).setdefault("es", True)
    dump(diff, diff_file)
    return res
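
The sync workers above and below rely on iter_n to cut cursors and id lists into batch_size chunks; as a sketch of the assumed behaviour (the real helper lives in the BioThings utils and may differ in details):

from itertools import islice

def iter_n(iterable, n):
    # Sketch: yield successive lists of at most n items from iterable.
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk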
Example #14
def sync_es_coldhot_jsondiff_worker(diff_file,
                                    es_config,
                                    new_db_col_names,
                                    batch_size,
                                    cnt,
                                    force=False,
                                    selfcontained=False,
                                    metadata={},
                                    debug=False):
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if the diff file was already synced
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it" %
                     os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    eskwargs = {}
    # pass optional ES Indexer args
    if hasattr(btconfig, "ES_TIMEOUT"):
        eskwargs["timeout"] = btconfig.ES_TIMEOUT
    if hasattr(btconfig, "ES_MAX_RETRY"):
        eskwargs["max_retries"] = btconfig.ES_MAX_RETRY
    if hasattr(btconfig, "ES_RETRY"):
        eskwargs["retry_on_timeout"] = btconfig.ES_RETRY
    logging.debug("Create ES backend with args: (%s,%s)" %
                  (es_config, eskwargs))
    bckend = create_backend(es_config, **eskwargs)
    indexer = bckend.target_esidxer
    diff = loadobj(diff_file)

    # add: diff between hot collections showed we have new documents but it's
    # possible some of those docs already exist in premerge/cold collection.
    # if so, they should be treated as dict.update() where the hot document content
    # has precedence over the cold content for fields in common
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        new = create_backend(new_db_col_names)  # mongo collection to sync from
        assert new.target_collection.name == diff[
            "source"], "Source is different in diff file '%s': %s" % (
                diff_file, diff["source"])
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
    for docs in iter_n(cur, batch_size):
        # remove potential existing _timestamp from document
        # (not allowed within an ES document (_source))
        [d.pop("_timestamp", None) for d in docs]
        # check which docs already exist in existing index (meaning they exist in cold collection)
        dids = dict([(d["_id"], d) for d in docs])
        dexistings = dict([
            (d["_id"], d) for d in indexer.get_docs([k for k in dids.keys()])
        ])
        logging.debug("From current batch, %d already exist" % len(dexistings))
        # remove existing docs from "add" so the rest of the dict will be treated
        # as "real" added documents while update existing ones with new content
        toremove = []
        for _id, d in dexistings.items():
            # update in-place
            if d == dids[d["_id"]]:
                logging.debug("%s was already added, skip it" % d["_id"])
                toremove.append(d["_id"])
                res["skipped"] += 1
            else:
                newd = copy.deepcopy(d)
                d.update(dids[d["_id"]])
                if d == newd:
                    logging.debug("%s was already updated, skip it" % d["_id"])
                    toremove.append(d["_id"])
                    res["skipped"] += 1
            dids.pop(d["_id"])
        for _id in toremove:
            dexistings.pop(_id)
        logging.info("Syncing 'add' documents (%s in total) from cold/hot merge: " % len(docs)
                     + "%d documents will be updated as they already exist in the index, " % len(dexistings)
                     + "%d documents will be added (%d skipped as already processed)" % (len(dids), len(toremove)))
        # treat real "added" documents
        # Note: no need to check for "already exists" errors, as we already checked that before
        # in order to know what to do
        try:
            res["added"] += indexer.index_bulk(dids.values(),
                                               batch_size,
                                               action="create")[0]
        except BulkIndexError:
            logging.error("Error while adding documents %s" %
                          [k for k in dids.keys()])
        # update docs that already exist in the cold collection
        try:
            res["updated"] += indexer.index_bulk(dexistings.values(),
                                                 batch_size)[0]
        except BulkIndexError as e:
            logging.error(
                "Error while updating (via new hot detected docs) documents: %s"
                % e)

    # update: get doc from indexer and apply diff
    # note: it's the same process as for non-coldhot
    sync_es_for_update(diff_file, indexer, diff["update"], batch_size, res,
                       debug)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]

    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res
Example #15
def sync_es_jsondiff_worker(diff_file,
                            es_config,
                            new_db_col_names,
                            batch_size,
                            cnt,
                            force=False,
                            selfcontained=False,
                            metadata={},
                            debug=False):
    """Worker to sync data between a new mongo collection and an elasticsearch index"""
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if the diff file was already synced
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it" %
                     os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    eskwargs = {}
    # pass optional ES Indexer args
    if hasattr(btconfig, "ES_TIMEOUT"):
        eskwargs["timeout"] = btconfig.ES_TIMEOUT
    if hasattr(btconfig, "ES_MAX_RETRY"):
        eskwargs["max_retries"] = btconfig.ES_MAX_RETRY
    if hasattr(btconfig, "ES_RETRY"):
        eskwargs["retry_on_timeout"] = btconfig.ES_RETRY
    logging.debug("Create ES backend with args: (%s,%s)" %
                  (es_config, eskwargs))
    bckend = create_backend(es_config, **eskwargs)
    indexer = bckend.target_esidxer
    diff = loadobj(diff_file)
    errors = []
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        new = create_backend(new_db_col_names)  # mongo collection to sync from
        assert new.target_collection.name == diff[
            "source"], "Source is different in diff file '%s': %s" % (
                diff_file, diff["source"])
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
    for docs in iter_n(cur, batch_size):
        # remove potential existing _timestamp from document
        # (not allowed within an ES document (_source))
        [d.pop("_timestamp", None) for d in docs]
        try:
            res["added"] += indexer.index_bulk(docs,
                                               batch_size,
                                               action="create")[0]
        except BulkIndexError:
            for doc in docs:
                _id = doc.pop("_id")
                try:
                    # force action=create to spot docs already added
                    indexer.index(doc, _id, action="create")
                    res["added"] += 1
                except ConflictError:
                    # already added
                    logging.warning("_id '%s' already added" % _id)
                    res["skipped"] += 1
                    continue
                except Exception as e:
                    errors.append({"_id": _id, "file": diff_file, "error": e})
                    import pickle
                    pickle.dump(errors, open("errors", "wb"))
                    raise
        except Exception as e:
            if debug:
                logging.error(
                    "From diff file '%s', following IDs couldn't be synced because: %s\n%s"
                    % (diff_file, e, [d.get("_id") for d in docs]))
                pickfile = "batch_%s_%s.pickle" % (cnt,
                                                   os.path.basename(diff_file))
                logging.error("Documents pickled in '%s'" % pickfile)
                pickle.dump(docs, open(pickfile, "wb"))
            raise

    # update: get doc from indexer and apply diff
    sync_es_for_update(diff_file, indexer, diff["update"], batch_size, res,
                       debug)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]

    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res
Example #16
def sync_mongo_jsondiff_worker(diff_file,
                               old_db_col_names,
                               new_db_col_names,
                               batch_size,
                               cnt,
                               force=False,
                               selfcontained=False,
                               metadata={},
                               debug=False):
    """Worker to sync data between a new and an old mongo collection"""
    # check if the diff file was already synced
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it" %
                     os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    storage = UpsertStorage(get_target_db(), old.target_collection.name,
                            logging)
    diff = loadobj(diff_file)
    assert new.target_collection.name == diff[
        "source"], "Source is different in diff file '%s': %s" % (
            diff_file, diff["source"])

    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, not mongo needed
        for docs in iter_n(diff["add"], batch_size):
            res["added"] += storage.process((d for d in docs), batch_size)
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
        for docs in iter_n(cur, batch_size):
            # use generator otherwise process/doc_iterator will require a dict (that's bad...)
            res["added"] += storage.process((d for d in docs), batch_size)

    # update: get doc from "old" and apply diff
    batch = []
    for patch_info in diff["update"]:
        doc = old.get_from_id(patch_info["_id"])
        try:
            doc = jsonpatch.apply_patch(doc, patch_info["patch"])
            batch.append(doc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += storage.process((d for d in batch), batch_size)
            batch = []
    if batch:
        res["updated"] += storage.process((d for d in batch), batch_size)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        res["deleted"] += old.remove_from_ids(ids)

    # we potentially modified the "old" collection so invalidate cache just to make sure
    invalidate_cache(old.target_collection.name, "target")
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res
Example #17
def get_geneid_d(data_folder,
                 species_li=None,
                 load_cache=True,
                 save_cache=True,
                 only_for={}):
    '''return a dictionary of current/retired geneid to current geneid mapping.
       This is useful when other annotations that were mapped to geneids may
       contain retired gene ids.

       If species_li is None, genes from all species are loaded.

       Note that all ids are int type.
    '''
    if species_li:
        taxid_set = set(
            [TAXONOMY[species]["tax_id"] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(data_folder)

    # check cache file
    _cache_file = 'geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene_info.gz') and \
       file_newer(_cache_file, 'gene_history.gz'):
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(data_folder, 'gene_info.gz')
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set and (
            only_for and ld[1] in only_for)
    elif only_for:
        species_filter = lambda ld: only_for and ld[1] in only_for
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))

    DATAFILE = os.path.join(data_folder, 'gene_history.gz')

    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li  # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2),
                            1,
                            alwayslist=0,
                            includefn=_includefn)
    # includefn above makes sure taxid is for species_li and filters out those
    # mapped_to geneid exists in gene_info list

    # convert key/value to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    # TODO: this fills memory with key==value ...
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d
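
A hypothetical usage sketch of the mapping returned above (folder path and species key are assumptions), translating a possibly-retired geneid to its current one:

geneid_d = get_geneid_d("/path/to/entrez_data", species_li=["human"])  # assumed args
raw_id = 123456789                 # made-up id, may be retired or current
current_id = geneid_d.get(raw_id)  # None if the id is outside the loaded perimeter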
Example #18
 def load_chr_data(self, genome_file):
     print("\tLoading chromosome data...", end='')
     self._chr_data = loadobj(genome_file)
     print("Done.")
Example #19
def load_pdb(data_folder):
    pdb_dumpfile = os.path.join(data_folder, 'gene2pdb.pyobj')
    data = loadobj(pdb_dumpfile)
    return data
Example #20
def load_pir(data_folder):
    pir_dumpfile = os.path.join(data_folder, 'gene2pir.pyobj')
    data = loadobj(pir_dumpfile)
    return data
Example #21
def load_data(step=1000, offset=0, gwas_data_local=None):
    if gwas_data_local:
        gwas_data = loadobj('gwasdata.pyobj')
        for item in gwas_data:
            snp = item
            chrom = snp[1]
            chrom = chrom[3:]
            rsid = snp[4]
            pubMedID = snp[5]
            title = snp[9]
            trait = snp[10]
            region = snp[13]
            gene_name = snp[14]
            riskAllele = snp[15]
            riskAlleleFreq = snp[16]
            if not is_float(riskAlleleFreq):
                riskAlleleFreq = None
            pValue = snp[17]
            pValue_desc = snp[18]
            if not is_float(pValue):
                pValue = None
                pValue_desc = None
            # parse from myvariant.info to get hgvs_id,
            # ref, alt information based on rsid
            url = 'http://localhost:8000/v1/query?q=dbsnp.rsid:'\
                + rsid + '&fields=_id,dbsnp.ref,dbsnp.alt,dbsnp.chrom,dbsnp.hg19'
            r = requests.get(url)
            for hits in r.json()['hits']:
                HGVS = hits['_id']

                one_snp_json = {
                    "_id": HGVS,
                    "gwassnp": {
                        "rsid": rsid,
                        "pubmed": pubMedID,
                        "title": title,
                        "trait": trait,
                        "region": region,
                        "genename": gene_name,
                        "risk_allele": riskAllele,
                        "risk_allele_freq": riskAlleleFreq,
                        "pvalue": pValue,
                        "pvalue_desc": pValue_desc
                    }
                }
                yield one_snp_json
    else:
        MySQLHG19 = MySQLdb.connect('genome-mysql.cse.ucsc.edu',
                                    db='hg19',
                                    user='******',
                                    passwd='password')
        Cursor = MySQLHG19.cursor()

        # get the row number of gwasCatalog
        sql = "SELECT COUNT(*) FROM gwasCatalog"
        Cursor.execute(sql)
        numrows = Cursor.fetchone()[0]
        print(numrows)

        sql = "SELECT * FROM gwasCatalog"
        Cursor.execute(sql)

        for i in range(numrows):
            snp = Cursor.fetchone()
            if i and i % step == 0:
                print(i)

            chrom = snp[1]
            chrom = chrom[3:]
            rsid = snp[4]
            pubMedID = snp[5]
            title = snp[9]
            trait = snp[10]
            region = snp[13]
            gene_name = snp[14]
            riskAllele = snp[15]
            riskAlleleFreq = snp[16]
            if not is_float(riskAlleleFreq):
                riskAlleleFreq = None
            pValue = snp[17]
            pValue_desc = snp[18]
            if not is_float(pValue):
                pValue = None
                pValue_desc = None
            # parse from myvariant.info to get hgvs_id, ref, alt information based on rsid
            url = 'http://localhost:8000/v1/query?q=dbsnp.rsid:'\
                + rsid + '&fields=_id,dbsnp.ref,dbsnp.alt,dbsnp.chrom,dbsnp.hg19'
            r = requests.get(url)
            for hits in r.json()['hits']:
                HGVS = hits['_id']
                one_snp_json = {
                    "_id": HGVS,
                    "gwassnp": {
                        "rsid": rsid,
                        "pubmed": pubMedID,
                        "title": title,
                        "trait": trait,
                        "region": region,
                        "genename": gene_name,
                        "risk_allele": riskAllele,
                        "risk_allele_freq": riskAlleleFreq,
                        "pvalue": pValue,
                        "pvalue_desc": pValue_desc
                    }
                }
                yield one_snp_json
Example #22
def sync_from_one_diff(index, collection, diff_filepath, validate=False, wait=60, dryrun=False, returncnt=False, save2file=None):
    sync = ESSyncer(index=index)
    #sync._index = index
    #sync._esi._index = index
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_iter = sync.add(source_collection, diff['add'])
    delete_iter = sync.delete(collection, diff['delete'])
    update_iter = sync.update2(diff['update'], collection, source_collection)
    t00 = time()
    if save2file:
        from itertools import chain
        import json
        for op in chain(add_iter, delete_iter, update_iter):
            json.dump(op, save2file)
        print("="*20)
        print("Finished! [{}]".format(timesofar(t00)))
        return

    print('Adding new {} docs...'.format(len(diff['add'])))
    t0 = time()
    if not dryrun:
        try:
            bulk(sync._es, add_iter)
        except:
            pass
    print("Done. [{}]".format(timesofar(t0)))

    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, delete_iter)
    print("Done. [{}]".format(timesofar(t0)))

    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, update_iter)
    print("Done. [{}]".format(timesofar(t0)))

    # add flush and refresh
    try:
        res = sync._es.indices.flush()
        print("Flushing...", res)
        res = sync._es.indices.refresh()
        print("Refreshing...", res)
    except:
        pass

    print("="*20)
    print("Finished! [{}]".format(timesofar(t00)))

    if returncnt:
        cnt = {
            'add': len(diff['add']),
            'delete': len(diff['delete']),
            'update': len(diff['update'])
        }
        return cnt

    if validate:
        print('Waiting {}s to let ES to finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": 'clinvar'
                        }
                    }
                }
            }
        }
        data = sync._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        sync._src[temp_collection].drop()
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
        sync._src[temp_collection].drop()
        print("Done. [{}]".format(t0))
        return diff_result