Example #1
    def __init__(self, G, *args, **kwargs):
        """
        The DataTransform MDB module is written as a decorator class and is
        meant to be applied to the load_data function of a BioThings uploader.
        The load_data function yields documents, which are then post-processed
        by the class's __call__ method, where the 'id' key conversion is
        performed.

        :param G: nx.DiGraph (networkx 2.1) configuration graph
        :param input_types: A list of input types of the form (identifier, field), where identifier matches a node in the graph and field is an optional dotstring field indicating where the identifier should be read from (the default is '_id').
        :param output_types: A priority list of identifiers to convert to. These identifiers should match nodes in the graph.
        :type output_types: list(str)
        :param skip_on_failure: If True, documents where identifier conversion fails will be skipped in the final document list.
        :type skip_on_failure: bool
        :param skip_w_regex: Do not perform conversion if the identifier matches the regular expression provided to this argument. By default, this option is disabled.
        :type skip_w_regex: str
        :param idstruct_class: Override an internal data structure used by this module (advanced usage)
        :type idstruct_class: class
        :param copy_from_doc: If True, an identifier is copied from the input source document regardless of whether it matches an edge or not. (advanced usage)
        :type copy_from_doc: bool
        """
        if not isinstance(G, nx.DiGraph):
            raise ValueError(
                "key_lookup configuration error:  G must be of type nx.DiGraph"
            )
        self._validate_graph(G)
        self.G = G
        self.logger, _ = get_logger('datatransform')

        super().__init__(*args, **kwargs)
        self._precompute_paths()
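
A minimal usage sketch of the decorator pattern described above. The DataTransformMDB class name is taken from the docstring; the graph content, field names, uploader base class and parser are illustrative placeholders, not part of the snippet:

import networkx as nx

# Hypothetical configuration graph; in practice each edge carries a lookup
# object describing how to convert one identifier type into another.
graph = nx.DiGraph()
graph.add_node("ensembl")
graph.add_node("entrez")
# graph.add_edge("ensembl", "entrez", object=...)  # conversion edge (details omitted)

# Decorate an uploader's load_data so yielded documents get their identifier converted.
keylookup = DataTransformMDB(graph,
                             input_types=[("ensembl", "ensembl.gene")],
                             output_types=["entrez", "ensembl"])

class MyUploader(SomeBaseUploader):          # placeholder uploader base class
    @keylookup
    def load_data(self, data_folder):
        for doc in parse_docs(data_folder):  # placeholder parser
            yield doc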
Example #2
    def __init__(self, managers, version_urls, indexer_factory=None, *args, **kwargs):
        """
        version_urls is a list of URLs pointing to versions.json files. The name
        of the data release is taken from the URL (http://...s3.amazon.com/<the_name>/versions.json)
        unless specified as a dict: {"name" : "custom_name", "url" : "http://..."}

        If indexer_factory is passed, it'll be used to create the indexer used to dump/check
        versions currently installed on ES, restore snapshots, index, etc... An indexer_factory
        is typically used to generate indexers dynamically (ES host, index name, etc...) according
        to the URLs, for instance. See the standalone.hub.DynamicIndexerFactory class for an
        example. It is typically used when lots of data releases are being managed by the Hub
        (so there is no need to manually update the STANDALONE_CONFIG parameter).

        If indexer_factory is None, a config param named STANDALONE_CONFIG is used,
        format is the following:

            {"_default" : {"es_host": "...", "index": "...", "doc_type" : "..."},
             "the_name" : {"es_host": "...", "index": "...", "doc_type" : "..."}}

        When a data release name (taken from the URL) matches an entry, that entry is used to
        configure which ES backend to target; otherwise the default one is used.
        """
        super().__init__(*args, **kwargs)
        self.version_urls = self.extract(version_urls)
        self.indexer_factory = indexer_factory
        self.managers = managers
        self.logger, _ = get_logger("autohub")
Example #3
def merger_worker(col_name, dest_name, ids, mapper, upsert, batch_num):
    try:
        src = mongo.get_src_db()
        tgt = mongo.get_target_db()
        col = src[col_name]
        #if batch_num == 2:
        #    raise ValueError("oula pa bon")
        dest = DocMongoBackend(tgt, tgt[dest_name])
        cur = doc_feeder(col,
                         step=len(ids),
                         inbatch=False,
                         query={'_id': {
                             '$in': ids
                         }})
        mapper.load()
        docs = mapper.process(cur)
        cnt = dest.update(docs, upsert=upsert)
        return cnt
    except Exception as e:
        logger_name = "build_%s_%s_batch_%s" % (dest_name, col_name, batch_num)
        logger, _ = get_logger(logger_name, btconfig.LOG_FOLDER)
        logger.exception(e)
        exc_fn = os.path.join(btconfig.LOG_FOLDER, "%s.pick" % logger_name)
        pickle.dump(e, open(exc_fn, "wb"))
        logger.info("Exception was dumped in pickle file '%s'" % exc_fn)
        raise
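
A sketch of how batches could be fanned out to merger_worker; the executor-based dispatch below is illustrative (the hub normally dispatches such workers through its job manager), and the mapper object is assumed to be picklable:

from concurrent.futures import ProcessPoolExecutor

def merge_collection(col_name, dest_name, id_batches, mapper, upsert=True):
    """Run merger_worker once per batch of _ids and sum the updated counts."""
    with ProcessPoolExecutor() as executor:
        futures = [
            executor.submit(merger_worker, col_name, dest_name, ids,
                            mapper, upsert, batch_num)
            for batch_num, ids in enumerate(id_batches)
        ]
        return sum(f.result() for f in futures)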
Example #4
def indexer_worker(col_name,
                   ids,
                   pindexer,
                   batch_num,
                   mode="index",
                   worker=new_index_worker):
    try:
        if mode in ["index", "merge"]:
            return worker(col_name, ids, pindexer, batch_num)
        elif mode == "resume":
            idxr = pindexer()
            es_ids = idxr.mexists(ids)
            missing_ids = [e[0] for e in es_ids if e[1] is False]
            if missing_ids:
                return worker(col_name, missing_ids, pindexer, batch_num)
            else:
                # fake indexer results, it has to be a tuple, first elem is num of indexed docs
                return (0, None)
    except Exception as e:
        logger_name = "index_%s_%s_batch_%s" % (pindexer.keywords.get(
            "index", "index"), col_name, batch_num)
        logger, _ = get_logger(logger_name, btconfig.LOG_FOLDER)
        logger.exception("indexer_worker failed")
        exc_fn = os.path.join(btconfig.LOG_FOLDER, "%s.pick" % logger_name)
        pickle.dump({"exc": e, "ids": ids}, open(exc_fn, "wb"))
        logger.info("Exception and IDs were dumped in pickle file '%s'",
                    exc_fn)
        raise
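
A self-contained sketch of the "resume" branch: pindexer is expected to be a functools.partial (so its keywords can be inspected on failure) and the object it builds must expose mexists(). The stub classes below stand in for a real ES indexer and index worker:

from functools import partial

class StubIndexer:
    """Stand-in for an ES indexer exposing mexists()."""
    def __init__(self, index):
        self.index = index
        self._existing = {"id1"}                 # pretend "id1" is already indexed
    def mexists(self, ids):
        return [(_id, _id in self._existing) for _id in ids]

def stub_worker(col_name, ids, pindexer, batch_num):
    return (len(ids), None)                      # pretend every id was indexed

pindexer = partial(StubIndexer, index="mygene_test")
cnt, _ = indexer_worker("mygene_src", ["id1", "id2"], pindexer, 0,
                        mode="resume", worker=stub_worker)
# cnt == 1: only the missing "id2" was (re)indexed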
Example #5
    def __init__(self,
                 input_types,
                 output_types,
                 id_priority_list=[],
                 skip_on_failure=False,
                 skip_w_regex=None,
                 skip_on_success=False,
                 idstruct_class=IDStruct,
                 copy_from_doc=False,
                 debug=False):
        # pylint: disable=R0913, W0102
        """
        Initialize the keylookup object and precompute paths from the
        start key to all target keys.

        The decorator is intended to be applied to the load_data function
        of an uploader.  The load_data function yields documents, which
        are then post-processed by the class's __call__ method, where the
        'id' key conversion is performed.

        :param G: nx.DiGraph (networkx 2.1) configuration graph
        :param collections: list of mongodb collection names
        :param input_types: key types to start the key lookup from
        :param output_types: list of all output types to convert to
        :param id_priority_list: A priority list of identifiers to
               sort input and output types by.
        :type id_priority_list: list(str)
        :param idstruct_class: IDStruct class used to manage/fetch IDs from docs
        :param copy_from_doc: if the transform fails using the graph, try to get
               the value from the document itself when output_type == input_type.
               No check is performed; it's a straight copy. If checks are needed
               (e.g. checking that an ID referenced in the doc actually exists in
               another collection), nodes with self-loops can be used, so
               ID resolution is forced to go through these loops to ensure the
               data exists.
        """
        self.input_types = self._parse_input_types(input_types)
        self.output_types = self._parse_output_types(output_types)
        self.id_priority_list = id_priority_list

        self.skip_on_failure = skip_on_failure
        self.skip_on_success = skip_on_success

        if skip_w_regex and not isinstance(skip_w_regex, str):
            raise ValueError('skip_w_regex must be a string')
        elif not skip_w_regex:
            self.skip_w_regex = None
        else:
            self.skip_w_regex = re.compile(skip_w_regex)

        self.idstruct_class = idstruct_class
        self.copy_from_doc = copy_from_doc

        self.histogram = Histogram()
        # Setup logger and logging level
        self.logger, _ = get_logger('datatransform')

        self.debug = debug
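
A sketch of how the constructor arguments might be shaped; the class name and the identifier/field names are illustrative (a concrete subclass normally supplies the graph, collections or API used for the actual lookups):

transform = DataTransform(
    input_types=[("ensembl", "ensembl.gene")],  # (identifier, dotstring field); field defaults to '_id'
    output_types=["entrez", "ensembl"],         # ordered by priority
    id_priority_list=["entrez", "ensembl"],
    skip_on_failure=True,
    skip_w_regex="^CHEMBL",                     # leave matching identifiers untouched
)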
Example #6
 def __init__(self, src_name, data_folder, load_ensembl2entrez=True):
     self.data_folder = data_folder
     self.ensembl2entrez_li = None
     self.ensembl_main = None
     if load_ensembl2entrez:
         self._load_ensembl2entrez_li(src_name)
         self.ensembl2entrez = list2dict(
             self.ensembl2entrez_li, 0, alwayslist=True)
     self.logger, self.logfile = get_logger("parse_%s" % src_name)
Example #7
    def __init__(self, index_name, snapshot_name, env_conf, build_doc):

        self.index_name = index_name
        self.snapshot_name = snapshot_name
        self.build_doc = build_doc
        self.env_conf = env_conf

        self.logger, self.logfile = get_logger(SNAPSHOOTER_CATEGORY,
                                               btconfig.LOG_FOLDER)
Example #8
    def __init__(self, input_types, output_types, *args, **kwargs):
        """
        Initialize the IDLookupAPI object.
        """
        self._generate_return_fields()
        super(DataTransformAPI, self).__init__(input_types, output_types,
                                               *args, **kwargs)

        # default value of None for client
        self.client = None

        # Keep track of one_to_many relationships
        self.one_to_many_cnt = 0

        self.logger, _ = get_logger('keylookup_api')
Example #9
def upload_worker(name, storage_class, loaddata_func, col_name, batch_size,
                  batch_num, *args):
    """
    Picklable job launcher, typically run from multiprocessing.
    storage_class will be instantiated with col_name, the destination
    collection name. loaddata_func is the parsing/loading function,
    called with `*args`.
    """
    data = []
    try:
        data = loaddata_func(*args)
        if type(storage_class) is tuple:
            klass_name = "_".join(
                [k.__class__.__name__ for k in storage_class])
            storage = type(klass_name, storage_class, {})(None, col_name,
                                                          loggingmod)
        else:
            storage = storage_class(None, col_name, loggingmod)
        return storage.process(data, batch_size)
    except Exception as e:
        logger_name = "%s_batch_%s" % (name, batch_num)
        logger, logfile = get_logger(logger_name, config.LOG_FOLDER)
        logger.exception(e)
        logger.error("Parameters:\nname=%s\nstorage_class=%s\n" %
                     (name, storage_class) +
                     "loaddata_func=%s\ncol_name=%s\nbatch_size=%s\n" %
                     (loaddata_func, col_name, batch_size) +
                     "args=%s" % repr(args))
        import pickle
        pickfile = os.path.join(os.path.dirname(logfile),
                                "%s.pick" % logger_name)
        try:
            pickle.dump(
                {
                    "exc": e,
                    "params": {
                        "name": name,
                        "storage_class": storage_class
                    },
                    "loaddata_func": loaddata_func,
                    "col_name": col_name,
                    "batch_size": batch_size,
                    "args": args
                }, open(pickfile, "wb"))
        except TypeError as ie:
            logger.warning("Could not pickle batch errors: %s" % ie)
        raise e
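
A sketch of dispatching upload_worker for a single batch; BasicStorage, the parser and the file path are placeholders standing in for a real storage class, parsing function and source file:

from concurrent.futures import ProcessPoolExecutor

def parse_tsv(input_file):
    """Placeholder parser yielding documents."""
    return [{"_id": "doc1", "value": 42}]

with ProcessPoolExecutor() as executor:
    fut = executor.submit(
        upload_worker,
        "my_source",            # name, used to build the per-batch logger
        BasicStorage,           # assumed storage class (e.g. from biothings.utils.storage)
        parse_tsv,              # loaddata_func
        "my_source_col",        # destination collection name
        1000,                   # batch_size
        1,                      # batch_num
        "/path/to/source.tsv",  # *args forwarded to parse_tsv
    )
    total_docs = fut.result()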
Example #10
def upload_worker(name, storage_class, loaddata_func, col_name,
                  batch_size, batch_num, *args):
    """
    Picklable job launcher, typically run from multiprocessing.
    storage_class will be instantiated with col_name, the destination
    collection name. loaddata_func is the parsing/loading function,
    called with *args.
    """
    try:
        data = loaddata_func(*args)
        storage = storage_class(None, col_name, loggingmod)
        return storage.process(data, batch_size)
    except Exception as e:
        logger_name = "%s_batch_%s" % (name, batch_num)
        logger, _ = get_logger(logger_name, config.LOG_FOLDER)
        logger.exception(e)
        raise
Example #11
 def __init__(self, source_list, features=None, name="BioThings Hub",
              managers_custom_args={}, api_config=None, reloader_config=None,
              dataupload_config=None, websocket_config=None):
     """
     Helper to set up and instantiate the common managers usually used in a hub
     (eg. dumper manager, uploader manager, etc...)
     "source_list" is either:
         - a list of strings corresponding to paths to datasource modules
         - a package containing sub-folders with datasource modules
     Specific managers can be selected by adjusting the "features" parameter, where
     each feature corresponds to one or more managers. The parameter defaults to all
     available features. Managers are configured/initialized in the same order as the
     list, so if a manager (eg. job_manager) is required by all others, it must be the
     first in the list.
     "managers_custom_args" is an optional dict used to pass specific arguments when
     initializing managers:
          managers_custom_args={"upload" : {"poll_schedule" : "*/5 * * * *"}}
     will set the poll schedule to check uploads every 5 minutes (instead of the default 10s).
     "reloader_config", "dataupload_config" and "websocket_config" can be used to
     customize the reloader, dataupload and websocket features. If None, the default
     config is used. If explicitly False, the feature is deactivated.
     """
     self.name = name
     self.source_list = source_list
     self.logger, self.logfile = get_logger("hub")
     self._passed_features = features
     self._passed_managers_custom_args = managers_custom_args
     self.features = self.clean_features(features or self.DEFAULT_FEATURES)
     self.managers_custom_args = managers_custom_args
     self.reloader_config = reloader_config or self.DEFAULT_RELOADER_CONFIG
     self.dataupload_config = dataupload_config or self.DEFAULT_DATAUPLOAD_CONFIG
     self.websocket_config = websocket_config or self.DEFAULT_WEBSOCKET_CONFIG
     self.ws_listeners = [] # collect listeners that should be connected (push data through) to websocket
     self.api_config = api_config or self.DEFAULT_API_CONFIG
     # set during configure()
     self.managers = None
     self.api_endpoints = None
     self.shell = None
     self.commands = None
     self.extra_commands = None
     self.routes = []
     # flag "do we need to configure?"
     self.configured = False
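
A hypothetical instantiation sketch matching the docstring above; the HubServer class name, the source module path and the feature names are assumptions for illustration, and the configure() call is implied by the comments in __init__:

server = HubServer(
    ["hub.dataload.sources.my_source"],   # source_list: paths to datasource modules
    name="My BioThings Hub",
    features=["job", "dump", "upload"],   # job manager first, since the others depend on it
    managers_custom_args={"upload": {"poll_schedule": "*/5 * * * *"}},
)
server.configure()                        # sets managers, commands, routes, etc.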
Example #12
    def __init__(self, build_doc, indexer_env, index_name):

        # build_doc primarily describes the source.
        # indexer_env primarily describes the destination.

        _build_doc = _BuildDoc(build_doc)
        _build_backend = _build_doc.parse_backend()

        # ----------source----------

        self.mongo_client_args = _build_backend.args
        self.mongo_database_name = _build_backend.dbs
        self.mongo_collection_name = _build_backend.col

        # -----------dest-----------

        # [1] https://elasticsearch-py.readthedocs.io/en/v7.12.0/api.html#elasticsearch.Elasticsearch
        # [2] https://elasticsearch-py.readthedocs.io/en/v7.12.0/helpers.html#elasticsearch.helpers.bulk
        self.es_client_args = indexer_env.get("args", {})  # See [1] for available args
        self.es_blkidx_args = indexer_env.get("bulk", {})  # See [2] for available args
        self.es_index_name = index_name or _build_doc.build_name
        self.es_index_settings = IndexSettings(
            deepcopy(DEFAULT_INDEX_SETTINGS))
        self.es_index_mappings = IndexMappings(
            deepcopy(DEFAULT_INDEX_MAPPINGS))

        _build_doc.enrich_settings(self.es_index_settings)
        _build_doc.enrich_mappings(self.es_index_mappings)

        # -----------info-----------

        self.env_name = indexer_env.get("name")
        self.conf_name = _build_doc.build_config.get("name")
        self.build_name = _build_doc.build_name

        self.logger, self.logfile = get_logger('index_%s' % self.es_index_name)
        self.pinfo = ProcessInfo(self, indexer_env.get("concurrency", 10))
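
A sketch of the indexer_env mapping read by this constructor; the values are placeholders ("args" feeds the Elasticsearch client and "bulk" feeds elasticsearch.helpers.bulk, per the [1]/[2] links above):

indexer_env = {
    "name": "prod",                                   # reported as self.env_name
    "args": {"hosts": "localhost:9200", "timeout": 300},
    "bulk": {"chunk_size": 50, "raise_on_exception": False},
    "concurrency": 3,                                 # passed to ProcessInfo (defaults to 10)
}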
Example #13
    def __init__(self, *args, **kwargs):
        """
        An example of config dict for this module.
        {
            "indexer_select": {
                None: "hub.dataindex.indexer.DrugIndexer", # default
                "build_config.cold_collection" : "mv.ColdHotVariantIndexer",
            },
            "env": {
                "prod": {
                    "host": "localhost:9200",
                    "indexer": {
                        "args": {
                            "timeout": 300,
                            "retry_on_timeout": True,
                            "max_retries": 10,
                        },
                        "bulk": {
                            "chunk_size": 50
                            "raise_on_exception": False
                        },
                        "concurrency": 3
                    },
                    "index": [
                        # for information only, only used in index_info
                        {"index": "mydrugs_current", "doc_type": "drug"},
                        {"index": "mygene_current", "doc_type": "gene"}
                    ],
                },
                "dev": { ... }
            }
        }
        """
        super().__init__(*args, **kwargs)
        self._srcbuild = get_src_build()
        self._config = {}

        self.logger, self.logfile = get_logger('indexmanager')
Example #14
 def setup_log(self):
     self.logger, self.logfile = get_logger("syncmanager")
Example #15
 def setup_log(self):
     return get_logger('upload_%s' % self.fullname)
Example #16
 def setup_log(self):
     self.logger, self.logfile = get_logger('sync')
Example #17
 def setup_log(self):
     self.logger, _ = get_logger('apimanager')
Example #18
 def setup_log(self):
     self.logger, _ = get_logger('keylookup')
Example #19
 def setup_log(self):
     """Setup and return a logger instance"""
     self.logger, self.logfile = get_logger('assistant_%s' %
                                            self.__class__.plugin_type)
Example #20
 def setup_log(self):
     """Setup and return a logger instance"""
     self.logger, self.logfile = get_logger('inspect')
Example #21
 def setup_log(self):
     self.logger, self.logfile = get_logger(SNAPSHOOTER_CATEGORY,
                                            self.log_folder)
Example #22
 def setup_log(self):
     """Setup and return a logger instance"""
     self.logger, self.logfile = get_logger('assistantmanager')
Example #23
 def setup_log(self):
     """Setup and return a logger instance"""
     self.logger, self.logfile = get_logger('loader_%s' % self.plugin_name)
Example #24
 def setup_log(self):
     self.logger, self.logfile = get_logger('indexmanager', self.log_folder)
Example #25
 def setup_log(self):
     self.logger, self.logfile = get_logger('index_%s' % self.index_name,
                                            self.log_folder)
Example #26
 def setup_log(self):
     """setup the logger member variable"""
     self.logger, _ = get_logger('datatransform')
Example #27
def _ensure_logger(logger):
    if not logger:
        return logging.getLogger(__name__)
    if isinstance(logger, str):
        return get_logger(logger)[0]
    return logger
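
The three accepted input forms, for reference; note that a string goes through get_logger and only the logger half of its (logger, logfile) return value is kept:

import logging

log = _ensure_logger(None)                     # falls back to logging.getLogger(__name__)
log = _ensure_logger("my_component")           # resolved via get_logger("my_component")[0]
log = _ensure_logger(logging.getLogger("x"))   # an existing Logger is passed through unchanged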
Example #28
import copy
import re

from biothings.hub.datatransform.datatransform import DataTransform
import networkx as nx
from networkx import all_simple_paths
import biothings.utils.mongo as mongo
from biothings.utils.loggers import get_logger
from biothings import config as btconfig
from biothings import config_for_app

# Configuration of collections from biothings config file
config_for_app(btconfig)

# Setup logger and logging level
kl_log, _ = get_logger('keylookup', btconfig.LOG_FOLDER)


class DataTransformSerial(DataTransform):
    # Constants
    DEFAULT_WEIGHT = 1
    default_source = '_id'

    def __init__(self,
                 G,
                 collections,
                 input_types,
                 output_types,
                 skip_on_failure=False,
                 skip_w_regex=None):
        """
        Initialize the keylookup object and precompute paths from the
Example #29
 def setup_log(self):
     self.logger, self.logfile = get_logger("dump_%s" % self.src_name)