def process_all(self, dry_run,
        ddr_workers_production, ddr_workers_score,
        ddr_queue_production_score, ddr_queue_score_result):
    start_time = time.time()

    target_data, disease_data = self.es_query.get_disease_to_targets_vectors()

    self.logger.info('Retrieved all the associations data in %i s' % (time.time() - start_time))
    self.logger.info('target data length: %s size in memory: %f Kb' %
                     (len(target_data), sys.getsizeof(target_data) / 1024.))
    self.logger.info('disease data length: %s size in memory: %f Kb' %
                     (len(disease_data), sys.getsizeof(disease_data) / 1024.))

    '''sort the lists and keep using always the same order in all the steps'''
    disease_keys = sorted(disease_data.keys())
    target_keys = sorted(target_data.keys())

    self.logger.info('getting disease labels')
    disease_id_to_label = self.es_query.get_disease_labels(disease_keys)
    disease_labels = [disease_id_to_label[hit_id] for hit_id in disease_keys]

    self.logger.info('getting target labels')
    target_id_to_label = self.es_query.get_target_labels(target_keys)
    target_labels = [target_id_to_label[hit_id] for hit_id in target_keys]

    #setup elasticsearch
    self.loader = Loader(self.es, dry_run=dry_run)
    if not dry_run:
        #need to directly get the versioned index name for this function
        self.loader.create_new_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
        self.loader.prepare_for_bulk_indexing(
            self.loader.get_versioned_index(Const.ELASTICSEARCH_RELATION_INDEX_NAME))

    #calculate and store disease-to-disease in multiple processes
    self.logger.info('handling disease-to-disease')
    handle_pairs(RelationType.SHARED_TARGET, disease_labels, disease_data,
        disease_keys, target_keys, 0.19, 1024, self.loader, dry_run,
        ddr_workers_production, ddr_workers_score,
        ddr_queue_production_score, ddr_queue_score_result)
    self.logger.info('handled disease-to-disease')

    #calculate and store target-to-target in multiple processes
    self.logger.info('handling target-to-target')
    handle_pairs(RelationType.SHARED_DISEASE, target_labels, target_data,
        target_keys, disease_keys, 0.19, 1024, self.loader, dry_run,
        ddr_workers_production, ddr_workers_score,
        ddr_queue_production_score, ddr_queue_score_result)
    self.logger.info('handled target-to-target')

    #cleanup elasticsearch
    if not dry_run:
        self.loader.flush_all_and_wait(Const.ELASTICSEARCH_RELATION_INDEX_NAME)
        #restore old pre-load settings
        #note this automatically does all prepared indexes
        self.loader.restore_after_bulk_indexing()
def __init__(self, redis_host, redis_port, es_hosts):
    self.logger = logging.getLogger(__name__)

    self.es_hosts = es_hosts
    self.es = new_es_client(self.es_hosts)
    self.es_loader = Loader(self.es)
    self.es_query = ESQuery(self.es)

    self.redis_host = redis_host
    self.redis_port = redis_port
    self.r_server = new_redis_client(self.redis_host, self.redis_port)
def get_all_evidence_for_datatype(self, datatype, fields=None):
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-multi-get.html
    index_name = Loader.get_versioned_index(Const.ELASTICSEARCH_DATA_INDEX_NAME, True)
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match": {
                    "type": datatype
                }
            },
            '_source': self._get_source_from_fields(fields),
            'size': 1000,
        },
        scroll='12h',
        index=index_name,
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def get_validated_evidence_strings(self, size=1000, datasources=[], is_valid=True):
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-multi-get.html
    index_name = Loader.get_versioned_index(
        Const.ELASTICSEARCH_VALIDATED_DATA_INDEX_NAME + '*', True)

    doc_type = None
    if datasources:
        doc_type = datasources

    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_phrase": {
                    "is_valid": is_valid
                }
            },
            '_source': True,
            'size': size,
        },
        scroll='12h',
        doc_type=doc_type,
        index=index_name,
        timeout="20m",
    )
    for hit in res:
        yield hit['_source']
def get_associations_for_disease(self, disease, fields=None, size=100, get_top_hits=True):
    source = self._get_source_from_fields(fields)

    aggs = addict.Dict()
    if get_top_hits:
        aggs.direct_associations.filter.term.is_direct = True
        aggs.direct_associations.aggs.top_direct_ass.top_hits.sort['harmonic-sum.overall'].order = 'desc'
        aggs.direct_associations.aggs.top_direct_ass.top_hits._source = source
        aggs.direct_associations.aggs.top_direct_ass.top_hits.size = size

    q = addict.Dict()
    q.query.constant_score.filter.terms['disease.id'] = [disease]
    q.sort['harmonic-sum.overall'].order = 'desc'
    q._source = source
    q.aggs = aggs
    q.size = size

    res = self.handler.search(
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME, True),
        doc_type=Const.ELASTICSEARCH_DATA_ASSOCIATION_DOC_NAME,
        body=q.to_dict())
    return AssociationSummary(res)
def setup_writers(dry_run, es_hosts, output_folder):
    global_init = None
    local_init = None
    main = None
    local_shutdown = None
    global_shutdown = None

    if dry_run:
        main = dry_run_main
    elif es_hosts:
        #have to bake the loader object in so that the prepare for bulk indexing works
        es_loader = Loader(new_es_client(es_hosts))
        #use partial to "bake" arguments into the functions we return
        global_init = functools.partial(elasticsearch_global_init, es_loader)
        local_init = functools.partial(elasticsearch_local_init, es_hosts)
        main = elasticsearch_main
        local_shutdown = elasticsearch_local_shutdown
        global_shutdown = functools.partial(elasticsearch_global_shutdown, es_loader)
    elif output_folder:
        #use partial to "bake" arguments into the functions we return
        global_init = functools.partial(file_global_init, output_folder)
        local_init = functools.partial(file_local_init, output_folder)
        main = file_main
        local_shutdown = file_local_shutdown
    else:
        raise ValueError("Must specify one of dry_run, es_hosts, output_folder")

    return global_init, local_init, main, local_shutdown, global_shutdown
def get_disease_to_targets_vectors(self, treshold=0.1, evidence_count=3):
    '''
    Get all the association objects that are:
    - direct -> to avoid ontology inflation
    - evidence count >= `evidence_count` -> remove noise
    - overall score >= `treshold` -> remove very low quality noise
    :param treshold: minimum overall score threshold to consider for fetching association data
    :param evidence_count: minimum number of evidence items to consider for fetching association data
    :return: two dictionaries mapping target to disease and the reverse
    '''
    self.logger.debug('scan es to get all diseases and targets')

    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "term": {
                    "is_direct": True,
                }
            },
            '_source': {
                'includes': [
                    "target.id", 'disease.id', 'harmonic-sum', 'evidence_count'
                ]
            },
            'size': 1000,
        },
        scroll='12h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME, True),
        timeout="10m",
    )

    target_results = dict()
    disease_results = dict()

    self.logger.debug('start getting all targets and diseases from es')
    c = 0
    for hit in res:
        c += 1
        hit = hit['_source']
        if hit['evidence_count']['total'] >= evidence_count and \
                hit['harmonic-sum']['overall'] >= treshold:
            '''store target associations'''
            if hit['target']['id'] not in target_results:
                target_results[hit['target']['id']] = SparseFloatDict()
            #TODO: return all counts and scores up to datasource level
            target_results[hit['target']['id']][hit['disease']['id']] = \
                hit['harmonic-sum']['overall']

            '''store disease associations'''
            if hit['disease']['id'] not in disease_results:
                disease_results[hit['disease']['id']] = SparseFloatDict()
            #TODO: return all counts and scores up to datasource level
            disease_results[hit['disease']['id']][hit['target']['id']] = \
                hit['harmonic-sum']['overall']

        if c % 10000 == 0:
            self.logger.debug('%d elements retrieved', c)

    return target_results, disease_results
def score_producer_local_init(es_hosts, redis_host, redis_port,
        lookup_data, datasources_to_datatypes, dry_run):
    #set the R server to lookup into
    r_server = new_redis_client(redis_host, redis_port)
    scorer = Scorer()
    loader = Loader(new_es_client(es_hosts))

    return scorer, loader, r_server, lookup_data, datasources_to_datatypes, dry_run
def get_evidence_for_target_simple(self, target, expected=None):
    query_body = {
        "query": {
            "constant_score": {
                "filter": {
                    "term": {
                        "target.id": target
                    }
                }
            }
        },
        '_source': {
            "includes": [
                "target.id",
                "private.efo_codes",
                "disease.id",
                "scores.association_score",
                "sourceID",
                "id",
            ]
        },
    }

    if expected is not None and expected < 10000:
        query_body['size'] = 10000
        res = self.handler.search(
            index=Loader.get_versioned_index(Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
            body=query_body)
        for hit in res['hits']['hits']:
            yield hit['_source']
    else:
        res = helpers.scan(
            client=self.handler,
            query=query_body,
            scroll='1h',
            index=Loader.get_versioned_index(Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
            timeout="1h",
            request_timeout=2 * 60 * 60,
            size=1000)
        for hit in res:
            yield hit['_source']
def count_elements_in_index(self, index_name, doc_type=None, query=None):
    if query is None:
        query = {"match_all": {}}

    res = self.handler.search(
        index=Loader.get_versioned_index(index_name, True),
        doc_type=doc_type,
        body={
            "query": query,
            '_source': False,
            'size': 0,
        })
    return res['hits']['total']
def get_reaction(self, reaction_id):
    res = self.handler.search(
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_REACTOME_INDEX_NAME, True),
        doc_type=Const.ELASTICSEARCH_REACTOME_REACTION_DOC_NAME,
        body={
            "query": {
                "ids": {
                    "values": [reaction_id]
                }
            },
            '_source': True,
            'size': 1,
        })
    for hit in res['hits']['hits']:
        return hit['_source']
def get_all_associations(self):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': True,
            'size': 1000,
        },
        scroll='1h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME, True),
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def count_evidence_for_target(self, target):
    res = self.handler.search(
        index=Loader.get_versioned_index(Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
        body={
            "query": {
                "constant_score": {
                    "filter": {
                        "term": {
                            "target.id": target
                        }
                    }
                }
            },
            '_source': [],
            'size': 0
        })
    return res['hits']['total']
def get_all_ensembl_genes(self):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': True,
            'size': 1000,
        },
        scroll='1h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME, True),
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def get_all_uniprot_entries(self):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': True,
            'size': 100,
        },
        scroll='12h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_UNIPROT_INDEX_NAME, True),
        timeout="10m",
    )
    for hit in res:
        yield jsonpickle.decode(base64.b64decode(hit['_source']['entry']))
def get_all_reactions(self):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': True,
            'size': 1000,
        },
        scroll='1h',
        doc_type=Const.ELASTICSEARCH_REACTOME_REACTION_DOC_NAME,
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_REACTOME_INDEX_NAME, True),
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def get_all_diseases(self, fields=None):
    source = self._get_source_from_fields(fields)
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': source,
            'size': 1000,
        },
        scroll='12h',
        doc_type=Const.ELASTICSEARCH_EFO_LABEL_DOC_NAME,
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_EFO_LABEL_INDEX_NAME, True),
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def get_all_target_ids_with_evidence_data(self):
    #TODO: use an aggregation to get those with just data
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': False,
            'size': 100,
        },
        scroll='12h',
        doc_type=Const.ELASTICSEARCH_GENE_NAME_DOC_NAME,
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_GENE_NAME_INDEX_NAME, True),
        timeout="30m",
    )
    for target in res:
        yield target['_id']
def get_disease_labels(self, ids):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "ids": {
                    "values": ids,
                }
            },
            '_source': 'label',
            'size': 1,
        },
        scroll='12h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_EFO_LABEL_INDEX_NAME, True),
        timeout="10m",
    )
    return dict((hit['_id'], hit['_source']['label']) for hit in res)
def get_all_target_disease_pair_from_evidence(self, only_direct=False):
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': self._get_source_from_fields([
                'target.id',
                'disease.id',
                'private.efo_codes',
                'scores.association_score'
            ]),
            'size': 1000,
        },
        scroll='6h',
        index=Loader.get_versioned_index(
            Const.ELASTICSEARCH_DATA_INDEX_NAME, True),
        timeout="1h",
        request_timeout=2 * 60 * 60,
    )

    yielded_pairs = set()
    for hit in res:
        if hit['_source']['scores']['association_score'] > 0:
            if only_direct:
                pair = '-'.join([
                    hit['_source']['target']['id'],
                    hit['_source']['disease']['id']
                ])
                if pair not in yielded_pairs:
                    yield pair
                    yielded_pairs.add(pair)
            else:
                for efo_id in hit['_source']['private']['efo_codes']:
                    pair = '-'.join([hit['_source']['target']['id'], efo_id])
                    if pair not in yielded_pairs:
                        yield pair
                        yielded_pairs.add(pair)
def get_all_evidence(self, fields=None):
    index_name = Loader.get_versioned_index(Const.ELASTICSEARCH_DATA_INDEX_NAME, True)
    res = helpers.scan(
        client=self.handler,
        query={
            "query": {
                "match_all": {}
            },
            '_source': self._get_source_from_fields(fields),
            'size': 1000,
        },
        scroll='12h',
        index=index_name,
        timeout="10m",
    )
    for hit in res:
        yield hit['_source']
def elasticsearch_local_init(es_hosts):
    #note the trailing comma: the local init returns a single-element tuple of resources
    return Loader(new_es_client(es_hosts)),
def main():
    #parse config file, environment, and command line arguments
    mrtarget.cfg.setup_ops_parser()
    args = mrtarget.cfg.get_ops_args()

    #set up logging
    logger = None
    if args.log_config:
        if os.path.isfile(args.log_config) and os.access(args.log_config, os.R_OK):
            #read a log configuration file
            logging.config.fileConfig(args.log_config, disable_existing_loggers=False)
            logger = logging.getLogger(__name__ + ".main()")
        else:
            #unable to read the logging config file, abort
            logging.basicConfig()
            logger = logging.getLogger(__name__ + ".main()")
            logger.error("unable to read file {}".format(args.log_config))
            return 1
    else:
        #no logging config specified, fall back to default
        logging.basicConfig()
        logger = logging.getLogger(__name__ + ".main()")

    if not args.release_tag:
        logger.error('A [release-tag] has to be specified.')
        print('A [release-tag] has to be specified.', file=sys.stderr)
        return 1
    else:
        Config.RELEASE_VERSION = args.release_tag
        logger.info('setting release version %s' % Config.RELEASE_VERSION)

    with RedisManager(args.redis_remote, args.redis_host, args.redis_port):

        es = new_es_client(args.elasticseach_nodes)
        redis = new_redis_client(args.redis_host, args.redis_port)

        #create a single query object for future use
        esquery = ESQuery(es)

        #read the data configuration
        data_config = mrtarget.cfg.get_data_config(args.data_config)

        #create something to accumulate qc metrics into over various steps
        qc_metrics = QCMetrics()

        with Loader(es,
                chunk_size=ElasticSearchConfiguration.bulk_load_chunk,
                dry_run=args.dry_run) as loader:

            if args.rea:
                process = ReactomeProcess(loader,
                    data_config.reactome_pathway_data,
                    data_config.reactome_pathway_relation)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.ens:
                process = EnsemblProcess(loader)
                if not args.qc_only:
                    process.process(data_config.ensembl_filename, args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.unic:
                process = UniprotDownloader(loader)
                if not args.qc_only:
                    process.process(data_config.uniprot_uri, args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.hpa:
                process = HPAProcess(loader, redis, args.elasticseach_nodes,
                    data_config.tissue_translation_map,
                    data_config.tissue_curation_map,
                    data_config.hpa_normal_tissue,
                    data_config.hpa_rna_level,
                    data_config.hpa_rna_value,
                    data_config.hpa_rna_zscore)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.gen:
                process = GeneManager(loader, redis,
                    args.gen_plugin_places,
                    data_config.gene_data_plugin_names)
                if not args.qc_only:
                    process.merge_all(data_config, dry_run=args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.efo:
                process = EfoProcess(loader,
                    data_config.ontology_efo,
                    data_config.ontology_hpo,
                    data_config.ontology_mp,
                    data_config.disease_phenotype)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.eco:
                process = EcoProcess(loader,
                    data_config.ontology_eco,
                    data_config.ontology_so)
                if not args.qc_only:
                    process.process_all(args.dry_run)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.val:
                es_output_folder = None
                if "elasticsearch_folder" in vars(args) and args.elasticsearch_folder is not None:
                    es_output_folder = args.elasticsearch_folder

                process_evidences_pipeline(
                    filenames=data_config.input_file,
                    first_n=args.val_first_n,
                    es_client=es,
                    redis_client=redis,
                    dry_run=args.dry_run,
                    output_folder=es_output_folder,
                    num_workers=args.val_workers_validator,
                    num_writers=args.val_workers_writer,
                    max_queued_events=args.val_queue_validator_writer,
                    eco_scores_uri=data_config.eco_scores,
                    schema_uri=data_config.schema,
                    es_hosts=args.elasticseach_nodes,
                    excluded_biotypes=data_config.excluded_biotypes,
                    datasources_to_datatypes=data_config.datasources_to_datatypes)

                #TODO qc

            if args.assoc:
                process = ScoringProcess(args.redis_host, args.redis_port,
                    args.elasticseach_nodes)
                if not args.qc_only:
                    process.process_all(data_config.scoring_weights,
                        data_config.is_direct_do_not_propagate,
                        data_config.datasources_to_datatypes,
                        args.dry_run,
                        args.as_workers_production,
                        args.as_workers_score,
                        args.as_queue_production_score)
                if not args.skip_qc:
                    qc_metrics.update(process.qc(esquery))

            if args.ddr:
                process = DataDrivenRelationProcess(es)
                if not args.qc_only:
                    process.process_all(args.dry_run,
                        args.ddr_workers_production,
                        args.ddr_workers_score,
                        args.ddr_queue_production_score,
                        args.ddr_queue_score_result)
                #TODO qc

            if args.sea:
                process = SearchObjectProcess(loader, redis)
                if not args.qc_only:
                    process.process_all(
                        data_config.chembl_target,
                        data_config.chembl_mechanism,
                        data_config.chembl_component,
                        data_config.chembl_protein,
                        data_config.chembl_molecule_set_uri_pattern,
                        args.dry_run)
                #TODO qc

            if args.metric:
                process = Metrics(es, args.metric_file,
                    data_config.datasources_to_datatypes).generate_metrics()

            if args.qc_in:
                #handle reading in previous qc from filename provided, and adding comparative metrics
                qc_metrics.compare_with(args.qc_in)

            if args.qc_out:
                #handle writing out to a tsv file
                qc_metrics.write_out(args.qc_out)

    logger.info('`' + " ".join(sys.argv) + '` - finished')
    return 0
def exists(self, index, doc_type, id, realtime=False):
    return self.handler.exists(
        index=Loader.get_versioned_index(index, True),
        doc_type=doc_type,
        id=id,
        realtime=realtime)
def delete_data(self, index, query, doc_type='', chunk_size=1000, altered_keys=()):
    '''
    Delete all the documents in an index matching a given query
    :param index: index to use
    :param query: query matching the elements to remove
    :param doc_type: document types, default is to look for all the doc types
    :param chunk_size: size of the bulk action sent to delete
    :param altered_keys: list of fields to fetch data and return as being altered by the delete query
    :return: dict of keys altered by the query
    '''

    '''count available data'''
    res = self.handler.search(
        index=Loader.get_versioned_index(index, True),
        body={
            "query": query,
            '_source': False,
            'size': 0,
        },
        doc_type=doc_type,
    )
    total = res['hits']['total']

    '''if data is matching query, delete it with scan and bulk'''
    altered = dict()
    for key in altered_keys:
        altered[key] = set()

    if total:
        batch = []
        for hit in helpers.scan(
                client=self.handler,
                query={
                    "query": query,
                    '_source': self._get_source_from_fields(altered_keys),
                    'size': chunk_size,
                },
                scroll='1h',
                index=Loader.get_versioned_index(index, True),
                doc_type=doc_type,
                timeout='1h',
        ):
            action = {
                '_op_type': 'delete',
                '_index': hit['_index'],
                '_type': hit['_type'],
                '_id': hit['_id'],
            }
            batch.append(action)

            flat_source = self.flatten(hit['_source'])
            for key in altered_keys:
                if key in flat_source:
                    altered[key].add(flat_source[key])

            if len(batch) >= chunk_size:
                self._flush_bulk(batch)
                batch = []

        #flush whatever is left in the final, partial batch
        self._flush_bulk(batch)

        '''flush changes'''
        self.handler.indices.flush(
            Loader.get_versioned_index(index, True),
            wait_if_ongoing=True)

    return altered
def get_objects_by_id(self, ids, index, doc_type, source=True,
        source_exclude=[], realtime=False):
    '''
    :param ids: list of identifiers for documents
    :param index: index for all the documents
    :param doc_type: doc type for all the documents
    :return: generator of documents
    '''
    if isinstance(ids, (list, tuple)):
        res = self.handler.mget(
            index=Loader.get_versioned_index(index, True),
            doc_type=doc_type,
            body=dict(ids=ids),
            _source=source,
            _source_exclude=source_exclude,
            realtime=True,
        )
        if not res:
            #retry once after a short pause
            time.sleep(0.1)
            res = self.handler.mget(
                index=Loader.get_versioned_index(index, True),
                doc_type=doc_type,
                body=dict(ids=ids),
                _source=source,
                _source_exclude=source_exclude,
                realtime=True,
            )
        for doc in res['docs']:
            if doc['found']:
                yield doc['_source']
            else:
                raise KeyError('object with id %s not found' % (doc['_id']))
    else:
        try:
            res = self.handler.get(
                index=Loader.get_versioned_index(index, True),
                doc_type=doc_type,
                id=ids,
                _source=source,
                _source_exclude=source_exclude,
                realtime=True,
            )
            try:
                yield res['_source']
            except Exception as e:
                self.logger.exception('cannot retrieve single object by id %s ' % ids)
                raise KeyError('object with id %s not found' % ids)
        except TransportError as te:
            if te.status_code == 404:
                raise KeyError('object with id %s not found' % ids)
class ScoringProcess():

    def __init__(self, redis_host, redis_port, es_hosts):
        self.logger = logging.getLogger(__name__)

        self.es_hosts = es_hosts
        self.es = new_es_client(self.es_hosts)
        self.es_loader = Loader(self.es)
        self.es_query = ESQuery(self.es)

        self.redis_host = redis_host
        self.redis_port = redis_port
        self.r_server = new_redis_client(self.redis_host, self.redis_port)

    def process_all(self, scoring_weights, is_direct_do_not_propagate,
            datasources_to_datatypes, dry_run,
            num_workers_produce, num_workers_score,
            max_queued_produce_to_score):

        lookup_data = LookUpDataRetriever(
            self.es, self.r_server,
            targets=[],
            data_types=(
                LookUpDataType.DISEASE,
                LookUpDataType.TARGET,
                LookUpDataType.ECO,
                LookUpDataType.HPA)).lookup

        targets = list(self.es_query.get_all_target_ids_with_evidence_data())

        #setup elasticsearch
        if not dry_run:
            self.es_loader.create_new_index(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            self.es_loader.prepare_for_bulk_indexing(
                self.es_loader.get_versioned_index(
                    Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME))

        self.logger.info('setting up stages')

        #bake the arguments for the setup into function objects
        produce_evidence_local_init_baked = functools.partial(
            produce_evidence_local_init,
            self.es_hosts, scoring_weights,
            is_direct_do_not_propagate, datasources_to_datatypes)
        score_producer_local_init_baked = functools.partial(
            score_producer_local_init,
            self.es_hosts, self.redis_host, self.redis_port,
            lookup_data, datasources_to_datatypes, dry_run)

        #this doesn't need to be in the external config, since it is so content light
        #as to be meaningless
        max_queued_score_out = 10000

        #pipeline stage for making the lists of the target/disease pairs and evidence
        pipeline_stage = pr.flat_map(
            produce_evidence, targets,
            workers=num_workers_produce,
            maxsize=max_queued_produce_to_score,
            on_start=produce_evidence_local_init_baked,
            on_done=produce_evidence_local_shutdown)

        #pipeline stage for scoring the evidence sets
        #includes writing to elasticsearch
        pipeline_stage = pr.each(
            score_producer, pipeline_stage,
            workers=num_workers_score,
            maxsize=max_queued_score_out,
            on_start=score_producer_local_init_baked,
            on_done=score_producer_local_shutdown)

        #loop over the end of the pipeline to make sure everything is finished
        self.logger.info('stages created, running scoring and writing')
        pr.run(pipeline_stage)
        self.logger.info('stages created, ran scoring and writing')

        #cleanup elasticsearch
        if not dry_run:
            self.logger.info('flushing data to index')
            self.es_loader.flush_all_and_wait(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            #restore old pre-load settings
            #note this automatically does all prepared indexes
            self.es_loader.restore_after_bulk_indexing()
            self.logger.info('flushed data to index')

        self.logger.info("DONE")

    def qc(self, esquery):
        """
        Run a series of QC tests on the association Elasticsearch index.
        Returns a dictionary of string test names and result objects.
        """
        #number of association entries
        association_count = 0
        #Note: try to avoid doing this more than once!
        for association in esquery.get_all_associations():
            association_count += 1
            if association_count % 1000 == 0:
                self.logger.debug("checking %d", association_count)

        #put the metrics into a single dict
        metrics = dict()
        metrics["association.count"] = association_count

        return metrics