Example #1
def handle_pairs(type, subject_labels, subject_data, subject_ids, other_ids,
                 threshold, buckets_number, es, dry_run, workers_production,
                 workers_score, workers_write, queue_production_score,
                 queue_score_result, queue_write, index, doc):

    # do some initial setup
    vectorizer = DictVectorizer(sparse=True)
    tfidf_transformer = LocalTfidfTransformer(smooth_idf=False)
    data_vector = vectorizer.fit_transform(
        [subject_data[i] for i in subject_ids])
    data_vector = data_vector > 0
    data_vector = data_vector.astype(int)
    transformed_data = tfidf_transformer.fit_transform(data_vector)
    sums_vector = np.squeeze(np.asarray(
        transformed_data.sum(1)).ravel())  # sum by row
    # put the vectors into buckets
    buckets = {i: [] for i in range(buckets_number)}
    vector_hashes = {}
    for i in range(len(subject_ids)):
        vector = transformed_data[i].toarray()[0]
        digested = digest_in_buckets(vector, buckets_number)
        for bucket in digested:
            buckets[bucket].append(i)
        vector_hashes[i] = digested

    idf = dict(zip(vectorizer.feature_names_, list(tfidf_transformer.idf_)))
    idf_ = 1 - tfidf_transformer.idf_

    # everything that can be baked into the function arguments is now computed

    produce_pairs_local_init_baked = functools.partial(
        produce_pairs_local_init, vector_hashes, buckets, threshold,
        sums_vector, data_vector)

    calculate_pairs_local_init_baked = functools.partial(
        calculate_pairs_local_init, type, subject_labels, subject_ids,
        other_ids, threshold, idf, idf_)

    # create the stage that produces the disease-to-disease pairs
    pipeline_stage = pr.flat_map(produce_pairs,
                                 range(len(subject_ids)),
                                 workers=workers_production,
                                 maxsize=queue_production_score,
                                 on_start=produce_pairs_local_init_baked)

    # create the stage that scores each disease-to-disease pair
    pipeline_stage = pr.map(calculate_pair,
                            pipeline_stage,
                            workers=workers_score,
                            maxsize=queue_score_result,
                            on_start=calculate_pairs_local_init_baked)

    # store in Elasticsearch
    # this could be multi-process, but use a single process for now
    store_in_elasticsearch(pipeline_stage, es, dry_run, workers_write,
                           queue_write, index, doc)
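
Example #1 depends on a digest_in_buckets helper that is not shown here. Below is a minimal sketch of what such a function could look like, assuming it maps each non-zero component of a vector to a bucket index so that vectors sharing a feature land in at least one common bucket; this is one plausible reading, not the project's actual implementation:

def digest_in_buckets(vector, buckets_number):
    # hypothetical sketch: one bucket id per non-zero component
    digested = set()
    for index, value in enumerate(vector):
        if value != 0:
            digested.add(index % buckets_number)
    return digested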
Example #2
def test_flat_map_square_workers(nums):
    def _generator(x):
        yield x
        yield x + 1
        yield x + 2

    nums_py = map(lambda x: x**2, nums)
    nums_py = cz.mapcat(_generator, nums_py)
    nums_py = list(nums_py)

    nums_pl = pr.map(lambda x: x**2, nums)
    nums_pl = pr.flat_map(_generator, nums_pl, workers=3)
    nums_pl = list(nums_pl)

    assert sorted(nums_pl) == sorted(nums_py)
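
With workers=3 the flat_map stage runs across several workers, so output order is not guaranteed; that is why the assertion compares sorted lists rather than the raw sequences. A self-contained version of the same pipeline, assuming pr is pypeln's process module as used throughout these examples:

from pypeln import process as pr

def _generator(x):
    yield x
    yield x + 1
    yield x + 2

if __name__ == "__main__":
    stage = pr.map(lambda x: x ** 2, range(10))
    stage = pr.flat_map(_generator, stage, workers=3)
    print(sorted(stage))  # order across workers is non-deterministic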
Example #3
def test_flat_map_square(nums):
    def _generator(x):
        yield x
        yield x + 1
        yield x + 2

    nums_py = map(lambda x: x**2, nums)
    nums_py = cz.mapcat(_generator, nums_py)
    nums_py = list(nums_py)

    nums_pl = pr.map(lambda x: x**2, nums)
    nums_pl = pr.flat_map(_generator, nums_pl)
    nums_pl = list(nums_pl)

    assert nums_pl == nums_py
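
Here no workers argument is given, so the stage runs with a single worker, input order is preserved, and the test can assert exact equality. The cz.mapcat reference pipeline is equivalent to mapping and chaining plain generators:

from itertools import chain

def _generator(x):
    yield x
    yield x + 1
    yield x + 2

nums_py = list(chain.from_iterable(map(_generator, (x ** 2 for x in range(5)))))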
Example #4
def test_flat_map_square_filter_workers_pipe(nums):
    def _generator(x):
        yield x
        yield x + 1
        yield x + 2

    nums_py = map(lambda x: x**2, nums)
    nums_py = cz.mapcat(_generator, nums_py)
    nums_py = cz.filter(lambda x: x > 1, nums_py)
    nums_py = list(nums_py)

    nums_pl = (nums
               | pr.map(lambda x: x**2)
               | pr.flat_map(_generator, workers=3)
               | pr.filter(lambda x: x > 1)
               | list)

    assert sorted(nums_pl) == sorted(nums_py)
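
Example #4 uses pypeln's pipe syntax: calling a stage function without the input iterable returns a partial stage that composes with |, and piping into list materialises the results. The same pipeline, reusing nums and _generator from the test above, in the plain call style of Examples #2 and #3:

stage = pr.map(lambda x: x ** 2, nums)
stage = pr.flat_map(_generator, stage, workers=3)
stage = pr.filter(lambda x: x > 1, stage)
nums_pl = list(stage)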
Example #5
    def process_all(self, scoring_weights, is_direct_do_not_propagate,
                    datasources_to_datatypes, dry_run, num_workers_produce,
                    num_workers_score, max_queued_produce_to_score):

        lookup_data = LookUpDataRetriever(
            self.es,
            self.r_server,
            targets=[],
            data_types=(LookUpDataType.DISEASE, LookUpDataType.TARGET,
                        LookUpDataType.ECO, LookUpDataType.HPA)).lookup

        targets = list(self.es_query.get_all_target_ids_with_evidence_data())

        # set up Elasticsearch
        if not dry_run:
            self.es_loader.create_new_index(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            self.es_loader.prepare_for_bulk_indexing(
                self.es_loader.get_versioned_index(
                    Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME))

        self.logger.info('setting up stages')

        # bake the arguments for the setup into function objects
        produce_evidence_local_init_baked = functools.partial(
            produce_evidence_local_init, self.es_hosts, scoring_weights,
            is_direct_do_not_propagate, datasources_to_datatypes)
        score_producer_local_init_baked = functools.partial(
            score_producer_local_init, self.es_hosts, self.redis_host,
            self.redis_port, lookup_data, datasources_to_datatypes, dry_run)

        # this doesn't need to be in the external config, since it's so
        # content-light as to be meaningless
        max_queued_score_out = 10000

        # pipeline stage for building the lists of target/disease pairs and evidence
        pipeline_stage = pr.flat_map(
            produce_evidence,
            targets,
            workers=num_workers_produce,
            maxsize=max_queued_produce_to_score,
            on_start=produce_evidence_local_init_baked,
            on_done=produce_evidence_local_shutdown)

        # pipeline stage for scoring the evidence sets
        # includes writing to Elasticsearch
        pipeline_stage = pr.each(score_producer,
                                 pipeline_stage,
                                 workers=num_workers_score,
                                 maxsize=max_queued_score_out,
                                 on_start=score_producer_local_init_baked,
                                 on_done=score_producer_local_shutdown)

        # consume the end of the pipeline to make sure everything is finished
        self.logger.info('stages created, running scoring and writing')
        pr.run(pipeline_stage)
        self.logger.info('scoring and writing finished')

        # clean up Elasticsearch
        if not dry_run:
            self.logger.info('flushing data to index')
            self.es_loader.flush_all_and_wait(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
            # restore the old pre-load settings
            # note: this automatically applies to all prepared indexes
            self.es_loader.restore_after_bulk_indexing()
            self.logger.info('flushed data to index')

        self.logger.info("DONE")
Example #6
    def process_all(self, dry_run):

        # do not pass this es object to other processes; single process only!
        es = new_es_client(self.es_hosts)

        targets = self.get_targets(es)

        self.logger.info('setting up stages')

        # bake the arguments for the setup into function objects
        produce_evidence_local_init_baked = functools.partial(
            produce_evidence_local_init, self.es_hosts,
            self.es_index_val_right, self.scoring_weights,
            self.is_direct_do_not_propagate, self.datasources_to_datatypes)
        score_producer_local_init_baked = functools.partial(
            score_producer_local_init, self.datasources_to_datatypes,
            dry_run, self.es_hosts, self.es_index_gene, self.es_index_eco,
            self.es_index_hpa, self.es_index_efo)
        
        # pipeline stage for building the lists of target/disease pairs and evidence
        pipeline_stage1 = pr.flat_map(produce_evidence, targets, 
            workers=self.workers_production,
            maxsize=self.queue_produce,
            on_start=produce_evidence_local_init_baked)

        # pipeline stage for scoring the evidence sets
        # includes writing to Elasticsearch
        pipeline_stage2 = pr.map(score_producer, pipeline_stage1, 
            workers=self.workers_score,
            maxsize=self.queue_score,
            on_start=score_producer_local_init_baked)

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
            # load into Elasticsearch
            self.logger.info('stages created, running scoring and writing')
            client = es
            chunk_size = 1000  # TODO: make configurable
            actions = self.elasticsearch_actions(pipeline_stage2, 
                self.es_index, self.es_doc)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    self.logger.debug("Using parallel bulk writer for Elasticearch")
                    results = elasticsearch.helpers.parallel_bulk(client, actions,
                            thread_count=self.workers_write,
                            queue_size=self.queue_write, 
                            chunk_size=chunk_size)
                else:
                    self.logger.debug("Using streaming bulk writer for Elasticearch")
                    results = elasticsearch.helpers.streaming_bulk(client, actions,
                            chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" % failcount)

        self.logger.info("DONE")