def handle_pairs(type, subject_labels, subject_data, subject_ids, other_ids,
        threshold, buckets_number, es, dry_run,
        workers_production, workers_score, workers_write,
        queue_production_score, queue_score_result, queue_write,
        index, doc):
    # do some initial setup
    vectorizer = DictVectorizer(sparse=True)
    tdidf_transformer = LocalTfidfTransformer(smooth_idf=False)
    data_vector = vectorizer.fit_transform(
        [subject_data[i] for i in subject_ids])
    # binarize the counts so only the presence of a feature matters
    data_vector = data_vector > 0
    data_vector = data_vector.astype(int)
    transformed_data = tdidf_transformer.fit_transform(data_vector)
    sums_vector = np.squeeze(np.asarray(
        transformed_data.sum(1)).ravel())  # sum by row

    # put vectors in buckets
    buckets = {}
    for i in range(buckets_number):
        buckets[i] = []
    vector_hashes = {}
    for i in range(len(subject_ids)):
        vector = transformed_data[i].toarray()[0]
        digested = digest_in_buckets(vector, buckets_number)
        for bucket in digested:
            buckets[bucket].append(i)
        vector_hashes[i] = digested

    idf = dict(zip(vectorizer.feature_names_, list(tdidf_transformer.idf_)))
    idf_ = 1 - tdidf_transformer.idf_

    # now everything is computed that can be baked into the function arguments
    produce_pairs_local_init_baked = functools.partial(
        produce_pairs_local_init,
        vector_hashes, buckets, threshold, sums_vector, data_vector)
    calculate_pairs_local_init_baked = functools.partial(
        calculate_pairs_local_init,
        type, subject_labels, subject_ids, other_ids, threshold, idf, idf_)

    # create the stage that produces candidate disease-to-disease pairs
    pipeline_stage = pr.flat_map(produce_pairs, range(len(subject_ids)),
        workers=workers_production,
        maxsize=queue_production_score,
        on_start=produce_pairs_local_init_baked)

    # create the stage that scores each disease-to-disease pair
    pipeline_stage = pr.map(calculate_pair, pipeline_stage,
        workers=workers_score,
        maxsize=queue_score_result,
        on_start=calculate_pairs_local_init_baked)

    # store in Elasticsearch
    # this could be multi-process, but use a single process for now
    store_in_elasticsearch(pipeline_stage, es, dry_run, workers_write,
        queue_write, index, doc)
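# The bucketing above is what keeps the pair search sub-quadratic: an item is
# only ever compared against items sharing at least one bucket, not against
# every other item. digest_in_buckets is defined elsewhere; the following is
# only a minimal sketch, assuming it hashes each nonzero feature of a vector
# into one of buckets_number buckets, so any two vectors with a feature in
# common collide in at least one bucket and become a candidate pair.
import numpy as np


def digest_in_buckets_sketch(vector, buckets_number):
    # hypothetical stand-in: one bucket per distinct nonzero feature index
    return {hash(int(ix)) % buckets_number for ix in np.flatnonzero(vector)}


# two vectors sharing feature 1 collide in at least one bucket
a = np.array([0.0, 0.7, 0.0, 0.3])
b = np.array([0.2, 0.5, 0.0, 0.0])
print(digest_in_buckets_sketch(a, 16) & digest_in_buckets_sketch(b, 16))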
def test_flat_map_square_workers(nums):
    def _generator(x):
        yield x
        yield x + 1
        yield x + 2

    nums_py = map(lambda x: x**2, nums)
    nums_py = cz.mapcat(_generator, nums_py)
    nums_py = list(nums_py)

    nums_pl = pr.map(lambda x: x**2, nums)
    nums_pl = pr.flat_map(_generator, nums_pl, workers=3)
    nums_pl = list(nums_pl)

    assert sorted(nums_pl) == sorted(nums_py)
def test_flat_map_square(nums):
    def _generator(x):
        yield x
        yield x + 1
        yield x + 2

    nums_py = map(lambda x: x**2, nums)
    nums_py = cz.mapcat(_generator, nums_py)
    nums_py = list(nums_py)

    nums_pl = pr.map(lambda x: x**2, nums)
    nums_pl = pr.flat_map(_generator, nums_pl)
    nums_pl = list(nums_pl)

    assert nums_pl == nums_py
def test_flat_map_square_filter_workers_pipe(nums):
    def _generator(x):
        yield x
        yield x + 1
        yield x + 2

    nums_py = map(lambda x: x**2, nums)
    nums_py = cz.mapcat(_generator, nums_py)
    nums_py = cz.filter(lambda x: x > 1, nums_py)
    nums_py = list(nums_py)

    nums_pl = (
        nums
        | pr.map(lambda x: x**2)
        | pr.flat_map(_generator, workers=3)
        | pr.filter(lambda x: x > 1)
        | list
    )

    assert sorted(nums_pl) == sorted(nums_py)
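# All three tests above contrast a plain-Python baseline (builtin map plus
# cytoolz's mapcat/filter, imported as cz) with the same pipeline built from
# pr stages. A minimal standalone sketch of that contrast, assuming pr is
# pypeln.process: with the default single worker a stage preserves input
# order (so test_flat_map_square can compare lists directly), while with
# workers=3 results interleave nondeterministically, which is why the
# multi-worker tests compare sorted lists.
import pypeln.process as pr  # assumption: the tests' pr alias


def _double_up(x):
    yield x
    yield x + 1


if __name__ == '__main__':
    # single worker: output order matches input order
    print(list(pr.flat_map(_double_up, [1, 2, 3])))
    # several workers: same elements, arbitrary order, so sort before comparing
    print(sorted(pr.flat_map(_double_up, [1, 2, 3], workers=3)))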
def process_all(self, scoring_weights, is_direct_do_not_propagate,
        datasources_to_datatypes, dry_run,
        num_workers_produce, num_workers_score,
        max_queued_produce_to_score):

    lookup_data = LookUpDataRetriever(self.es, self.r_server,
        targets=[],
        data_types=(
            LookUpDataType.DISEASE,
            LookUpDataType.TARGET,
            LookUpDataType.ECO,
            LookUpDataType.HPA)).lookup

    targets = list(self.es_query.get_all_target_ids_with_evidence_data())

    # set up elasticsearch
    if not dry_run:
        self.es_loader.create_new_index(
            Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
        self.es_loader.prepare_for_bulk_indexing(
            self.es_loader.get_versioned_index(
                Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME))

    self.logger.info('setting up stages')

    # bake the arguments for the setup into function objects
    produce_evidence_local_init_baked = functools.partial(
        produce_evidence_local_init,
        self.es_hosts, scoring_weights,
        is_direct_do_not_propagate, datasources_to_datatypes)
    score_producer_local_init_baked = functools.partial(
        score_producer_local_init,
        self.es_hosts, self.redis_host, self.redis_port,
        lookup_data, datasources_to_datatypes, dry_run)

    # this doesn't need to be in the external config, since it's so
    # lightweight as to be meaningless as a tuning knob
    max_queued_score_out = 10000

    # pipeline stage for making the lists of the target/disease pairs and evidence
    pipeline_stage = pr.flat_map(produce_evidence, targets,
        workers=num_workers_produce,
        maxsize=max_queued_produce_to_score,
        on_start=produce_evidence_local_init_baked,
        on_done=produce_evidence_local_shutdown)

    # pipeline stage for scoring the evidence sets
    # includes writing to elasticsearch
    pipeline_stage = pr.each(score_producer, pipeline_stage,
        workers=num_workers_score,
        maxsize=max_queued_score_out,
        on_start=score_producer_local_init_baked,
        on_done=score_producer_local_shutdown)

    # consume the end of the pipeline to make sure everything finishes
    self.logger.info('stages created, running scoring and writing')
    pr.run(pipeline_stage)
    self.logger.info('finished running scoring and writing')

    # clean up elasticsearch
    if not dry_run:
        self.logger.info('flushing data to index')
        self.es_loader.flush_all_and_wait(
            Const.ELASTICSEARCH_DATA_ASSOCIATION_INDEX_NAME)
        # restore old pre-load settings
        # note this automatically does all prepared indexes
        self.es_loader.restore_after_bulk_indexing()
        self.logger.info('flushed data to index')

    self.logger.info("DONE")
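# The functools.partial "baking" above is what lets each worker initialise
# itself without globals: the picklable configuration is bound into the
# partial up front, and pypeln invokes the resulting callable (on_start) once
# per worker, where non-picklable state such as Elasticsearch and Redis
# clients can then be built locally. A minimal sketch of the pattern, with
# illustrative names rather than the real initialisers:
import functools


def local_init_sketch(es_hosts, scoring_weights):
    # runs once inside each worker process; build per-worker clients here
    print("worker init: hosts=%s weights=%s" % (es_hosts, scoring_weights))


# bind the configuration now; the pipeline can later call
# local_init_baked() with no further arguments, once per worker
local_init_baked = functools.partial(
    local_init_sketch, ["127.0.0.1:9200"], {"uniprot": 1.0})
local_init_baked()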
def process_all(self, dry_run):
    # do not pass this es object to other processes, single process only!
    es = new_es_client(self.es_hosts)
    targets = self.get_targets(es)

    self.logger.info('setting up stages')

    # bake the arguments for the setup into function objects
    produce_evidence_local_init_baked = functools.partial(
        produce_evidence_local_init,
        self.es_hosts, self.es_index_val_right,
        self.scoring_weights, self.is_direct_do_not_propagate,
        self.datasources_to_datatypes)
    score_producer_local_init_baked = functools.partial(
        score_producer_local_init,
        self.datasources_to_datatypes, dry_run,
        self.es_hosts, self.es_index_gene, self.es_index_eco,
        self.es_index_hpa, self.es_index_efo)

    # pipeline stage for making the lists of the target/disease pairs and evidence
    pipeline_stage1 = pr.flat_map(produce_evidence, targets,
        workers=self.workers_production,
        maxsize=self.queue_produce,
        on_start=produce_evidence_local_init_baked)

    # pipeline stage for scoring the evidence sets
    # includes writing to elasticsearch
    pipeline_stage2 = pr.map(score_producer, pipeline_stage1,
        workers=self.workers_score,
        maxsize=self.queue_score,
        on_start=score_producer_local_init_baked)

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # load into elasticsearch
        self.logger.info('stages created, running scoring and writing')
        client = es
        chunk_size = 1000  # TODO make configurable
        actions = self.elasticsearch_actions(pipeline_stage2,
            self.es_index, self.es_doc)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                self.logger.debug("Using parallel bulk writer for Elasticsearch")
                results = elasticsearch.helpers.parallel_bulk(client, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                self.logger.debug("Using streaming bulk writer for Elasticsearch")
                results = elasticsearch.helpers.streaming_bulk(client, actions,
                    chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s relations failed to index" % failcount)

    self.logger.info("DONE")
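# self.elasticsearch_actions is defined elsewhere; for orientation, this is a
# hedged sketch of the action shape that elasticsearch.helpers.streaming_bulk
# and parallel_bulk both consume, assuming each scored item arrives as an
# (id, body) pair (the real field names may differ):
def elasticsearch_actions_sketch(items, index, doc):
    for item_id, body in items:
        # one dict per index operation; the helpers batch these into
        # _bulk requests of chunk_size actions each
        yield {
            "_index": index,
            "_type": doc,       # mapping types are deprecated in Elasticsearch 7+
            "_id": item_id,
            "_source": body,    # must be JSON-serializable
        }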