def prepare(self):
    '''Chunk up the article ids from the old elasticsearch index,
    and submit batch jobs over those chunks.'''
    if self.test:
        self.process_batch_size = 1000
        logging.warning("Batch size restricted to "
                        f"{self.process_batch_size}"
                        " while in test mode")
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test,
                             self.drop_and_recreate,
                             dataset='nih',
                             aliases='health_scanner',
                             increment_version=True)

    # Count articles from the old index
    _old_config = es_config.copy()
    _old_config['index'] = es_config['old_index']
    logging.info("Collecting article IDs...")
    _ids = get_es_ids(es, _old_config, size=10000)
    logging.info(f"Collected {len(_ids)} IDs")
    done_ids = get_es_ids(es, es_config, size=10000)

    # Generate the job params
    job_params = []
    batches = split_batches(_ids, self.process_batch_size)
    for count, batch in enumerate(batches, 1):
        # Magical '0.3' is the lower end of the deduplication
        # fraction, found by inspection
        done = sum(_id in done_ids for _id in batch)/len(batch) > 0.3
        # Write batch of ids to s3
        batch_file = ''
        if not done:
            batch_file = put_s3_batch(batch,
                                      self.intermediate_bucket,
                                      self.routine_id)
        params = {
            "batch_file": batch_file,
            "config": 'mysqldb.config',
            "bucket": self.intermediate_bucket,
            "done": done,
            'outinfo': es_config['host'],
            'out_port': es_config['port'],
            'out_index': es_config['index'],
            'in_index': es_config['old_index'],
            'out_type': es_config['type'],
            'aws_auth_region': es_config['region'],
            'entity_type': 'paper',
            'test': self.test,
            'routine_id': self.routine_id
        }
        job_params.append(params)
        if self.test and count > 1:
            logging.warning("Breaking after 2 batches "
                            "while in test mode.")
            logging.warning(job_params)
            break
    logging.info("Batch preparation completed, "
                 f"with {len(job_params)} batches")
    return job_params
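Both prepare() variants hand their ids to split_batches, which is not shown in this section. A minimal sketch of what it is assumed to do, namely chunk any iterable into fixed-size lists; the real implementation may differ:

# A minimal sketch of the split_batches helper assumed above:
# yield successive fixed-size chunks from any iterable.
def split_batches(iterable, batch_size):
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # yield any final, partially filled batch
        yield batch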
def prepare(self):
    '''Chunk up elasticsearch data, and submit batch jobs
    over those chunks.'''
    if self.test:
        self.process_batch_size = 1000
        logging.warning("Batch size restricted to "
                        f"{self.process_batch_size}"
                        " while in test mode")

    # Setup elasticsearch and extract all ids
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test,
                             drop_and_recreate=False,
                             dataset=self.dataset,
                             increment_version=False)
    ids = get_es_ids(es, es_config, size=10000)  # All ids in this index
    ids = ids - self._done_ids  # Don't repeat done ids

    # Override the default index if specified
    es_config['index'] = (self.index if self.index is not None
                          else es_config['index'])

    # Generate the job params
    job_params = []
    batches = split_batches(ids, self.process_batch_size)
    for count, batch in enumerate(batches, 1):
        done = False  # Already taken care of with _done_ids
        # Write batch of ids to s3
        batch_file = ''
        if not done:
            batch_file = put_s3_batch(batch,
                                      self.intermediate_bucket,
                                      self.routine_id)
        params = {
            "batch_file": batch_file,
            "config": self.sql_config_filename,
            "bucket": self.intermediate_bucket,
            "done": done,
            "count": len(ids),
            'outinfo': es_config['host'],
            'out_port': es_config['port'],
            'index': es_config['index'],
            'out_type': es_config['type'],
            'aws_auth_region': es_config['region'],
            'test': self.test,
            'routine_id': self.routine_id,
            'entity_type': self.entity_type,
            **self.kwargs
        }
        job_params.append(params)
        # Test mode
        if self.test and count > 1:
            logging.warning("Breaking after 2 batches "
                            "while in test mode.")
            logging.warning(job_params)
            break
    # Done
    logging.info("Batch preparation completed, "
                 f"with {len(job_params)} batches")
    return job_params
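get_es_ids is used throughout to collect document ids, and its call sites (the set subtraction above, the optional query in done_ids below) suggest it scrolls the index and returns a set of ids. A rough sketch under those assumptions; the signature is inferred from the call sites, not taken from the real implementation:

from elasticsearch.helpers import scan

# Assumed behaviour of get_es_ids: scroll over the configured index
# and collect every document id into a set.
def get_es_ids(es, es_config, size=10000, query=None):
    scanner = scan(es, query=query,
                   index=es_config['index'],
                   doc_type=es_config['type'],
                   size=size, _source=False)
    return {hit['_id'] for hit in scanner}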
def done_ids(self):
    '''Retrieve the ids of documents which have already been
    processed, i.e. those with the output field present.'''
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test,
                             drop_and_recreate=False,
                             dataset=self.dataset,
                             increment_version=False)
    field = "terms_tokens_article"
    ids = get_es_ids(es, es_config, size=10000,
                     query={"query": {"exists": {"field": field}}})
    return ids
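prepare() above reads self._done_ids while this helper is named done_ids, which suggests it is exposed as a property, plausibly cached. Assumed wiring on the same class, not confirmed by the source:

@property
def _done_ids(self):
    # Cache, so repeated accesses in prepare() don't re-scroll
    # the index (assumed wiring, not the confirmed class layout)
    if getattr(self, '_done_ids_cache', None) is None:
        self._done_ids_cache = self.done_ids()
    return self._done_ids_cache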
def subset_keys(es, es_config, keys):
    '''Subset the keys to those covering at least one id
    found in the elasticsearch index.'''
    all_idxs = get_es_ids(es, es_config)
    _keys = set()
    for key in keys:
        if key in _keys:
            continue
        first_idx, last_idx = split_mesh_file_key(key)
        for idx in all_idxs:
            if int(first_idx) <= int(idx) <= int(last_idx):
                _keys.add(key)
                break
    return _keys
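split_mesh_file_key is assumed to recover the first and last id covered by an S3 key. Purely for illustration, if keys encoded their range as, say, 'nih_abstracts_24000--35000.out' (a hypothetical format, not taken from the source), the helper could look like:

# Hypothetical illustration only: parse the id range out of a key
# such as 'nih_abstracts_24000--35000.out' (format assumed).
def split_mesh_file_key(key):
    stem = key.split('_')[-1].split('.')[0]  # e.g. '24000--35000'
    first_idx, last_idx = stem.split('--')
    return first_idx, last_idx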
def run():
    bucket = os.environ["BATCHPAR_s3_bucket"]
    abstract_file = os.environ["BATCHPAR_s3_key"]
    dupe_file = os.environ["BATCHPAR_dupe_file"]
    es_config = literal_eval(os.environ["BATCHPAR_outinfo"])
    db = os.environ["BATCHPAR_db"]
    entity_type = os.environ["BATCHPAR_entity_type"]

    # MySQL setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db)
    Session = sessionmaker(bind=engine)
    session = Session()

    # Retrieve a batch of meshed terms
    mesh_terms = retrieve_mesh_terms(bucket, abstract_file)
    mesh_terms = format_mesh_terms(mesh_terms)
    logging.info(f'Batch {abstract_file} contains '
                 f'{len(mesh_terms)} meshed abstracts')

    # Retrieve the duplicate map
    dupes = retrieve_duplicate_map(bucket, dupe_file)
    dupes = format_duplicate_map(dupes)

    # Set up the elasticsearch connection
    field_null_mapping = load_json_from_pathstub("tier_1/"
                                                 "field_null_mappings/",
                                                 "health_scanner.json")
    es = ElasticsearchPlus(hosts=es_config['host'],
                           port=es_config['port'],
                           aws_auth_region=es_config['region'],
                           use_ssl=True,
                           entity_type=entity_type,
                           strans_kwargs=None,
                           field_null_mapping=field_null_mapping,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           country_detection=True,
                           listify_terms=True)
    all_es_ids = get_es_ids(es, es_config)

    # Build the documents to be written
    docs = []
    for doc_id, terms in mesh_terms.items():
        if doc_id not in all_es_ids:
            continue
        try:
            _filter = Abstracts.application_id == doc_id
            abstract = session.query(Abstracts).filter(_filter).one()
        except NoResultFound:
            logging.warning(f'{doc_id} not found in database')
            raise NoResultFound(doc_id)
        clean_abstract_text = clean_abstract(abstract.abstract_text)
        docs.append({'doc_id': doc_id,
                     'terms_mesh_abstract': terms,
                     'textBody_abstract_project': clean_abstract_text})
        # Also attach the same terms to any duplicates of this doc
        duped_docs = dupes.get(doc_id, [])
        if len(duped_docs) > 0:
            logging.info(f'Found {len(duped_docs)} duplicates')
        for duped_doc in duped_docs:
            docs.append({'doc_id': duped_doc,
                         'terms_mesh_abstract': terms,
                         'textBody_abstract_project': clean_abstract_text,
                         'booleanFlag_duplicate_abstract': True})

    # Output to elasticsearch
    logging.warning(f'Writing {len(docs)} documents to elasticsearch')
    for doc in docs:
        uid = doc.pop("doc_id")
        # Extract existing info
        existing = es.get(es_config['index'],
                          doc_type=es_config['type'],
                          id=uid)['_source']
        # Merge existing info into the new doc
        doc = {**existing, **doc}
        es.index(index=es_config['index'],
                 doc_type=es_config['type'],
                 id=uid, body=doc)
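The read-merge-write loop at the end relies on dict-merge semantics to preserve fields already indexed; a tiny worked example of that behaviour:

# Keys in the new doc win; everything else in the existing
# document is preserved.
existing = {'title': 'Old title', 'terms_mesh_abstract': ['old']}
doc = {'terms_mesh_abstract': ['A01', 'B02']}
merged = {**existing, **doc}
assert merged == {'title': 'Old title',
                  'terms_mesh_abstract': ['A01', 'B02']}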
def test_get_es_ids(mocked_scan):
    ids = get_es_ids(mock.MagicMock(), mock.MagicMock())
    assert ids == {1, 22.3, 3.3}
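The mocked_scan argument implies that the scan helper underlying get_es_ids is patched; a plausible setup, in which both the patch target and the return value are assumptions, would be:

# Hypothetical wiring for the test above: patch scan where
# get_es_ids imports it, and have it yield three hits whose
# _id values should be collected.
@mock.patch('orm_utils.scan')  # hypothetical patch target
def test_get_es_ids(mocked_scan):
    mocked_scan.return_value = iter([{'_id': 1},
                                     {'_id': 22.3},
                                     {'_id': 3.3}])
    ids = get_es_ids(mock.MagicMock(), mock.MagicMock())
    assert ids == {1, 22.3, 3.3}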
def prepare(self):
    '''Chunk up the organisation ids not yet in elasticsearch,
    and submit batch jobs over those chunks.'''
    if self.test:
        self.process_batch_size = 1000
        logging.warning("Batch size restricted to "
                        f"{self.process_batch_size}"
                        " while in test mode")

    # MySQL setup
    database = 'dev' if self.test else 'production'
    engine = get_mysql_engine(self.db_config_env, self.db_section,
                              database)

    # Elasticsearch setup
    es_mode = 'dev' if self.test else 'prod'
    es, es_config = setup_es(es_mode, self.test,
                             self.drop_and_recreate,
                             dataset=self.dataset,
                             aliases=self.aliases)

    # Get the set of existing ids from elasticsearch via scroll
    existing_ids = get_es_ids(es, es_config)
    logging.info(f"Collected {len(existing_ids)} existing IDs in "
                 "Elasticsearch")

    # Get the set of all organisations from mysql
    with db_session(engine) as session:
        result = session.query(self.id_field).all()
        all_ids = {r[0] for r in result}
    logging.info(f"{len(all_ids)} organisations in MySQL")

    # Remove previously processed ids
    ids_to_process = (org for org in all_ids
                      if org not in existing_ids)

    job_params = []
    for count, batch in enumerate(
            split_batches(ids_to_process,
                          self.process_batch_size), 1):
        # Write batch of ids to s3
        batch_file = put_s3_batch(batch,
                                  self.intermediate_bucket,
                                  self.routine_id)
        params = {
            "batch_file": batch_file,
            "config": 'mysqldb.config',
            "db_name": database,
            "bucket": self.intermediate_bucket,
            "done": False,
            'outinfo': es_config['host'],
            'out_port': es_config['port'],
            'out_index': es_config['index'],
            'out_type': es_config['type'],
            'aws_auth_region': es_config['region'],
            'entity_type': self.entity_type,
            'test': self.test,
            'routine_id': self.routine_id
        }
        params.update(self.kwargs)
        logging.info(params)
        job_params.append(params)
        if self.test and count > 1:
            logging.warning("Breaking after 2 batches while in "
                            "test mode.")
            logging.warning(job_params)
            break
    logging.info("Batch preparation completed, "
                 f"with {len(job_params)} batches")
    return job_params
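put_s3_batch is assumed to persist a batch of ids to S3 and return a handle the batch job can later fetch. A minimal sketch with boto3; the key format and serialisation are assumptions:

import json
from uuid import uuid4

import boto3

# A minimal sketch, assuming the helper serialises the ids as JSON
# under a routine-scoped key and returns that key. The real helper
# may differ.
def put_s3_batch(batch, bucket, routine_id):
    key = f'{routine_id}-{uuid4()}'
    boto3.resource('s3').Object(bucket, key).put(
        Body=json.dumps(list(batch)))
    return key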