Example #1
    def prepare(self):
        if self.test:
            self.process_batch_size = 1000
            logging.warning("Batch size restricted to "
                            f"{self.process_batch_size}"
                            " while in test mode")

        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode,
                                 self.test,
                                 self.drop_and_recreate,
                                 dataset='nih',
                                 aliases='health_scanner',
                                 increment_version=True)

        # Collect article IDs from the old index
        _old_config = es_config.copy()
        _old_config['index'] = es_config['old_index']
        logging.info("Collecting article IDs...")
        _ids = get_es_ids(es, _old_config, size=10000)
        logging.info(f"Collected {len(_ids)} IDs")
        done_ids = get_es_ids(es, es_config, size=10000)

        # Generate the job params
        job_params = []
        batches = split_batches(_ids, self.process_batch_size)
        for count, batch in enumerate(batches, 1):
            # Magical '0.3' is the lower end of the deduplication
            # fraction found by inspection
            done = sum(_id in done_ids for _id in batch) / len(batch) > 0.3
            # write batch of ids to s3
            batch_file = ''
            if not done:
                batch_file = put_s3_batch(batch, self.intermediate_bucket,
                                          self.routine_id)
            params = {
                "batch_file": batch_file,
                "config": 'mysqldb.config',
                "bucket": self.intermediate_bucket,
                "done": done,
                'outinfo': es_config['host'],
                'out_port': es_config['port'],
                'out_index': es_config['index'],
                'in_index': es_config['old_index'],
                'out_type': es_config['type'],
                'aws_auth_region': es_config['region'],
                'entity_type': 'paper',
                'test': self.test,
                'routine_id': self.routine_id
            }

            job_params.append(params)
            if self.test and count > 1:
                logging.warning("Breaking after 2 batches "
                                "while in test mode.")
                logging.warning(job_params)
                break
        logging.info("Batch preparation completed, "
                     f"with {len(job_params)} batches")
        return job_params
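This and the following examples chunk the ids with a split_batches helper that is not shown on this page. A minimal sketch of what such a helper could look like, assuming it simply yields successive lists of at most batch_size ids (the library's real implementation may differ):

def split_batches(ids, batch_size):
    # Hypothetical sketch, not the library's actual implementation:
    # yield the ids in consecutive chunks of at most batch_size items.
    batch = []
    for _id in ids:
        batch.append(_id)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # trailing partial batch
        yield batch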
Example #2
    def prepare(self):
        '''Chunk up elasticsearch data, and submit batch
        jobs over those chunks.'''
        if self.test:
            self.process_batch_size = 1000
            logging.warning("Batch size restricted to "
                            f"{self.process_batch_size}"
                            " while in test mode")

        # Setup elasticsearch and extract all ids
        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode,
                                 self.test,
                                 drop_and_recreate=False,
                                 dataset=self.dataset,
                                 increment_version=False)
        ids = get_es_ids(es, es_config, size=10000)  # All ids in this index
        ids = ids - self._done_ids  # Don't repeat done ids

        # Override the default index if specified
        es_config['index'] = (self.index if self.index is not None else
                              es_config['index'])

        # Generate the job params
        job_params = []
        batches = split_batches(ids, self.process_batch_size)
        for count, batch in enumerate(batches, 1):
            done = False  # Already taken care of with _done_ids
            # write batch of ids to s3
            batch_file = ''
            if not done:
                batch_file = put_s3_batch(batch, self.intermediate_bucket,
                                          self.routine_id)
            params = {
                "batch_file": batch_file,
                "config": self.sql_config_filename,
                "bucket": self.intermediate_bucket,
                "done": done,
                "count": len(ids),
                'outinfo': es_config['host'],
                'out_port': es_config['port'],
                'index': es_config['index'],
                'out_type': es_config['type'],
                'aws_auth_region': es_config['region'],
                'test': self.test,
                'routine_id': self.routine_id,
                'entity_type': self.entity_type,
                **self.kwargs
            }
            job_params.append(params)
            # Test mode
            if self.test and count > 1:
                logging.warning("Breaking after 2 batches "
                                "while in test mode.")
                logging.warning(job_params)
                break
        # Done
        logging.info("Batch preparation completed, "
                     f"with {len(job_params)} batches")
        return job_params
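Both prepare() variants hand each batch to put_s3_batch and pass the returned key to the batch job. A minimal sketch of such a helper, assuming it writes the ids as a newline-separated object into the intermediate bucket and returns the object key (the key scheme and serialisation are assumptions):

import uuid

import boto3


def put_s3_batch(batch, bucket, routine_id):
    # Hypothetical sketch, not the library's actual implementation.
    key = f"{routine_id}-{uuid.uuid4()}"                 # assumed key scheme
    body = "\n".join(str(_id) for _id in batch)          # assumed serialisation
    boto3.resource('s3').Object(bucket, key).put(Body=body.encode('utf-8'))
    return key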
Example #3
    def done_ids(self):
        '''Return the ids of documents which already have the
        terms_tokens_article field.'''
        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode, self.test,
                                 drop_and_recreate=False,
                                 dataset=self.dataset,
                                 increment_version=False)
        field = "terms_tokens_article"
        ids = get_es_ids(es, es_config, size=10000,
                         query={"query": {"exists": {"field": field}}})
        return ids
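For context, get_es_ids is expected to return the set of document ids in the configured index, optionally filtered by a query as above. A minimal sketch, assuming it is built on elasticsearch.helpers.scan and that es_config carries 'index' and 'type' keys (the exact signature in the library may differ):

from elasticsearch.helpers import scan


def get_es_ids(es, es_config, size=1000, query=None):
    # Hypothetical sketch, not the library's actual implementation:
    # scroll over the index and collect the id of every matching document.
    query = dict(query or {})
    query["_source"] = False          # ids only, skip the document bodies
    hits = scan(es, query=query,
                index=es_config['index'],
                doc_type=es_config['type'],
                size=size)
    return {hit['_id'] for hit in hits}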
Example #4
def subset_keys(es, es_config, keys):
    '''Keep only the keys whose id range overlaps ids already in the index.'''
    all_idxs = get_es_ids(es, es_config)
    _keys = set()
    for key in keys:
        if key in _keys:
            continue
        first_idx, last_idx = split_mesh_file_key(key)
        # Keep the key if any existing id falls inside its range
        for idx in all_idxs:
            if int(first_idx) <= int(idx) <= int(last_idx):
                _keys.add(key)
                break
    return _keys
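A usage sketch, assuming split_mesh_file_key returns the first and last numeric id covered by a key (the key naming scheme below is purely hypothetical):

# Hypothetical keys, each covering a contiguous range of ids:
keys = ['abstracts_0-99.txt', 'abstracts_100-199.txt']
# Only the keys whose range overlaps ids already in the index are returned.
keys_to_process = subset_keys(es, es_config, keys)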
Example #5
def run():
    bucket = os.environ["BATCHPAR_s3_bucket"]
    abstract_file = os.environ["BATCHPAR_s3_key"]
    dupe_file = os.environ["BATCHPAR_dupe_file"]
    es_config = literal_eval(os.environ["BATCHPAR_outinfo"])
    db = os.environ["BATCHPAR_db"]
    entity_type = os.environ["BATCHPAR_entity_type"]

    # mysql setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db)
    Session = sessionmaker(bind=engine)
    session = Session()

    # retrieve a batch of meshed terms
    mesh_terms = retrieve_mesh_terms(bucket, abstract_file)
    mesh_terms = format_mesh_terms(mesh_terms)
    logging.info(f'batch {abstract_file} contains '
                 f'{len(mesh_terms)} meshed abstracts')

    # retrieve duplicate map
    dupes = retrieve_duplicate_map(bucket, dupe_file)
    dupes = format_duplicate_map(dupes)

    # Set up elastic search connection
    field_null_mapping = load_json_from_pathstub(
        "tier_1/"
        "field_null_mappings/", "health_scanner.json")
    es = ElasticsearchPlus(hosts=es_config['host'],
                           port=es_config['port'],
                           aws_auth_region=es_config['region'],
                           use_ssl=True,
                           entity_type=entity_type,
                           strans_kwargs=None,
                           field_null_mapping=field_null_mapping,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           country_detection=True,
                           listify_terms=True)
    all_es_ids = get_es_ids(es, es_config)

    docs = []
    for doc_id, terms in mesh_terms.items():
        if doc_id not in all_es_ids:
            continue
        try:
            _filter = Abstracts.application_id == doc_id
            abstract = (session.query(Abstracts).filter(_filter).one())
        except NoResultFound:
            logging.warning(f'{doc_id} not found in the database')
            raise NoResultFound(doc_id)
        clean_abstract_text = clean_abstract(abstract.abstract_text)
        docs.append({
            'doc_id': doc_id,
            'terms_mesh_abstract': terms,
            'textBody_abstract_project': clean_abstract_text
        })
        duped_docs = dupes.get(doc_id, [])
        if len(duped_docs) > 0:
            logging.info(f'Found {len(duped_docs)} duplicates')
        for duped_doc in duped_docs:
            docs.append({
                'doc_id': duped_doc,
                'terms_mesh_abstract': terms,
                'textBody_abstract_project': clean_abstract_text,
                'booleanFlag_duplicate_abstract': True
            })

    # output to elasticsearch
    logging.warning(f'Writing {len(docs)} documents to elasticsearch')
    for doc in docs:
        uid = doc.pop("doc_id")
        # Extract existing info
        existing = es.get(es_config['index'],
                          doc_type=es_config['type'],
                          id=uid)['_source']
        # Merge existing info into new doc
        doc = {**existing, **doc}
        es.index(index=es_config['index'],
                 doc_type=es_config['type'],
                 id=uid,
                 body=doc)
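The read-merge-reindex loop at the end could also be written as a partial update, which lets Elasticsearch merge the new fields into the stored document; a sketch using the plain client API (if ElasticsearchPlus applies its extra processing only in index(), the explicit merge above remains the safer route):

# Partial update of a single document: the fields in 'doc' are merged
# into the stored '_source' instead of replacing it wholesale.
es.update(index=es_config['index'],
          doc_type=es_config['type'],
          id=uid,
          body={"doc": doc})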
Example #6
def test_get_es_ids(mocked_scan):
    ids = get_es_ids(mock.MagicMock(), mock.MagicMock())
    assert ids == {1, 22.3, 3.3}
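Only the test body is shown above; the mocked_scan argument implies that the scroll helper used inside get_es_ids is patched. A possible setup, where the patch target and the module path are assumptions for illustration (each fake hit only needs the '_id' field that get_es_ids collects):

from unittest import mock

from orm_utils import get_es_ids  # assumed module path


@mock.patch('orm_utils.scan',  # assumed patch target
            return_value=[{'_id': 1}, {'_id': 22.3}, {'_id': 3.3}])
def test_get_es_ids(mocked_scan):
    ids = get_es_ids(mock.MagicMock(), mock.MagicMock())
    assert ids == {1, 22.3, 3.3}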
Example #7
    def prepare(self):
        if self.test:
            self.process_batch_size = 1000
            logging.warning("Batch size restricted to "
                            f"{self.process_batch_size}"
                            " while in test mode")

        # MySQL setup
        database = 'dev' if self.test else 'production'
        engine = get_mysql_engine(self.db_config_env, self.db_section,
                                  database)

        # Elasticsearch setup
        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode,
                                 self.test,
                                 self.drop_and_recreate,
                                 dataset=self.dataset,
                                 aliases=self.aliases)

        # Get set of existing ids from elasticsearch via scroll
        existing_ids = get_es_ids(es, es_config)
        logging.info(f"Collected {len(existing_ids)} existing in "
                     "Elasticsearch")

        # Get set of all organisations from mysql
        with db_session(engine) as session:
            result = session.query(self.id_field).all()
            all_ids = {r[0] for r in result}
        logging.info(f"{len(all_ids)} organisations in MySQL")

        # Remove previously processed
        ids_to_process = (org for org in all_ids if org not in existing_ids)

        job_params = []
        for count, batch in enumerate(
                split_batches(ids_to_process, self.process_batch_size), 1):
            # write batch of ids to s3
            batch_file = put_s3_batch(batch, self.intermediate_bucket,
                                      self.routine_id)
            params = {
                "batch_file": batch_file,
                "config": 'mysqldb.config',
                "db_name": database,
                "bucket": self.intermediate_bucket,
                "done": False,
                'outinfo': es_config['host'],
                'out_port': es_config['port'],
                'out_index': es_config['index'],
                'out_type': es_config['type'],
                'aws_auth_region': es_config['region'],
                'entity_type': self.entity_type,
                'test': self.test,
                'routine_id': self.routine_id
            }
            params.update(self.kwargs)

            logging.info(params)
            job_params.append(params)
            if self.test and count > 1:
                logging.warning("Breaking after 2 batches while in "
                                "test mode.")
                logging.warning(job_params)
                break

        logging.warning("Batch preparation completed, "
                        f"with {len(job_params)} batches")
        return job_params
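db_session here wraps a SQLAlchemy session in a context manager. A minimal sketch, assuming it commits on success, rolls back on error and always closes the session (the library's version may differ):

from contextlib import contextmanager

from sqlalchemy.orm import sessionmaker


@contextmanager
def db_session(engine):
    # Hypothetical sketch, not the library's actual implementation.
    session = sessionmaker(bind=engine)()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()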