Example #1
def test_setup_es_bad_es_mode(mock_get_es_mapping, mock_Elasticsearch,
                              mock_assert_correct_config, mock_get_config):
    with pytest.raises(ValueError):
        setup_es(es_mode="dave",
                 test_mode=False,
                 drop_and_recreate=False,
                 dataset=None,
                 aliases=None)
Example #2
def test_setup_es_no_create_if_exists(mock_get_es_mapping, mock_Elasticsearch,
                                      mock_assert_correct_config,
                                      mock_get_config):
    mock_Elasticsearch.return_value.indices.exists.return_value = True
    setup_es(es_mode="dev",
             test_mode=True,
             drop_and_recreate=False,
             dataset=None,
             aliases=None)
    assert mock_Elasticsearch.return_value.indices.delete.call_count == 0
    assert mock_Elasticsearch.return_value.indices.create.call_count == 0
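The two tests above exercise setup_es's keyword interface. A minimal sketch of a direct call, assuming the nesta package is installed and its Elasticsearch config is present; the 'arxiv' dataset name is borrowed from Example #8 below, everything else follows the calls in these examples:

# A minimal sketch of calling setup_es directly, assuming nesta is installed
# and its elasticsearch config is available; argument names follow the tests above.
from nesta.core.orms.orm_utils import setup_es

es, es_config = setup_es(es_mode='dev',          # 'prod' in production runs
                         test_mode=True,
                         drop_and_recreate=False,
                         dataset='arxiv',        # any configured dataset name
                         aliases=None)
# The returned config carries the connection details used throughout these examples
print(es_config['host'], es_config['port'], es_config['index'])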
Example #3
    def prepare(self):
        if self.test:
            self.process_batch_size = 1000
            logging.warning("Batch size restricted to "
                            f"{self.process_batch_size}"
                            " while in test mode")

        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode,
                                 self.test,
                                 self.drop_and_recreate,
                                 dataset='nih',
                                 aliases='health_scanner',
                                 increment_version=True)

        # Count articles from the old index
        _old_config = es_config.copy()
        _old_config['index'] = es_config['old_index']
        logging.info(f"Collected article IDs...")
        _ids = get_es_ids(es, _old_config, size=10000)
        logging.info(f"Collected {len(_ids)} IDs")
        done_ids = get_es_ids(es, es_config, size=10000)

        # Generate the job params
        job_params = []
        batches = split_batches(_ids, self.process_batch_size)
        for count, batch in enumerate(batches, 1):
            # Magical '0.3' is the lower end of the deduplication
            # fraction found by inspection
            done = sum(_id in done_ids for _id in batch) / len(batch) > 0.3
            # write batch of ids to s3
            batch_file = ''
            if not done:
                batch_file = put_s3_batch(batch, self.intermediate_bucket,
                                          self.routine_id)
            params = {
                "batch_file": batch_file,
                "config": 'mysqldb.config',
                "bucket": self.intermediate_bucket,
                "done": done,
                'outinfo': es_config['host'],
                'out_port': es_config['port'],
                'out_index': es_config['index'],
                'in_index': es_config['old_index'],
                'out_type': es_config['type'],
                'aws_auth_region': es_config['region'],
                'entity_type': 'paper',
                'test': self.test,
                'routine_id': self.routine_id
            }

            job_params.append(params)
            if self.test and count > 1:
                logging.warning("Breaking after 2 batches "
                                "while in test mode.")
                logging.warning(job_params)
                break
        logging.info("Batch preparation completed, "
                     f"with {len(job_params)} batches")
        return job_params
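The "magical 0.3" above marks a batch as done when more than 30% of its IDs are already present in the new index. A standalone sketch of that check, with made-up ID values:

# Hedged sketch of the deduplication heuristic in prepare() above;
# the ID values below are illustrative only.
done_ids = {"a1", "a2", "a3"}             # IDs already present in the new index
batch = ["a1", "a2", "b1", "b2", "b3"]    # one batch drawn from the old index
fraction = sum(_id in done_ids for _id in batch) / len(batch)   # 2 / 5 = 0.4
done = fraction > 0.3                     # True, so the batch is marked as already processed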
Example #4
File: estask.py Project: yitzikc/nesta
    def prepare(self):
        '''Chunk up elasticsearch data, and submit batch
        jobs over those chunks.'''
        if self.test:
            self.process_batch_size = 1000
            logging.warning("Batch size restricted to "
                            f"{self.process_batch_size}"
                            " while in test mode")

        # Setup elasticsearch and extract all ids
        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode,
                                 self.test,
                                 drop_and_recreate=False,
                                 dataset=self.dataset,
                                 increment_version=False)
        ids = get_es_ids(es, es_config, size=10000)  # All ids in this index
        ids = ids - self._done_ids  # Don't repeat done ids

        # Override the default index if specified
        es_config['index'] = (self.index if self.index is not None else
                              es_config['index'])

        # Generate the job params
        job_params = []
        batches = split_batches(ids, self.process_batch_size)
        for count, batch in enumerate(batches, 1):
            done = False  # Already taken care of with _done_ids
            # write batch of ids to s3
            batch_file = ''
            if not done:
                batch_file = put_s3_batch(batch, self.intermediate_bucket,
                                          self.routine_id)
            params = {
                "batch_file": batch_file,
                "config": self.sql_config_filename,
                "bucket": self.intermediate_bucket,
                "done": done,
                "count": len(ids),
                'outinfo': es_config['host'],
                'out_port': es_config['port'],
                'index': es_config['index'],
                'out_type': es_config['type'],
                'aws_auth_region': es_config['region'],
                'test': self.test,
                'routine_id': self.routine_id,
                'entity_type': self.entity_type,
                **self.kwargs
            }
            job_params.append(params)
            # Test mode
            if self.test and count > 1:
                logging.warning("Breaking after 2 batches "
                                "while in test mode.")
                logging.warning(job_params)
                break
        # Done
        logging.info("Batch preparation completed, "
                     f"with {len(job_params)} batches")
        return job_params
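Examples #3 and #4 both chunk their IDs with split_batches before writing each chunk to S3. The helper comes from nesta; a rough, hypothetical stand-in with the behaviour these examples appear to rely on (at most process_batch_size IDs per chunk) might look like this:

# Rough stand-in for nesta's split_batches, shown only to illustrate how the
# examples consume it; the real implementation may differ.
def split_batches_sketch(items, batch_size):
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:                      # final, possibly short, batch
        yield batch

# e.g. list(split_batches_sketch(range(5), 2)) == [[0, 1], [2, 3], [4]]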
Example #5
    def done_ids(self):
        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode, self.test,
                                 drop_and_recreate=False,
                                 dataset=self.dataset,
                                 increment_version=False)
        field = "terms_tokens_article"
        ids = get_es_ids(es, es_config, size=10000,
                         query={"query": {"exists": {"field": field}}})
        return ids
Example #6
    def prepare(self):
        # mysql setup
        db = 'production' if not self.test else 'dev'
        engine = get_mysql_engine(MYSQLDB_ENV, "mysqldb", db)
        Session = sessionmaker(bind=engine)
        session = Session()
        project_query = session.query(Projects)

        # elasticsearch setup
        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode,
                                 self.test,
                                 self.drop_and_recreate,
                                 dataset='nih',
                                 aliases='health_scanner')

        batches = self.batch_limits(project_query, BATCH_SIZE)
        job_params = []
        for start, end in batches:
            params = {
                'start_index': start,
                'end_index': end,
                'config': "mysqldb.config",
                'db': db,
                'outinfo': es_config['host'],
                'out_port': es_config['port'],
                'out_index': es_config['index'],
                'out_type': es_config['type'],
                'aws_auth_region': es_config['region'],
                'done': es.exists(index=es_config['index'],
                                  doc_type=es_config['type'],
                                  id=end),
                'entity_type': 'paper'
            }
            job_params.append(params)
        return job_params
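The 'done' flag in Example #6 asks Elasticsearch whether the last document of each batch already exists, so previously loaded ranges are skipped. A minimal standalone sketch of that check, assuming the elasticsearch-py client and illustrative host, index and id values:

# Hedged sketch of the existence check behind the 'done' flag above;
# the host, index, doc type and id are illustrative.
from elasticsearch import Elasticsearch

es = Elasticsearch("https://example-domain.eu-west-2.es.amazonaws.com:443")
already_loaded = es.exists(index="nih_dev", doc_type="_doc", id="87654")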
Example #7
    def prepare(self):
        # mysql setup
        db = 'production' if not self.test else 'dev'

        # elasticsearch setup
        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode,
                                 self.test,
                                 drop_and_recreate=False,
                                 dataset='nih',
                                 aliases='health_scanner')

        # s3 setup and file key collection
        bucket = 'innovation-mapping-general'
        key_prefix = 'nih_abstracts_processed/22-07-2019/nih_'
        keys = self.get_abstract_file_keys(bucket, key_prefix)
        logging.info(f"Found keys: {keys}")

        # In test mode, manually filter keys for those which
        # contain our data
        if self.test:
            keys = subset_keys(es, es_config, keys)

        job_params = []
        for key in keys:
            done = ((not self.test)
                    and self.done_check(es,
                                        index=es_config['index'],
                                        doc_type=es_config['type'],
                                        key=key))
            params = {
                's3_key': key,
                's3_bucket': bucket,
                'dupe_file': ("nih_abstracts/24-05-19/"
                              "duplicate_mapping.json"),
                'config': "mysqldb.config",
                'db': db,
                'outinfo': es_config,
                'done': done,
                'entity_type': 'paper'
            }
            logging.info(params)
            job_params.append(params)
        return job_params
Example #8
    def requires(self):
        '''Collects the database configurations
        and executes the central task.'''
        _routine_id = "{}-{}".format(self.date, self.production)
        grid_task_kwargs = {
            '_routine_id': _routine_id,
            'db_config_path': self.db_config_path,
            'db_config_env': 'MYSQLDB',
            'mag_config_path': 'mag.config',
            'test': not self.production,
            'insert_batch_size': self.insert_batch_size,
            'articles_from_date': self.articles_from_date,
            'date': self.date,
        }

        cherry_picked = (f'automl/{self.date}/COREX_TOPIC_MODEL'
                         '.n_hidden_27-0.VECTORIZER.binary_True'
                         '.min_df_0-001.NGRAM.TEST_False.json')
        if not self.production:
            cherry_picked = (f'automl/{self.date}/COREX_TOPIC_MODEL'
                             '.n_hidden_36-0.VECTORIZER.binary_True'
                             '.min_df_0-001.NGRAM.TEST_True.json')

        kwargs = {
            'score_field': 'metric_novelty_article',
            'fields': ['textBody_abstract_article']
        }
        test = not self.production
        routine_id = f"ArxivLolveltyTask-{self.date}-{test}"

        # Elasticsearch setup
        dataset = 'arxiv'
        _, es_config = setup_es('prod' if self.production else 'dev',
                                not self.production,
                                self.drop_and_recreate,
                                dataset=dataset)
        yield ArxivElasticsearchTask(date=self.date,
                                     routine_id=routine_id,
                                     grid_task_kwargs=grid_task_kwargs,
                                     test=not self.production,
                                     index=es_config['index'],
                                     dataset='arxiv',
                                     entity_type='article',
                                     kwargs=kwargs,
                                     batchable=f3p("batchables/novelty"
                                                   "/lolvelty"),
                                     env_files=[
                                         f3p("nesta/"),
                                         f3p("config/mysqldb.config"),
                                         f3p("config/"
                                             "elasticsearch.config")
                                     ],
                                     job_def="py36_amzn1_image",
                                     job_name=routine_id,
                                     job_queue="HighPriority",
                                     region_name="eu-west-2",
                                     poll_time=10,
                                     memory=1024,
                                     max_live_jobs=30)

        yield AnalysisTask(date=self.date,
                           grid_task_kwargs=grid_task_kwargs,
                           _routine_id=_routine_id,
                           db_config_path=self.db_config_path,
                           db_config_env='MYSQLDB',
                           mag_config_path='mag.config',
                           test=not self.production,
                           insert_batch_size=self.insert_batch_size,
                           articles_from_date=self.articles_from_date,
                           cherry_picked=cherry_picked)
Example #9
    logging.warning("Batch job complete.")


if __name__ == "__main__":
    log_stream_handler = logging.StreamHandler()
    logging.basicConfig(handlers=[
        log_stream_handler,
    ],
                        level=logging.INFO,
                        format="%(asctime)s:%(levelname)s:%(message)s")

    if 'BATCHPAR_outinfo' not in os.environ:
        from nesta.core.orms.orm_utils import setup_es
        es, es_config = setup_es('dev',
                                 True,
                                 True,
                                 dataset='crunchbase',
                                 aliases='health_scanner')

        environ = {
            "AWSBATCHTEST": "",
            'BATCHPAR_batch_file': 'crunchbase_to_es-15597291977144725.json',
            'BATCHPAR_config': ('/home/ec2-user/nesta/nesta/'
                                'core/config/mysqldb.config'),
            'BATCHPAR_db_name': 'production',
            'BATCHPAR_bucket': 'nesta-production-intermediate',
            'BATCHPAR_done':
Example #10
File: run.py Project: hmessafi/nesta
                    grid_institutes[g].title() for g in good_institutes
                ]

            uid = row.pop('id')
            _row = es.index(index=es_index, doc_type=es_type, id=uid, body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to " "elasticsearch")

    logging.warning("Batch job complete.")


if __name__ == "__main__":
    set_log_level()
    if 'BATCHPAR_outinfo' not in os.environ:
        from nesta.core.orms.orm_utils import setup_es
        es, es_config = setup_es('dev', True, True, dataset='arxiv-eu')
        environ = {
            'config': ('/home/ec2-user/nesta-eu/nesta/'
                       'core/config/mysqldb.config'),
            'batch_file': ('arxiv-eu_EURITO-ElasticsearchTask-'
                           '2019-10-12-True-157124660046601.json'),
            'db_name': 'dev',
            'bucket': 'nesta-production-intermediate',
            'done': "False",
            'outinfo': ('https://search-eurito-dev-'
                        'vq22tw6otqjpdh47u75bh2g7ba.'
                        'eu-west-2.es.amazonaws.com'),
            'out_port':
Example #11
    def prepare(self):
        if self.test:
            self.process_batch_size = 1000
            logging.warning("Batch size restricted to "
                            f"{self.process_batch_size}"
                            " while in test mode")

        # MySQL setup
        self.database = 'dev' if self.test else 'production'
        engine = get_mysql_engine(self.db_config_env, 'mysqldb', self.database)

        # Elasticsearch setup
        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode,
                                 self.test,
                                 self.drop_and_recreate,
                                 dataset='crunchbase',
                                 aliases='health_scanner')

        # Get set of existing ids from elasticsearch via scroll
        scanner = scan(es,
                       query={"_source": False},
                       index=es_config['index'],
                       doc_type=es_config['type'])
        existing_ids = {s['_id'] for s in scanner}
        logging.info(f"Collected {len(existing_ids)} existing in "
                     "Elasticsearch")

        # Get set of all organisations from mysql
        all_orgs = list(all_org_ids(engine))
        logging.info(f"{len(all_orgs)} organisations in MySQL")

        # Remove previously processed
        orgs_to_process = list(org for org in all_orgs
                               if org not in existing_ids)
        logging.info(f"{len(orgs_to_process)} to be processed")

        job_params = []
        for count, batch in enumerate(
                split_batches(orgs_to_process, self.process_batch_size), 1):
            logging.info(f"Processing batch {count} with size {len(batch)}")

            # write batch of ids to s3
            batch_file = put_s3_batch(batch, self.intermediate_bucket,
                                      'crunchbase_to_es')
            params = {
                "batch_file": batch_file,
                "config": 'mysqldb.config',
                "db_name": self.database,
                "bucket": self.intermediate_bucket,
                "done": False,
                'outinfo': es_config['host'],
                'out_port': es_config['port'],
                'out_index': es_config['index'],
                'out_type': es_config['type'],
                'aws_auth_region': es_config['region'],
                'entity_type': 'company',
                "test": self.test
            }

            logging.info(params)
            job_params.append(params)
            if self.test and count > 1:
                logging.warning("Breaking after 2 batches while in "
                                "test mode.")
                break

        logging.warning("Batch preparation completed, "
                        f"with {len(job_params)} batches")
        return job_params
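Examples #11 and #13 (below) both collect the set of already-indexed IDs before batching, the former via a raw scroll and the latter via get_es_ids. A minimal standalone sketch of the scroll variant, assuming elasticsearch-py's helpers.scan and an illustrative host and index:

# Hedged sketch of collecting existing document IDs with a scroll, as in
# Example #11 above; host and index names are illustrative.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

es = Elasticsearch("https://example-domain.eu-west-2.es.amazonaws.com:443")
scanner = scan(es,
               query={"_source": False},     # IDs only, no document bodies
               index="companies_dev",
               doc_type="_doc")
existing_ids = {hit['_id'] for hit in scanner}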
Example #12
File: run.py Project: yitzikc/nesta
            row = object_to_dict(obj)
            row = reformat_row(row)
            es.index(index=es_index,
                     doc_type=es_type,
                     id=row.pop('rcn'),
                     body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to " "elasticsearch")


if __name__ == "__main__":
    set_log_level()
    if 'BATCHPAR_outinfo' not in os.environ:
        from nesta.core.orms.orm_utils import setup_es
        from nesta.core.luigihacks.misctools import find_filepath_from_pathstub
        es, es_config = setup_es('dev', True, True, dataset='cordis-eu')
        environ = {
            'config': find_filepath_from_pathstub('mysqldb.config'),
            'batch_file': ('cordis-eu_EURITO-ElasticsearchTask-'
                           '2020-04-10-True-15865345336407135.json'),
            'db_name': 'dev',
            'bucket': 'nesta-production-intermediate',
            'outinfo': es_config['host'],
            'out_port': es_config['port'],
            'out_index': es_config['index'],
Example #13
File: sql2estask.py Project: yitzikc/nesta
    def prepare(self):
        if self.test:
            self.process_batch_size = 1000
            logging.warning("Batch size restricted to "
                            f"{self.process_batch_size}"
                            " while in test mode")

        # MySQL setup
        database = 'dev' if self.test else 'production'
        engine = get_mysql_engine(self.db_config_env, self.db_section,
                                  database)

        # Elasticsearch setup
        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode,
                                 self.test,
                                 self.drop_and_recreate,
                                 dataset=self.dataset,
                                 aliases=self.aliases)

        # Get set of existing ids from elasticsearch via scroll
        existing_ids = get_es_ids(es, es_config)
        logging.info(f"Collected {len(existing_ids)} existing in "
                     "Elasticsearch")

        # Get set of all organisations from mysql
        with db_session(engine) as session:
            result = session.query(self.id_field).all()
            all_ids = {r[0] for r in result}
        logging.info(f"{len(all_ids)} organisations in MySQL")

        # Remove previously processed
        ids_to_process = (org for org in all_ids if org not in existing_ids)

        job_params = []
        for count, batch in enumerate(
                split_batches(ids_to_process, self.process_batch_size), 1):
            # write batch of ids to s3
            batch_file = put_s3_batch(batch, self.intermediate_bucket,
                                      self.routine_id)
            params = {
                "batch_file": batch_file,
                "config": 'mysqldb.config',
                "db_name": database,
                "bucket": self.intermediate_bucket,
                "done": False,
                'outinfo': es_config['host'],
                'out_port': es_config['port'],
                'out_index': es_config['index'],
                'out_type': es_config['type'],
                'aws_auth_region': es_config['region'],
                'entity_type': self.entity_type,
                'test': self.test,
                'routine_id': self.routine_id
            }
            params.update(self.kwargs)

            logging.info(params)
            job_params.append(params)
            if self.test and count > 1:
                logging.warning("Breaking after 2 batches while in "
                                "test mode.")
                logging.warning(job_params)
                break

        logging.warning("Batch preparation completed, "
                        f"with {len(job_params)} batches")
        return job_params
Example #14
        es.index(index=es_index, 
                 doc_type=es_type, id=uid, body=row)

    # Also upload the data to S3
    silo.put(data, dataset)


if __name__ == "__main__":
    log_stream_handler = logging.StreamHandler()
    logging.basicConfig(handlers=[log_stream_handler, ],
                        level=logging.INFO,
                        format="%(asctime)s:%(levelname)s:%(message)s")

    if 'BATCHPAR_outinfo' not in os.environ:
        es, es_config = setup_es(es_mode="dev", test_mode=True, 
                                 drop_and_recreate=True, 
                                 dataset='example', 
                                 aliases='example')

        #environ = {"AWSBATCHTEST": "",  ## << This means don't write to ES
        environ = {"BATCHPAR_aws_auth_region": es_config["region"],
                   "BATCHPAR_outinfo": es_config["host"],
                   "BATCHPAR_dataset" : "example",
                   "BATCHPAR_done":"False",
                   "BATCHPAR_age_increment": "-3",
                   "BATCHPAR_start_index":"0",
                   "BATCHPAR_end_index":"3",
                   "BATCHPAR_out_type": es_config["type"],
                   "BATCHPAR_out_port": es_config["port"],
                   "BATCHPAR_out_index":es_config["index"],
                   "BATCHPAR_entity_type":"muppet"}
        for k, v in environ.items():