Example #1
    def requires(self):
        logging.getLogger().setLevel(logging.INFO)
        kwargs = {'score_field': 'rank_rhodonite_group',
                  'fields': ['name_of_group', 'textBody_descriptive_group',
                             'terms_topics_group']}
        test = not self.production
        routine_id = f"MeetupLolveltyTask-{self.date}-{test}"
        index = self.index if self.production else 'meetup_dev'
        assert index is not None
        return LazyElasticsearchTask(routine_id=routine_id,
                                     test=test,
                                     index=index,
                                     dataset='meetup',
                                     entity_type='meetup',
                                     kwargs=kwargs,
                                     batchable=f3p("batchables/novelty/lolvelty"),
                                     env_files=[f3p("nesta/"),
                                                f3p("config/mysqldb.config"),
                                                f3p("config/elasticsearch.config")],
                                     job_def="py36_amzn1_image",
                                     job_name=routine_id,
                                     job_queue="HighPriority",
                                     region_name="eu-west-2",
                                     poll_time=10,
                                     memory=1024,
                                     max_live_jobs=10)
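
All of these examples pass f3p, an alias for
nesta.core.luigihacks.misctools.find_filepath_from_pathstub, to resolve path
stubs such as "config/mysqldb.config" to full paths. A minimal sketch of what
such a helper plausibly does (hypothetical; the real implementation lives in
the nesta repository):

import os

def find_filepath_from_pathstub(path_stub):
    """Hypothetical sketch: walk the tree below the current working
    directory and return the first path whose tail matches `path_stub`."""
    for root, dirs, files in os.walk(os.getcwd()):
        for name in dirs + files:
            path = os.path.join(root, name)
            if path.rstrip('/').endswith(path_stub.rstrip('/')):
                return path
    raise FileNotFoundError(f"No path found matching stub '{path_stub}'")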
Example #2
    def requires(self):
        '''Collects the database configurations
        and executes the central task.'''
        _routine_id = "{}-{}".format(self.date, self.production)

        logging.getLogger().setLevel(logging.INFO)
        yield ProcessTask(
            date=self.date,
            drop_and_recreate=self.drop_and_recreate,
            _routine_id=_routine_id,
            db_config_path=self.db_config_path,
            batchable=f3p("batchables/health_data/nih_process_data"),
            env_files=[
                f3p("nesta/"),
                f3p("config/mysqldb.config"),
                f3p("config/elasticsearch.config"),
                f3p("nih.json")
            ],
            job_def="py36_amzn1_image",
            job_name="ProcessTask-%s" % _routine_id,
            job_queue="HighPriority",
            region_name="eu-west-2",
            poll_time=10,
            test=not self.production,
            memory=2048,
            max_live_jobs=2)
Example #3
    def requires(self):
        logging.getLogger().setLevel(logging.INFO)
        kwargs = {
            'score_field': 'metric_novelty_article',
            'fields': ['textBody_abstract_article']
        }
        test = not self.production
        routine_id = f"ArxivLolveltyTask-{self.date}-{test}"
        index = 'arxiv_v3' if self.production else 'arxiv_dev'
        return _ArxivElasticsearchTask(routine_id=routine_id,
                                       test=test,
                                       index=index,
                                       dataset='arxiv',
                                       entity_type='article',
                                       kwargs=kwargs,
                                       batchable=f3p("batchables/novelty/lolvelty"),
                                       env_files=[
                                           f3p("nesta/"),
                                           f3p("config/mysqldb.config"),
                                           f3p("config/elasticsearch.config")
                                       ],
                                       job_def="py36_amzn1_image",
                                       job_name=routine_id,
                                       job_queue="HighPriority",
                                       region_name="eu-west-2",
                                       poll_time=10,
                                       memory=1024,
                                       max_live_jobs=30)
Example #4
    def requires(self):
        '''Collects the database configurations
        and executes the central task.'''
        _routine_id = "{}-{}".format(self.date, self.production)

        logging.getLogger().setLevel(logging.INFO)
        yield DedupeTask(date=self.date,
                         drop_and_recreate=self.drop_and_recreate,
                         routine_id=_routine_id,
                         db_config_path=self.db_config_path,
                         process_batch_size=5000,
                         intermediate_bucket='nesta-production-intermediate',
                         test=(not self.production),
                         batchable=f3p("batchables/health_data/"
                                       "nih_dedupe"),
                         env_files=[
                             f3p("nesta/"),
                             f3p("config/mysqldb.config"),
                             f3p("config/elasticsearch.config"),
                             f3p("nih.json")
                         ],
                         job_def="py36_amzn1_image",
                         job_name="NiHDedupeTask-%s" % _routine_id,
                         job_queue="HighPriority",
                         region_name="eu-west-2",
                         poll_time=10,
                         memory=1024,
                         max_live_jobs=20)
Example #5
    def requires(self):
        '''Collects the database configurations
        and executes the central task.'''
        routine_id = "ArxivESTask-{}-{}".format(self.date, self.production)
        logging.getLogger().setLevel(logging.INFO)
        yield Sql2EsTask(routine_id=routine_id,
                         date=self.date,
                         process_batch_size=10000,
                         drop_and_recreate=self.drop_and_recreate,
                         dataset='arxiv',
                         id_field=Article.id,
                         entity_type='article',
                         db_config_env='MYSQLDB',
                         test=not self.production,
                         intermediate_bucket='nesta-production-intermediate',
                         batchable=f3p('batchables/arxiv/arxiv_elasticsearch'),
                         env_files=[
                             f3p('nesta/'),
                             f3p('config/mysqldb.config'),
                             f3p('schema_transformations/arxiv.json'),
                             f3p('config/elasticsearch.config')
                         ],
                         job_def='py36_amzn1_image',
                         job_name=routine_id,
                         job_queue='HighPriority',
                         region_name='eu-west-2',
                         memory=2048,
                         poll_time=10,
                         max_live_jobs=100)
Example #6
    def requires(self):
        '''Collects the database configurations and executes the central task.'''
        _routine_id = "{}-{}".format(self.date, self.production)

        logging.getLogger().setLevel(logging.INFO)
        yield CrunchbaseSql2EsTask(
            date=self.date,
            _routine_id=_routine_id,
            test=not self.production,
            drop_and_recreate=self.drop_and_recreate,
            db_config_env="MYSQLDB",
            insert_batch_size=self.insert_batch_size,
            process_batch_size=50000,
            intermediate_bucket='nesta-production-intermediate',
            batchable=f3p("core/batchables/crunchbase/crunchbase_elasticsearch"),
            env_files=[
                f3p("nesta/"),
                f3p("config/mysqldb.config"),
                f3p("schema_transformations/crunchbase_organisation_members.json"
                    ),
                f3p("config/elasticsearch.config")
            ],
            job_def="py36_amzn1_image",
            job_name=f"CrunchBaseElasticsearchTask-{_routine_id}",
            job_queue="HighPriority",
            region_name="eu-west-2",
            poll_time=10,
            memory=2048,
            max_live_jobs=100)
Example #7
    def requires(self):
        yield GeocodeBatchTask(
            _routine_id=self.routine_id,
            test=self.test,
            test_limit=None,
            db_config_env=self.db_config_env,
            city_col=Group.city,
            country_col=Group.country,
            country_is_iso2=True,
            env_files=[f3p("nesta/"),
                       f3p("config/mysqldb.config")],
            job_def="py36_amzn1_image",
            job_name=f"HealthMeetupGeocodeBatchTask-{self.routine_id}",
            job_queue="HighPriority",
            region_name="eu-west-2",
            poll_time=10,
            memory=4096,
            max_live_jobs=2)

        yield TopicDiscoveryTask(routine_id=self.routine_id,
                                 core_categories=self.core_categories,
                                 members_perc=self.members_perc,
                                 topic_perc=self.topic_perc,
                                 db_config_env=self.db_config_env,
                                 test=self.test)
Example #8
    def requires(self):
        yield ArxivESTask(routine_id=self.routine_id,
                          date=self.date,
                          grid_task_kwargs=self.grid_task_kwargs,
                          process_batch_size=10000,
                          drop_and_recreate=self.drop_and_recreate,
                          dataset='arxiv',
                          id_field=Article.id,
                          entity_type='article',
                          db_config_env='MYSQLDB',
                          test=self.test,
                          intermediate_bucket='nesta-production-intermediate',
                          batchable=f3p('batchables/arxiv/arxiv_elasticsearch'),
                          env_files=[
                              f3p('nesta/'),
                              f3p('config/mysqldb.config'),
                              f3p('schema_transformations/arxiv.json'),
                              f3p('config/elasticsearch.config')
                          ],
                          job_def='py36_amzn1_image',
                          job_name=self.routine_id,
                          job_queue='HighPriority',
                          region_name='eu-west-2',
                          memory=2048,
                          poll_time=10,
                          max_live_jobs=100)
Example #9
def kwarg_maker(dataset, routine_id):
    env_files = [
        f3p('config/mysqldb.config'),
        f3p('config/elasticsearch.config'),
        f3p('schema_transformations/eurito/'),
        f3p('nesta')
    ]
    batchable = f3p(f'batchables/eurito/{dataset}_eu')
    return dict(dataset=f'{dataset}-eu',
                routine_id=f'{dataset}-eu_{routine_id}',
                env_files=env_files,
                batchable=batchable)
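
A hedged usage sketch for kwarg_maker; the wrapping task and the remaining
batch arguments are hypothetical placeholders, standing in for tasks like
those in the other examples:

def requires(self):
    routine_id = f"{self.date}-{self.production}"
    for dataset in ('arxiv', 'patstat', 'companies'):
        # SomeBatchTask is a placeholder for a real batch task class
        yield SomeBatchTask(test=not self.production,
                            **kwarg_maker(dataset, routine_id))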
Example #10
    def requires(self):
        logging.getLogger().setLevel(logging.INFO)
        keys = {
            'companies': {
                'index': 'companies_v0',
                'score_field': 'metric_novelty_organisation',
                'fields': [
                    'textBody_descriptive_organisation',
                    'textBody_summary_organisation'
                ]
            },
            'patstat': {
                'index': 'patstat_v0',
                'score_field': 'metric_novelty_patent',
                'fields': ['textBody_abstract_patent']
            },
            'arxiv': {
                'index': 'arxiv_v0',
                'score_field': 'metric_novelty_article',
                'fields': ['textBody_abstract_article']
            }
        }

        for dataset, kwargs in keys.items():
            routine_id = f'Lol_{dataset}_{self.production}_{self.date}'
            yield EsLolveltyTask(
                date=self.date,
                routine_id=routine_id,
                origin_index=kwargs.pop('index'),
                test=not self.production,
                dataset=dataset,
                entity_type=None,
                kwargs=kwargs,
                intermediate_bucket='eurito-intermediate-batch',
                batchable=f3p("batchables/novelty"
                              "/lolvelty"),
                env_files=[
                    f3p("eurito_daps/"),
                    f3p("config/mysqldb.config"),
                    f3p("config/"
                        "elasticsearch.config")
                ],
                job_def="py36_amzn1_image",
                job_name=routine_id,
                job_queue="HighPriority",
                region_name="eu-west-1",
                poll_time=10,
                memory=1024,
                max_live_jobs=30)
Example #11
    def requires(self):
        batchable = f3p("batchables/cordis/cordis_api")
        env_files = [f3p("nesta"), f3p("config/mysqldb.config")]
        routine_id = f'Cordis-{self.date}-{self.production}'
        return CordisCollectTask(routine_id=routine_id,
                                 test=not self.production,
                                 batchable=batchable,
                                 env_files=env_files,
                                 job_def="py36_amzn1_image",
                                 job_name=f"Collect-{routine_id}",
                                 job_queue="HighPriority",
                                 region_name="eu-west-2",
                                 poll_time=10,
                                 memory=2048,
                                 max_live_jobs=20)
Example #12
class CordisCollectTask(AutoBatchTask):
    process_batch_size = luigi.IntParameter(default=500)
    intermediate_bucket = luigi.Parameter(default=S3BUCKET)
    db_config_path = luigi.Parameter(default=f3p('config/mysqldb.config'))
    db_config_env = luigi.Parameter(default='MYSQLDB')
    routine_id = luigi.Parameter()

    def output(self):
        '''Points to the output database engine'''
        db_conf = get_config(self.db_config_path, "mysqldb")
        db_conf["database"] = 'dev' if self.test else 'production'
        db_conf["table"] = "CordisCollect <dummy>"  # not a real table
        update_id = self.job_name
        return MySqlTarget(update_id=update_id, **db_conf)

    def prepare(self):
        if self.test:
            self.process_batch_size = 100
        # MySQL setup
        database = 'dev' if self.test else 'production'
        engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

        # Subtract off all done ids
        Base.metadata.create_all(engine)
        with db_session(engine) as session:
            result = session.query(Project.rcn).all()
            done_rcn = {r[0] for r in result}

        # Get all possible ids (or "RCN" in Cordis-speak)
        nrows = 1000 if self.test else None
        all_rcn = set(
            get_framework_ids('fp7', nrows=nrows) +
            get_framework_ids('h2020', nrows=nrows))
        all_rcn = all_rcn - done_rcn

        # Generate the job params
        batches = split_batches(all_rcn, self.process_batch_size)
        params = [{
            "batch_file": put_s3_batch(batch, self.intermediate_bucket,
                                       self.routine_id),
            "config": 'mysqldb.config',
            "db_name": database,
            "bucket": self.intermediate_bucket,
            "outinfo": 'dummy',
            "done": False,
            'test': self.test
        } for batch in batches]
        return params

    def combine(self, job_params):
        self.output().touch()
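
CordisCollectTask.prepare leans on two helpers, split_batches and
put_s3_batch. A minimal sketch of the batching helper, assuming it simply
chunks an iterable into lists of at most process_batch_size items
(hypothetical; the real helper lives in the nesta repository):

def split_batches(iterable, batch_size):
    """Hypothetical sketch: yield successive lists of at most
    `batch_size` items from `iterable`."""
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # emit any remainder
        yield batch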
Example #13
    def requires(self):
        '''Collects the database configurations
        and executes the central task.'''
        logging.getLogger().setLevel(logging.INFO)
        yield CollectTask(
            date=self.date,
            _routine_id=self._routine_id,
            db_config_path=self.db_config_path,
            batchable=f3p("batchables/health_data/nih_collect_data"),
            env_files=[f3p("nesta/"),
                       f3p("/core/config/mysqldb.config")],
            job_def=self.job_def,
            job_name="CollectTask-%s" % self._routine_id,
            job_queue=self.job_queue,
            region_name=self.region_name,
            poll_time=10,
            test=self.test,
            memory=2048,
            max_live_jobs=2)
Example #14
    def requires(self):
        yield AbstractsMeshTask(date=self.date,
                                drop_and_recreate=self.drop_and_recreate,
                                _routine_id=self.routine_id,
                                db_config_path=self.db_config_path,
                                test=self.test,
                                batchable=f3p("batchables/health_data/"
                                              "nih_abstract_mesh_data"),
                                env_files=[
                                    f3p("nesta/"),
                                    f3p("config/mysqldb.config"),
                                    f3p("config/elasticsearch.config"),
                                    f3p("nih.json")
                                ],
                                job_def=self.job_def,
                                job_name="AbstractsMeshTask-%s" % self.routine_id,
                                job_queue=self.job_queue,
                                region_name=self.region_name,
                                poll_time=self.poll_time,
                                memory=self.memory,
                                max_live_jobs=50)
Example #15
    def requires(self):
        '''Get the output from the batchtask'''
        _routine_id = "{}-{}".format(self.date, self.production)
        logging.getLogger().setLevel(logging.INFO)
        return TextVectors(
            date=self.date,
            batchable="~/nesta/nesta/core/batchables/embed_topics/",
            test=not self.production,
            db_config_env="MYSQLDB",
            process_batch_size=self.process_batch_size,
            intermediate_bucket="nesta-production-intermediate",
            job_def="py36_amzn1_image",
            job_name="text2vectors-%s" % self.date,
            job_queue="HighPriority",
            region_name="eu-west-2",
            env_files=[f3p("nesta/nesta/"),
                       f3p("config/mysqldb.config")],
            routine_id=_routine_id,
            poll_time=10,
            memory=4096,
            max_live_jobs=5)
Example #16
    def requires(self):
        '''Collects the configurations and executes the previous task.'''
        logging.getLogger().setLevel(logging.INFO)
        yield ProcessTask(
            date=self.date,
            drop_and_recreate=self.drop_and_recreate,
            _routine_id=self._routine_id,
            db_config_path=self.db_config_path,
            batchable=f3p("batchables/health_data/nih_process_data"),
            env_files=[
                f3p("nesta/"),
                f3p("config/mysqldb.config"),
                f3p("config/elasticsearch.config"),
                f3p("nih.json")
            ],
            job_def=self.job_def,
            job_name="ProcessTask-%s" % self._routine_id,
            job_queue=self.job_queue,
            region_name=self.region_name,
            poll_time=10,
            test=self.test,
            memory=2048,
            max_live_jobs=2)
Example #17
    def requires(self):
        logging.getLogger().setLevel(logging.INFO)
        routine_id = (
            f"{self.date}-{'--'.join(self.core_categories)}"
            f"-{self.members_perc}-{self.topic_perc}-{self.production}")
        yield MeetupHealthSql2EsTask(
            routine_id=routine_id,
            date=self.date,
            process_batch_size=100,
            drop_and_recreate=self.drop_and_recreate,
            aliases='health_scanner',
            dataset='meetup',
            id_field=Group.id,
            entity_type='meetup',
            core_categories=self.core_categories,
            members_perc=self.members_perc,
            topic_perc=self.topic_perc,
            db_config_env=self.db_config_env,
            test=not self.production,
            intermediate_bucket='nesta-production-intermediate',
            batchable=f3p("batchables/meetup/topic_tag_elasticsearch"),
            env_files=[
                f3p("nesta/"),
                f3p("config/mysqldb.config"),
                f3p("schema_transformations/meetup.json"),
                f3p("config/elasticsearch.config")
            ],
            job_def="py36_amzn1_image",
            job_name=f"MeetupHealthSql2EsTask-{routine_id}",
            job_queue="MinimalCpus",
            region_name="eu-west-2",
            poll_time=10,
            memory=2048,
            vcpus=2,
            max_live_jobs=100,
            kwargs={"members_perc": self.members_perc})
Example #18
    def requires(self):
        """Collects the database configurations and executes the central task."""
        _routine_id = "{}-{}".format(self.date, self.production)

        logging.getLogger().setLevel(logging.INFO)
        yield MyBatchTaskWhichNeedsAName(
            date=self.date,
            _routine_id=_routine_id,
            test=not self.production,
            db_config_env="MYSQLDB",
            batch_size=self.batch_size,  # example parameter
            start_string=self.start_string,  # example parameter
            intermediate_bucket="nesta-production-intermediate",
            batchable=f3p("batchables/examples/template_batchable"),  # folder name
            env_files=[f3p("nesta/nesta/"),
                       f3p("config/mysqldb.config")],
            job_def="py36_amzn1_image",
            job_name=f"MyBatchTaskWhichNeedsAName-{_routine_id}",
            job_queue="LowPriority",
            region_name="eu-west-2",
            poll_time=10,
            memory=2048,
            max_live_jobs=10)
Example #19
    def requires(self):
        logging.getLogger().setLevel(logging.INFO)
        test = not self.production
        routine_id = f"ArxivLolveltyTask-{self.date}-{test}"
        index = 'arxiv_v1' if self.production else 'arxiv_dev'
        kwargs = {}  # NOTE: `kwargs` was undefined in the original snippet; populate as required
        return ArxivESTokenTask(routine_id=routine_id,
                                test=test,
                                index=index,
                                dataset='arxiv',
                                entity_type='article',
                                kwargs=kwargs,
                                batchable=f3p("batchables/arxiv/arxiv_es_tokens"),
                                env_files=[f3p("nesta/"),
                                           f3p("config/mysqldb.config"),
                                           f3p("config/elasticsearch.config")],
                                job_def="py36_amzn1_image",
                                job_name=routine_id,
                                job_queue="HighPriority",
                                region_name="eu-west-2",
                                poll_time=10,
                                memory=1024,
                                max_live_jobs=30)
Example #20
    def requires(self):
        '''Collects the database configurations
        and executes the central task.'''
        _routine_id = "{}-{}".format(self.date, self.production)
        grid_task_kwargs = {
            '_routine_id': _routine_id,
            'db_config_path': self.db_config_path,
            'db_config_env': 'MYSQLDB',
            'mag_config_path': 'mag.config',
            'test': not self.production,
            'insert_batch_size': self.insert_batch_size,
            'articles_from_date': self.articles_from_date,
            'date': self.date,
        }

        cherry_picked = (f'automl/{self.date}/COREX_TOPIC_MODEL'
                         '.n_hidden_27-0.VECTORIZER.binary_True'
                         f'.min_df_0-001.NGRAM.TEST_False.json')
        if not self.production:
            cherry_picked = (f'automl/{self.date}/COREX_TOPIC_MODEL'
                             '.n_hidden_36-0.VECTORIZER.binary_True'
                             '.min_df_0-001.NGRAM.TEST_True.json')

        kwargs = {
            'score_field': 'metric_novelty_article',
            'fields': ['textBody_abstract_article']
        }
        test = not self.production
        routine_id = f"ArxivLolveltyTask-{self.date}-{test}"

        # Elasticsearch setup
        dataset = 'arxiv'
        _, es_config = setup_es('prod' if self.production else 'dev',
                                not self.production,
                                self.drop_and_recreate,
                                dataset=dataset)
        yield ArxivElasticsearchTask(date=self.date,
                                     routine_id=routine_id,
                                     grid_task_kwargs=grid_task_kwargs,
                                     test=not self.production,
                                     index=es_config['index'],
                                     dataset='arxiv',
                                     entity_type='article',
                                     kwargs=kwargs,
                                     batchable=f3p("batchables/novelty"
                                                   "/lolvelty"),
                                     env_files=[
                                         f3p("nesta/"),
                                         f3p("config/mysqldb.config"),
                                         f3p("config/"
                                             "elasticsearch.config")
                                     ],
                                     job_def="py36_amzn1_image",
                                     job_name=routine_id,
                                     job_queue="HighPriority",
                                     region_name="eu-west-2",
                                     poll_time=10,
                                     memory=1024,
                                     max_live_jobs=30)

        yield AnalysisTask(date=self.date,
                           grid_task_kwargs=grid_task_kwargs,
                           _routine_id=_routine_id,
                           db_config_path=self.db_config_path,
                           db_config_env='MYSQLDB',
                           mag_config_path='mag.config',
                           test=not self.production,
                           insert_batch_size=self.insert_batch_size,
                           articles_from_date=self.articles_from_date,
                           cherry_picked=cherry_picked)
Example #21
from datetime import datetime as dt

import matplotlib

from nesta.packages.arxiv import deepchange_analysis as dc
from nesta.core.luigihacks.misctools import find_filepath_from_pathstub as f3p
from nesta.core.luigihacks.misctools import get_config
from nesta.core.luigihacks import mysqldb
from nesta.core.luigihacks.luigi_logging import set_log_level
from nesta.core.orms.orm_utils import get_mysql_engine
from nesta.core.routines.arxiv.arxiv_topic_tasks import WriteTopicTask
from nesta.core.luigihacks.parameter import DictParameterPlus

matplotlib.rcParams['figure.figsize'] = (20, 13)
matplotlib.rcParams.update({'font.size': 25, "axes.labelpad": 10})

ORDERED_QUERIES = [
    f3p(x) for x in ('arxlive1_filter_cats.sql', 'arxlive2_join_insts.sql',
                     'arxlive3_group_cats.sql', 'arxlive4_read_final.sql')
]
YEAR_THRESHOLD = 2012
MIN_RCA_YEAR = 2007  # minimum year when calculating rca pre 2012
N_TOP = 15  # number of countries / cities / categories to show
COLOR_A = '#631607'
COLOR_B = '#d68b7a'
STATIC_FILES_BUCKET = 'arxlive-static-files'


def sql_queries():
    for i, filepath in enumerate(ORDERED_QUERIES):
        is_last = bool(i + 1 == len(ORDERED_QUERIES))
        with open(filepath) as f:
            query = f.read()
        yield query, is_last  # assumed completion: the original snippet is truncated here
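
A sketch of how these queries might then be consumed, assuming each statement
is executed in order against the MySQL engine and only the final query returns
rows to analyse (this driver is illustrative, not the repository's own code):

engine = get_mysql_engine('MYSQLDB', 'mysqldb', 'production')
with engine.connect() as conn:
    for query, is_last in sql_queries():
        result = conn.execute(query)
        if is_last:
            rows = result.fetchall()  # only the last query yields a result set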