def requires(self):
    logging.getLogger().setLevel(logging.INFO)
    kwargs = {'score_field': 'rank_rhodonite_group',
              'fields': ['name_of_group', 'textBody_descriptive_group',
                         'terms_topics_group']}
    test = not self.production
    routine_id = f"MeetupLolveltyTask-{self.date}-{test}"
    index = self.index if self.production else 'meetup_dev'
    assert index is not None
    return LazyElasticsearchTask(routine_id=routine_id,
                                 test=test,
                                 index=index,
                                 dataset='meetup',
                                 entity_type='meetup',
                                 kwargs=kwargs,
                                 batchable=f3p("batchables/novelty/lolvelty"),
                                 env_files=[f3p("nesta/"),
                                            f3p("config/mysqldb.config"),
                                            f3p("config/elasticsearch.config")],
                                 job_def="py36_amzn1_image",
                                 job_name=routine_id,
                                 job_queue="HighPriority",
                                 region_name="eu-west-2",
                                 poll_time=10,
                                 memory=1024,
                                 max_live_jobs=10)
def requires(self):
    '''Collects the database configurations and executes the central task.'''
    _routine_id = "{}-{}".format(self.date, self.production)
    logging.getLogger().setLevel(logging.INFO)
    yield ProcessTask(date=self.date,
                      drop_and_recreate=self.drop_and_recreate,
                      _routine_id=_routine_id,
                      db_config_path=self.db_config_path,
                      batchable=f3p("batchables/health_data/nih_process_data"),
                      env_files=[f3p("nesta/"),
                                 f3p("config/mysqldb.config"),
                                 f3p("config/elasticsearch.config"),
                                 f3p("nih.json")],
                      job_def="py36_amzn1_image",
                      job_name="ProcessTask-%s" % _routine_id,
                      job_queue="HighPriority",
                      region_name="eu-west-2",
                      poll_time=10,
                      test=not self.production,
                      memory=2048,
                      max_live_jobs=2)
def requires(self):
    logging.getLogger().setLevel(logging.INFO)
    kwargs = {'score_field': 'metric_novelty_article',
              'fields': ['textBody_abstract_article']}
    test = not self.production
    routine_id = f"ArxivLolveltyTask-{self.date}-{test}"
    index = 'arxiv_v3' if self.production else 'arxiv_dev'
    return _ArxivElasticsearchTask(routine_id=routine_id,
                                   test=test,
                                   index=index,
                                   dataset='arxiv',
                                   entity_type='article',
                                   kwargs=kwargs,
                                   batchable=f3p("batchables/novelty/lolvelty"),
                                   env_files=[f3p("nesta/"),
                                              f3p("config/mysqldb.config"),
                                              f3p("config/elasticsearch.config")],
                                   job_def="py36_amzn1_image",
                                   job_name=routine_id,
                                   job_queue="HighPriority",
                                   region_name="eu-west-2",
                                   poll_time=10,
                                   memory=1024,
                                   max_live_jobs=30)
def requires(self):
    '''Collects the database configurations and executes the central task.'''
    _routine_id = "{}-{}".format(self.date, self.production)
    logging.getLogger().setLevel(logging.INFO)
    yield DedupeTask(date=self.date,
                     drop_and_recreate=self.drop_and_recreate,
                     routine_id=_routine_id,
                     db_config_path=self.db_config_path,
                     process_batch_size=5000,
                     intermediate_bucket='nesta-production-intermediate',
                     test=(not self.production),
                     batchable=f3p("batchables/health_data/nih_dedupe"),
                     env_files=[f3p("nesta/"),
                                f3p("config/mysqldb.config"),
                                f3p("config/elasticsearch.config"),
                                f3p("nih.json")],
                     job_def="py36_amzn1_image",
                     job_name="NiHDedupeTask-%s" % _routine_id,
                     job_queue="HighPriority",
                     region_name="eu-west-2",
                     poll_time=10,
                     memory=1024,
                     max_live_jobs=20)
def requires(self):
    '''Collects the database configurations and executes the central task.'''
    routine_id = "ArxivESTask-{}-{}".format(self.date, self.production)
    logging.getLogger().setLevel(logging.INFO)
    yield Sql2EsTask(routine_id=routine_id,
                     date=self.date,
                     process_batch_size=10000,
                     drop_and_recreate=self.drop_and_recreate,
                     dataset='arxiv',
                     id_field=Article.id,
                     entity_type='article',
                     db_config_env='MYSQLDB',
                     test=not self.production,
                     intermediate_bucket='nesta-production-intermediate',
                     batchable=f3p('batchables/arxiv/arxiv_elasticsearch'),
                     env_files=[f3p('nesta/'),
                                f3p('config/mysqldb.config'),
                                f3p('schema_transformations/arxiv.json'),
                                f3p('config/elasticsearch.config')],
                     job_def='py36_amzn1_image',
                     job_name=routine_id,
                     job_queue='HighPriority',
                     region_name='eu-west-2',
                     memory=2048,
                     poll_time=10,
                     max_live_jobs=100)
def requires(self):
    '''Collects the database configurations and executes the central task.'''
    _routine_id = "{}-{}".format(self.date, self.production)
    logging.getLogger().setLevel(logging.INFO)
    yield CrunchbaseSql2EsTask(date=self.date,
                               _routine_id=_routine_id,
                               test=not self.production,
                               drop_and_recreate=self.drop_and_recreate,
                               db_config_env="MYSQLDB",
                               insert_batch_size=self.insert_batch_size,
                               process_batch_size=50000,
                               intermediate_bucket='nesta-production-intermediate',
                               batchable=f3p("core/batchables/crunchbase/crunchbase_elasticsearch"),
                               env_files=[f3p("nesta/"),
                                          f3p("config/mysqldb.config"),
                                          f3p("schema_transformations/crunchbase_organisation_members.json"),
                                          f3p("config/elasticsearch.config")],
                               job_def="py36_amzn1_image",
                               job_name=f"CrunchBaseElasticsearchTask-{_routine_id}",
                               job_queue="HighPriority",
                               region_name="eu-west-2",
                               poll_time=10,
                               memory=2048,
                               max_live_jobs=100)
def requires(self):
    yield GeocodeBatchTask(_routine_id=self.routine_id,
                           test=self.test,
                           test_limit=None,
                           db_config_env=self.db_config_env,
                           city_col=Group.city,
                           country_col=Group.country,
                           country_is_iso2=True,
                           env_files=[f3p("nesta/"),
                                      f3p("config/mysqldb.config")],
                           job_def="py36_amzn1_image",
                           job_name=f"HealthMeetupGeocodeBatchTask-{self.routine_id}",
                           job_queue="HighPriority",
                           region_name="eu-west-2",
                           poll_time=10,
                           memory=4096,
                           max_live_jobs=2)
    yield TopicDiscoveryTask(routine_id=self.routine_id,
                             core_categories=self.core_categories,
                             members_perc=self.members_perc,
                             topic_perc=self.topic_perc,
                             db_config_env=self.db_config_env,
                             test=self.test)
def requires(self):
    yield ArxivESTask(routine_id=self.routine_id,
                      date=self.date,
                      grid_task_kwargs=self.grid_task_kwargs,
                      process_batch_size=10000,
                      drop_and_recreate=self.drop_and_recreate,
                      dataset='arxiv',
                      id_field=Article.id,
                      entity_type='article',
                      db_config_env='MYSQLDB',
                      test=self.test,
                      intermediate_bucket='nesta-production-intermediate',
                      batchable=f3p('batchables/arxiv/arxiv_elasticsearch'),
                      env_files=[f3p('nesta/'),
                                 f3p('config/mysqldb.config'),
                                 f3p('schema_transformations/arxiv.json'),
                                 f3p('config/elasticsearch.config')],
                      job_def='py36_amzn1_image',
                      job_name=self.routine_id,
                      job_queue='HighPriority',
                      region_name='eu-west-2',
                      memory=2048,
                      poll_time=10,
                      max_live_jobs=100)
def kwarg_maker(dataset, routine_id):
    '''Build the env_files/batchable kwargs shared by each EURITO dataset task.'''
    env_files = [f3p('config/mysqldb.config'),
                 f3p('config/elasticsearch.config'),
                 f3p('schema_transformations/eurito/'),
                 f3p('nesta')]
    batchable = f3p(f'batchables/eurito/{dataset}_eu')
    return dict(dataset=f'{dataset}-eu',
                routine_id=f'{dataset}-eu_{routine_id}',
                env_files=env_files,
                batchable=batchable)
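# Illustrative usage of kwarg_maker (the argument values are assumed for the
# example, and the f3p paths resolve only inside a repo checkout). It
# centralises the env_files/batchable boilerplate shared by per-dataset tasks:
#
#   >>> kwarg_maker('arxiv', '2020-01-01-True')
#   {'dataset': 'arxiv-eu',
#    'routine_id': 'arxiv-eu_2020-01-01-True',
#    'env_files': [...],  # resolved config and schema paths
#    'batchable': '.../batchables/eurito/arxiv_eu'}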
def requires(self):
    logging.getLogger().setLevel(logging.INFO)
    keys = {'companies': {'index': 'companies_v0',
                          'score_field': 'metric_novelty_organisation',
                          'fields': ['textBody_descriptive_organisation',
                                     'textBody_summary_organisation']},
            'patstat': {'index': 'patstat_v0',
                        'score_field': 'metric_novelty_patent',
                        'fields': ['textBody_abstract_patent']},
            'arxiv': {'index': 'arxiv_v0',
                      'score_field': 'metric_novelty_article',
                      'fields': ['textBody_abstract_article']}}
    for dataset, kwargs in keys.items():
        routine_id = f'Lol_{dataset}_{self.production}_{self.date}'
        yield EsLolveltyTask(date=self.date,
                             routine_id=routine_id,
                             origin_index=kwargs.pop('index'),
                             test=not self.production,
                             dataset=dataset,
                             entity_type=None,
                             kwargs=kwargs,
                             intermediate_bucket='eurito-intermediate-batch',
                             batchable=f3p("batchables/novelty/lolvelty"),
                             env_files=[f3p("eurito_daps/"),
                                        f3p("config/mysqldb.config"),
                                        f3p("config/elasticsearch.config")],
                             job_def="py36_amzn1_image",
                             job_name=routine_id,
                             job_queue="HighPriority",
                             region_name="eu-west-1",
                             poll_time=10,
                             memory=1024,
                             max_live_jobs=30)
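# Note on the loop above: kwargs.pop('index') removes 'index' from each
# dataset's dict in place, so only 'score_field' and 'fields' are forwarded
# to the batchable. For example, after the pop for the 'arxiv' entry:
#   kwargs == {'score_field': 'metric_novelty_article',
#              'fields': ['textBody_abstract_article']}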
def requires(self):
    batchable = f3p("batchables/cordis/cordis_api")
    env_files = [f3p("nesta"), f3p("config/mysqldb.config")]
    routine_id = f'Cordis-{self.date}-{self.production}'
    return CordisCollectTask(routine_id=routine_id,
                             test=not self.production,
                             batchable=batchable,
                             env_files=env_files,
                             job_def="py36_amzn1_image",
                             job_name=f"Collect-{routine_id}",
                             job_queue="HighPriority",
                             region_name="eu-west-2",
                             poll_time=10,
                             memory=2048,
                             max_live_jobs=20)
class CordisCollectTask(AutoBatchTask):
    process_batch_size = luigi.IntParameter(default=500)
    intermediate_bucket = luigi.Parameter(default=S3BUCKET)
    db_config_path = luigi.Parameter(default=f3p('config/mysqldb.config'))
    db_config_env = luigi.Parameter(default='MYSQLDB')
    routine_id = luigi.Parameter()

    def output(self):
        '''Points to the output database engine'''
        db_conf = get_config(self.db_config_path, "mysqldb")
        db_conf["database"] = 'dev' if self.test else 'production'
        db_conf["table"] = "CordisCollect <dummy>"  # not a real table
        update_id = self.job_name
        return MySqlTarget(update_id=update_id, **db_conf)

    def prepare(self):
        if self.test:
            self.process_batch_size = 100
        # MySQL setup
        database = 'dev' if self.test else 'production'
        engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        # Subtract off all done ids
        Base.metadata.create_all(engine)
        with db_session(engine) as session:
            result = session.query(Project.rcn).all()
            done_rcn = {r[0] for r in result}
        # Get all possible ids (or "RCN" in Cordis-speak)
        nrows = 1000 if self.test else None
        all_rcn = set(get_framework_ids('fp7', nrows=nrows)
                      + get_framework_ids('h2020', nrows=nrows))
        all_rcn = all_rcn - done_rcn
        # Generate the job params
        batches = split_batches(all_rcn, self.process_batch_size)
        params = [{"batch_file": put_s3_batch(batch,
                                              self.intermediate_bucket,
                                              self.routine_id),
                   "config": 'mysqldb.config',
                   "db_name": database,
                   "bucket": self.intermediate_bucket,
                   "outinfo": 'dummy',
                   "done": False,
                   'test': self.test}
                  for batch in batches]
        return params

    def combine(self, job_params):
        self.output().touch()
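# A minimal sketch of the batching step used in prepare() above, assuming
# split_batches simply chunks an iterable into lists of at most batch_size
# items. The chunked() helper below is illustrative, not the nesta
# implementation.
def chunked(iterable, batch_size):
    '''Yield successive lists of at most batch_size items.'''
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # don't drop the trailing partial batch
        yield batch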
def requires(self):
    '''Collects the database configurations and executes the central task.'''
    logging.getLogger().setLevel(logging.INFO)
    yield CollectTask(date=self.date,
                      _routine_id=self._routine_id,
                      db_config_path=self.db_config_path,
                      batchable=f3p("batchables/health_data/nih_collect_data"),
                      env_files=[f3p("nesta/"),
                                 f3p("config/mysqldb.config")],
                      job_def=self.job_def,
                      job_name="CollectTask-%s" % self._routine_id,
                      job_queue=self.job_queue,
                      region_name=self.region_name,
                      poll_time=10,
                      test=self.test,
                      memory=2048,
                      max_live_jobs=2)
def requires(self):
    yield AbstractsMeshTask(date=self.date,
                            drop_and_recreate=self.drop_and_recreate,
                            _routine_id=self.routine_id,
                            db_config_path=self.db_config_path,
                            test=self.test,
                            batchable=f3p("batchables/health_data/nih_abstract_mesh_data"),
                            env_files=[f3p("nesta/"),
                                       f3p("config/mysqldb.config"),
                                       f3p("config/elasticsearch.config"),
                                       f3p("nih.json")],
                            job_def=self.job_def,
                            job_name="AbstractsMeshTask-%s" % self.routine_id,
                            job_queue=self.job_queue,
                            region_name=self.region_name,
                            poll_time=self.poll_time,
                            memory=self.memory,
                            max_live_jobs=50)
def requires(self):
    '''Get the output from the batchtask'''
    _routine_id = "{}-{}".format(self.date, self.production)
    logging.getLogger().setLevel(logging.INFO)
    return TextVectors(date=self.date,
                       batchable="~/nesta/nesta/core/batchables/embed_topics/",
                       test=not self.production,
                       db_config_env="MYSQLDB",
                       process_batch_size=self.process_batch_size,
                       intermediate_bucket="nesta-production-intermediate",
                       job_def="py36_amzn1_image",
                       job_name="text2vectors-%s" % self.date,
                       job_queue="HighPriority",
                       region_name="eu-west-2",
                       env_files=[f3p("nesta/nesta/"),
                                  f3p("config/mysqldb.config")],
                       routine_id=_routine_id,
                       poll_time=10,
                       memory=4096,
                       max_live_jobs=5)
def requires(self):
    '''Collects the configurations and executes the previous task.'''
    logging.getLogger().setLevel(logging.INFO)
    yield ProcessTask(date=self.date,
                      drop_and_recreate=self.drop_and_recreate,
                      _routine_id=self._routine_id,
                      db_config_path=self.db_config_path,
                      batchable=f3p("batchables/health_data/nih_process_data"),
                      env_files=[f3p("nesta/"),
                                 f3p("config/mysqldb.config"),
                                 f3p("config/elasticsearch.config"),
                                 f3p("nih.json")],
                      job_def=self.job_def,
                      job_name="ProcessTask-%s" % self._routine_id,
                      job_queue=self.job_queue,
                      region_name=self.region_name,
                      poll_time=10,
                      test=self.test,
                      memory=2048,
                      max_live_jobs=2)
def requires(self):
    logging.getLogger().setLevel(logging.INFO)
    routine_id = (f"{self.date}-{'--'.join(self.core_categories)}"
                  f"-{self.members_perc}-{self.topic_perc}-{self.production}")
    yield MeetupHealthSql2EsTask(routine_id=routine_id,
                                 date=self.date,
                                 process_batch_size=100,
                                 drop_and_recreate=self.drop_and_recreate,
                                 aliases='health_scanner',
                                 dataset='meetup',
                                 id_field=Group.id,
                                 entity_type='meetup',
                                 core_categories=self.core_categories,
                                 members_perc=self.members_perc,
                                 topic_perc=self.topic_perc,
                                 db_config_env=self.db_config_env,
                                 test=not self.production,
                                 intermediate_bucket='nesta-production-intermediate',
                                 batchable=f3p("batchables/meetup/topic_tag_elasticsearch"),
                                 env_files=[f3p("nesta/"),
                                            f3p("config/mysqldb.config"),
                                            f3p("schema_transformations/meetup.json"),
                                            f3p("config/elasticsearch.config")],
                                 job_def="py36_amzn1_image",
                                 job_name=f"MeetupHealthSql2EsTask-{routine_id}",
                                 job_queue="MinimalCpus",
                                 region_name="eu-west-2",
                                 poll_time=10,
                                 memory=2048,
                                 vcpus=2,
                                 max_live_jobs=100,
                                 kwargs={"members_perc": self.members_perc})
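# For reference, the routine_id built above flattens the task parameters into
# a single hyphenated string. With assumed example values (date=2020-01-01,
# core_categories=['community-environment', 'health-wellbeing'],
# members_perc=10, topic_perc=10, production=False) it would read:
#   2020-01-01-community-environment--health-wellbeing-10-10-False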
def requires(self): """Collects the database configurations and executes the central task.""" _routine_id = "{}-{}".format(self.date, self.production) logging.getLogger().setLevel(logging.INFO) yield MyBatchTaskWhichNeedsAName( date=self.date, _routine_id=_routine_id, test=not self.production, db_config_env="MYSQLDB", batch_size=self.batch_size, # example parameter start_string=self.start_string, # example parameter intermediate_bucket="nesta-production-intermediate", batchable=f3p( "batchables/examples/template_batchable"), # folder name env_files=[f3p("nesta/nesta/"), f3p("config/mysqldb.config")], job_def="py36_amzn1_image", job_name=f"MyBatchTaskWhichNeedsAName-{_routine_id}", job_queue="LowPriority", region_name="eu-west-2", poll_time=10, memory=2048, max_live_jobs=10)
def requires(self):
    logging.getLogger().setLevel(logging.INFO)
    test = not self.production
    routine_id = f"ArxivLolveltyTask-{self.date}-{test}"
    index = 'arxiv_v1' if self.production else 'arxiv_dev'
    kwargs = {}  # assumed placeholder: the original uses `kwargs` without defining it
    return ArxivESTokenTask(routine_id=routine_id,
                            test=test,
                            index=index,
                            dataset='arxiv',
                            entity_type='article',
                            kwargs=kwargs,
                            batchable=f3p("batchables/arxiv/arxiv_es_tokens"),
                            env_files=[f3p("nesta/"),
                                       f3p("config/mysqldb.config"),
                                       f3p("config/elasticsearch.config")],
                            job_def="py36_amzn1_image",
                            job_name=routine_id,
                            job_queue="HighPriority",
                            region_name="eu-west-2",
                            poll_time=10,
                            memory=1024,
                            max_live_jobs=30)
def requires(self):
    '''Collects the database configurations and executes the central task.'''
    _routine_id = "{}-{}".format(self.date, self.production)
    grid_task_kwargs = {'_routine_id': _routine_id,
                        'db_config_path': self.db_config_path,
                        'db_config_env': 'MYSQLDB',
                        'mag_config_path': 'mag.config',
                        'test': not self.production,
                        'insert_batch_size': self.insert_batch_size,
                        'articles_from_date': self.articles_from_date,
                        'date': self.date}
    cherry_picked = (f'automl/{self.date}/COREX_TOPIC_MODEL'
                     '.n_hidden_27-0.VECTORIZER.binary_True'
                     '.min_df_0-001.NGRAM.TEST_False.json')
    if not self.production:
        cherry_picked = (f'automl/{self.date}/COREX_TOPIC_MODEL'
                         '.n_hidden_36-0.VECTORIZER.binary_True'
                         '.min_df_0-001.NGRAM.TEST_True.json')
    kwargs = {'score_field': 'metric_novelty_article',
              'fields': ['textBody_abstract_article']}
    test = not self.production
    routine_id = f"ArxivLolveltyTask-{self.date}-{test}"
    # Elasticsearch setup
    dataset = 'arxiv'
    _, es_config = setup_es('prod' if self.production else 'dev',
                            not self.production,
                            self.drop_and_recreate,
                            dataset=dataset)
    yield ArxivElasticsearchTask(date=self.date,
                                 routine_id=routine_id,
                                 grid_task_kwargs=grid_task_kwargs,
                                 test=not self.production,
                                 index=es_config['index'],
                                 dataset='arxiv',
                                 entity_type='article',
                                 kwargs=kwargs,
                                 batchable=f3p("batchables/novelty/lolvelty"),
                                 env_files=[f3p("nesta/"),
                                            f3p("config/mysqldb.config"),
                                            f3p("config/elasticsearch.config")],
                                 job_def="py36_amzn1_image",
                                 job_name=routine_id,
                                 job_queue="HighPriority",
                                 region_name="eu-west-2",
                                 poll_time=10,
                                 memory=1024,
                                 max_live_jobs=30)
    yield AnalysisTask(date=self.date,
                       grid_task_kwargs=grid_task_kwargs,
                       _routine_id=_routine_id,
                       db_config_path=self.db_config_path,
                       db_config_env='MYSQLDB',
                       mag_config_path='mag.config',
                       test=not self.production,
                       insert_batch_size=self.insert_batch_size,
                       articles_from_date=self.articles_from_date,
                       cherry_picked=cherry_picked)
import matplotlib  # required by the rcParams configuration below
from datetime import datetime as dt

from nesta.packages.arxiv import deepchange_analysis as dc
from nesta.core.luigihacks.misctools import find_filepath_from_pathstub as f3p
from nesta.core.luigihacks.misctools import get_config
from nesta.core.luigihacks import mysqldb
from nesta.core.luigihacks.luigi_logging import set_log_level
from nesta.core.orms.orm_utils import get_mysql_engine
from nesta.core.routines.arxiv.arxiv_topic_tasks import WriteTopicTask
from nesta.core.luigihacks.parameter import DictParameterPlus

matplotlib.rcParams['figure.figsize'] = (20, 13)
matplotlib.rcParams.update({'font.size': 25, "axes.labelpad": 10})

ORDERED_QUERIES = [f3p(x) for x in ('arxlive1_filter_cats.sql',
                                    'arxlive2_join_insts.sql',
                                    'arxlive3_group_cats.sql',
                                    'arxlive4_read_final.sql')]
YEAR_THRESHOLD = 2012
MIN_RCA_YEAR = 2007  # minimum year when calculating rca pre 2012
N_TOP = 15  # number of countries / cities / categories to show
COLOR_A = '#631607'
COLOR_B = '#d68b7a'
STATIC_FILES_BUCKET = 'arxlive-static-files'


def sql_queries():
    '''Read each ordered arXlive query from disk, flagging the final one.'''
    for i, filepath in enumerate(ORDERED_QUERIES):
        is_last = bool(i + 1 == len(ORDERED_QUERIES))
        with open(filepath) as f:
            query = f.read()
        yield query, is_last  # yield assumed: the source is truncated here, and is_last is otherwise unused
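# Illustrative consumption of sql_queries() (not from the source): one way to
# run the ordered arXlive queries, assuming a SQLAlchemy engine obtained via
# get_mysql_engine and that only the final query's rows are needed downstream.
def run_ordered_queries(engine):
    '''Execute each arXlive query in order; return rows from the last one.'''
    rows = None
    with engine.connect() as conn:
        for query, is_last in sql_queries():
            result = conn.execute(query)
            if is_last:
                rows = result.fetchall()
    return rows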