def requires(self):
    logging.getLogger().setLevel(logging.INFO)
    db_config = misctools.get_config("mysqldb.config", "mysqldb")
    db_config["database"] = "production" if self.production else "dev"
    db_config["table"] = "worldbank_countries"
    variable_codes = ["SP.RUR.TOTL.ZS", "SP.URB.TOTL.IN.ZS",
                      "SP.POP.DPND", "SP.POP.TOTL",
                      "SP.DYN.LE00.IN", "SP.DYN.IMRT.IN",
                      "BAR.NOED.25UP.ZS", "BAR.TER.CMPT.25UP.ZS",
                      "NYGDPMKTPSAKD",
                      "SI.POV.NAHC", "SI.POV.GINI"]
    # Truncate to stay within AWS Batch's 128-character job-name limit
    job_name = (f"Worldbank-{self.date}-"
                f"{'_'.join(variable_codes).replace('.', '_')}-"
                f"{self.production}")[0:120]
    yield WorldbankTask(date=self.date,
                        db_config=db_config,
                        variable_codes=variable_codes,
                        batchable=find_filepath_from_pathstub(
                            "core/batchables/collect_worldbank/"),
                        env_files=[find_filepath_from_pathstub("/nesta/nesta"),
                                   find_filepath_from_pathstub("/config/mysqldb.config")],
                        job_def="py36_amzn1_image",
                        job_name=job_name,
                        job_queue="HighPriority",
                        region_name="eu-west-2",
                        poll_time=10,
                        max_live_jobs=200,
                        test=(not self.production))
def requires(self):
    '''Collects the database configurations and executes the central task.'''
    logging.getLogger().setLevel(logging.INFO)
    yield InvestorGeocodeTask(date=self.date,
                              _routine_id=self._routine_id,
                              test=self.test,
                              db_config_env="MYSQLDB",
                              city_col=Investor.city,
                              country_col=Investor.country,
                              location_key_col=Investor.location_id,
                              insert_batch_size=self.insert_batch_size,
                              env_files=[find_filepath_from_pathstub("nesta/nesta/"),
                                         find_filepath_from_pathstub("config/mysqldb.config"),
                                         find_filepath_from_pathstub("config/crunchbase.config")],
                              job_def="py36_amzn1_image",
                              job_name=f"CrunchBaseInvestorGeocodeTask-{self._routine_id}",
                              job_queue="HighPriority",
                              region_name="eu-west-2",
                              poll_time=10,
                              memory=4096,
                              max_live_jobs=2)
def requires(self):
    '''Collects the database configurations and executes the central task.'''
    logging.getLogger().setLevel(logging.INFO)
    _routine_id = f"{self.date}-{self.iso2}-{self.category}-{self.production}"
    engine = get_mysql_engine("MYSQLDB", "mysqldb",
                              "production" if self.production else "dev")
    Base.metadata.create_all(engine)
    yield GroupDetailsTask(iso2=self.iso2,
                           category=self.category,
                           _routine_id=_routine_id,
                           batchable=BATCHABLE.format("group_details"),
                           env_files=[find_filepath_from_pathstub("/nesta/nesta"),
                                      find_filepath_from_pathstub("/config/mysqldb.config")],
                           job_def="py36_amzn1_image",
                           job_name="GroupDetails-%s" % _routine_id,
                           job_queue="HighPriority",
                           region_name="eu-west-2",
                           poll_time=10,
                           max_live_jobs=100,
                           test=(not self.production))
def requires(self):
    '''Collects the database configurations and executes the central task.'''
    yield GtrTask(date=self.date,
                  page_size=self.page_size,
                  batchable=find_filepath_from_pathstub("core/batchables/gtr/collect_gtr"),
                  env_files=[find_filepath_from_pathstub("/nesta/nesta"),
                             find_filepath_from_pathstub("/config/mysqldb.config")],
                  job_def="py36_amzn1_image",
                  job_name=f"GtR-{self.date}-{self.page_size}-{not self.test}",
                  job_queue="HighPriority",
                  region_name="eu-west-2",
                  poll_time=10,
                  test=self.test)
def expand_pathstub(pathstub):
    """Expand the pathstub.

    Args:
        pathstub (list or str): A pathstub or list of pathstubs to expand.
    Returns:
        fullpath (list or str): A fullpath or list of fullpaths.
    """
    # Expand from a list...
    if type(pathstub) is list:
        return [find_filepath_from_pathstub(_v) for _v in pathstub]
    # ...or from a string (assumed)
    else:
        return find_filepath_from_pathstub(pathstub)
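# A minimal usage sketch for expand_pathstub, reusing the env_files stubs that
# appear in the requires() methods above; it assumes these stubs resolve to
# real paths inside the repository:
env_files = expand_pathstub(["nesta/nesta/", "config/mysqldb.config"])  # list in, list out
config_path = expand_pathstub("config/mysqldb.config")  # str in, str out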
def test_aliases(self):
    """Assert consistency between the aliases and schemas"""
    top_dir = find_filepath_from_pathstub("core/orms")
    all_fields = {}
    for filename in os.listdir(top_dir):
        if not filename.endswith(ES_CONF_SUFFIX):
            continue
        dataset = filename.replace(ES_CONF_SUFFIX, "")
        filename = os.path.join(top_dir, filename)
        with open(filename) as f:
            data = json.load(f)
        print(f'Found {filename}')
        fields = data["mappings"]["_doc"]["properties"].keys()
        all_fields[dataset] = fields
    cwd = os.path.dirname(__file__)
    path = os.path.join(cwd, '../aliases/')
    for filename in os.listdir(path):
        if not filename.endswith(".json"):
            continue
        filename = os.path.join(path, filename)
        for alias, dataset, field in alias_info(filename):
            print("\t", alias, dataset, field)
            assert dataset in all_fields
            assert field in all_fields[dataset]
def requires(self):
    yield OrgGeocodeTask(date=self.date,
                         _routine_id=self._routine_id,
                         test=self.test,
                         db_config_env="MYSQLDB",
                         city_col=Organization.city,
                         country_col=Organization.country,
                         location_key_col=Organization.location_id,
                         insert_batch_size=self.insert_batch_size,
                         env_files=[find_filepath_from_pathstub("nesta/nesta/"),
                                    find_filepath_from_pathstub("config/mysqldb.config")],
                         job_def="py36_amzn1_image",
                         job_name=f"CrunchBaseOrgGeocodeTask-{self._routine_id}",
                         job_queue="HighPriority",
                         region_name="eu-west-2",
                         poll_time=10,
                         memory=4096,
                         max_live_jobs=2)
def requires(self):
    '''Collects the database configurations and executes the central task.'''
    logging.getLogger().setLevel(logging.INFO)
    yield GtrTask(date=self.date,
                  page_size=self.page_size,
                  batchable=find_filepath_from_pathstub("core/batchables/gtr/collect_gtr"),
                  env_files=[find_filepath_from_pathstub("/nesta/nesta"),
                             find_filepath_from_pathstub("/config/mysqldb.config")],
                  job_def="py36_amzn1_image",
                  job_name=f"GtR-{self.date}-{self.page_size}-{self.production}",
                  # job_queue="HighPriority",
                  job_queue="MinimalCpus",
                  region_name="eu-west-2",
                  vcpus=2,
                  poll_time=10,
                  memory=2048,
                  max_live_jobs=50,
                  test=(not self.production))
def get_gss_codes(test=False):
    """Get all UK geography codes.

    Args:
        test (bool): If :code:`True`, only fetch a single batch of results.
    Returns:
        List of UK geography codes.
    """
    CONFIG = find_filepath_from_pathstub("fetch_geography_codes.sparql")
    n = 1 if test else None
    with open(CONFIG) as f:
        query = f.read().replace("\n", " ")
    # sparql_query yields batches of row dicts, so flatten them
    data = sparql_query(ENDPOINT, query, batch_limit=n)
    return [row["area_code"] for batch in data for row in batch]
def run():
    '''Gets the name and age of the muppet, and increments the age.
    The result is transferred to S3.'''
    # Get parameters for the batch job
    dataset = os.environ["BATCHPAR_dataset"]
    start_index = int(os.environ["BATCHPAR_start_index"])
    end_index = int(os.environ["BATCHPAR_end_index"])
    age_increment = int(os.environ["BATCHPAR_age_increment"])
    es_host = os.environ["BATCHPAR_outinfo"]
    es_port = os.environ["BATCHPAR_out_port"]
    es_index = os.environ["BATCHPAR_out_index"]
    es_type = os.environ["BATCHPAR_out_type"]
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # Get the input data and modify it based on the input parameters
    data = silo.get("example")[start_index:end_index]
    for row in data:
        row["age"] = row["age"] + age_increment

    # Connect to ES
    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
                                                 "example.json")
    strans_fpath = find_filepath_from_pathstub("tier_1/schema_transformations/"
                                               "example.json")
    strans_kwargs = {'filename': strans_fpath,
                     'from_key': 'tier_0',
                     'to_key': 'tier_1',
                     'ignore': ['id']}
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           field_null_mapping=field_null_mapping,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           country_detection=False,
                           caps_to_camel_case=True)
    # Offset the document id by the batch's start index
    for uid, row in enumerate(data, start=start_index):
        es.index(index=es_index, doc_type=es_type, id=uid, body=row)

    # Also upload the data to S3
    silo.put(data, dataset)
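# A hedged local-run sketch for the batchable above, following the same pattern
# as the Cordis __main__ blocks later in this section: populate the BATCHPAR_*
# parameters by hand, then call run(). All values are illustrative, and
# setting AWSBATCHTEST makes ElasticsearchPlus run with no_commit=True:
if __name__ == "__main__":
    os.environ["AWSBATCHTEST"] = "1"  # don't commit anything to the index
    os.environ["BATCHPAR_dataset"] = "example"
    os.environ["BATCHPAR_start_index"] = "0"
    os.environ["BATCHPAR_end_index"] = "10"
    os.environ["BATCHPAR_age_increment"] = "1"
    os.environ["BATCHPAR_outinfo"] = "localhost"  # hypothetical ES host
    os.environ["BATCHPAR_out_port"] = "9200"
    os.environ["BATCHPAR_out_index"] = "example_index"  # hypothetical index
    os.environ["BATCHPAR_out_type"] = "_doc"
    os.environ["BATCHPAR_entity_type"] = "muppet"
    os.environ["BATCHPAR_aws_auth_region"] = "eu-west-2"
    run()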
def requires(self):
    '''Collects the database configurations and executes the central task.'''
    logging.getLogger().setLevel(logging.INFO)
    yield NonOrgCollectTask(date=self.date,
                            _routine_id=self._routine_id,
                            test=self.test,
                            db_config_path=find_filepath_from_pathstub("mysqldb.config"),
                            insert_batch_size=self.insert_batch_size,
                            batchable=find_filepath_from_pathstub(
                                "batchables/crunchbase/crunchbase_collect"),
                            env_files=[find_filepath_from_pathstub("nesta/nesta/"),
                                       find_filepath_from_pathstub("config/mysqldb.config"),
                                       find_filepath_from_pathstub("config/crunchbase.config")],
                            job_def="py36_amzn1_image",
                            job_name=f"CrunchBaseNonOrgCollectTask-{self._routine_id}",
                            job_queue="HighPriority",
                            region_name="eu-west-2",
                            poll_time=10,
                            memory=4096,
                            max_live_jobs=20)
def load_json_from_pathstub(pathstub, filename, sort_on_load=True):
    """Basic wrapper around :obj:`find_filepath_from_pathstub`
    which also opens the file (assumed to be json).

    Args:
        pathstub (str): Stub of filepath where the file should be found.
        filename (str): The filename.
        sort_on_load (bool): Sort the json keys deterministically on load?
    Returns:
        The file contents as a json object.
    """
    _path = find_filepath_from_pathstub(pathstub)
    _path = os.path.join(_path, filename)
    with open(_path) as f:
        js = json.load(f)
    if sort_on_load:
        _js = json.dumps(js, sort_keys=True)
        js = json.loads(_js)
    return js
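# Usage, as seen in the muppet batchable above: the pathstub is resolved first,
# then the filename is joined onto it and parsed as json (keys sorted by default):
field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
                                             "example.json")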
# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('.'))
sys.path.insert(0, os.path.abspath('../../eurito_daps/'))
sys.path.insert(0, os.path.abspath('../../eurito_daps/core'))

from nesta.core.luigihacks.misctools import find_filepath_from_pathstub
try:
    find_filepath_from_pathstub('luigi.cfg')
except FileNotFoundError:
    # Write out minimal stub config files so that autodoc imports don't fail
    config_dir = 'core/config'
    os.makedirs(config_dir, exist_ok=True)
    with open(os.path.join(config_dir, 'luigi.cfg'), 'w') as f:
        f.write('[worker]\nx=1')
    with open(os.path.join(config_dir, 'mysqldb.config'), 'w') as f:
        f.write('[mysqldb]\nx=1')
    print(os.listdir(config_dir))

# -- Project information -----------------------------------------------------

project = 'eurito'
copyright = '2019, EURITO'
author = 'EURITO'
    for calls, project_calls in split_links(_calls, rcn):
        data['proposal_calls'].append(calls)
        data['project_proposal_calls'].append(project_calls)

    # Pipe the data to the db
    for table_prefix, rows in data.items():
        table_name = f'cordis_{table_prefix}'
        logging.info(table_name)
        _class = get_class_by_tablename(Base, table_name)
        insert_data(db_env, db_section, db_name, Base,
                    _class, rows, low_memory=True)


if __name__ == "__main__":
    set_log_level(True)
    if 'BATCHPAR_config' not in os.environ:
        from nesta.core.luigihacks.misctools import find_filepath_from_pathstub
        os.environ['BATCHPAR_batch_file'] = ('Cordis-2020-04-12-True-'
                                             '1586709686976328.json')
        os.environ['BATCHPAR_db_name'] = 'production'
        os.environ["BATCHPAR_config"] = find_filepath_from_pathstub('mysqldb.config')
        os.environ["BATCHPAR_bucket"] = 'nesta-production-intermediate'
    run()
def mock_response():
    test_file = find_filepath_from_pathstub('mocked_arxiv_response.xml')
    with open(test_file, mode='rb') as f:
        return f.read()
def test_find_filepath_from_pathstub(self):
    find_filepath_from_pathstub("nesta/packages")
    with self.assertRaises(FileNotFoundError):
        find_filepath_from_pathstub("nesta/package")
from nesta.core.orms.orm_utils import get_mysql_engine
from nesta.core.luigihacks import autobatch
from nesta.core.luigihacks.misctools import find_filepath_from_pathstub

import luigi
import datetime
import json
import time
import logging
from botocore.errorfactory import ClientError
import boto3
import os

# Define these globally since they are shared resources
# TODO: consider bundling this into a Singleton
S3 = boto3.resource('s3')
_BUCKET = S3.Bucket("nesta-production-intermediate")
DONE_KEYS = set(obj.key for obj in _BUCKET.objects.all())
BATCHABLE = os.path.join(find_filepath_from_pathstub("core/batchables/meetup"),
                         "{}")


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]


class CountryGroupsTask(autobatch.AutoBatchTask):
    '''Extract all groups with corresponding category for this country.

    Args:
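# Referring back to chunks() above, a quick sketch of its behaviour on a toy list:
list(chunks([1, 2, 3, 4, 5], 2))  # -> [[1, 2], [3, 4], [5]]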
class GeocodeBatchTask(AutoBatchTask):
    """Appends various geographic codes to the geographic_data table using the
    `city` and `country` from the input table: lat/long, iso codes, continent.

    To implement this task, only the `output` and `combine` methods need to be
    defined when it is subclassed.

    Args:
        test (bool): in test or production mode
        db_config_env (str): environmental variable pointing to the db config file
        city_col (:obj:`sqlalchemy.Column`): column containing the city
        country_col (:obj:`sqlalchemy.Column`): column containing the full name of the country
        location_key_col (:obj:`sqlalchemy.Column`): column containing the generated composite key
        batch_size (int): number of locations to geocode in a batch
        intermediate_bucket (str): s3 bucket where the batch data will be stored
        batchable (str): location of the batchable run.py
    """
    test = luigi.BoolParameter()
    _routine_id = luigi.Parameter(default="DUMMY ROUTINE")
    db_config_env = luigi.Parameter()
    city_col = luigi.Parameter()
    country_col = luigi.Parameter()
    country_is_iso2 = luigi.BoolParameter(default=False)
    location_key_col = luigi.Parameter(default=None)
    batch_size = luigi.IntParameter(default=1000)
    intermediate_bucket = luigi.Parameter(default="nesta-production-intermediate")
    batchable = luigi.Parameter(default=find_filepath_from_pathstub("batchables/batchgeocode"))
    test_limit = luigi.IntParameter(default=100)

    def output(self):
        '''Points to the output database engine'''
        db_config = get_config(os.environ[self.db_config_env], "mysqldb")
        db_config["database"] = 'dev' if self.test else 'production'
        db_config["table"] = f"BatchGeocode{self._routine_id} <dummy>"  # Note, not a real table
        return MySqlTarget(update_id=f"BatchGeocode-{self._routine_id}", **db_config)

    def _insert_new_locations(self):
        """Checks for new city/country combinations and appends them to the
        geographic data table in mysql.
        """
        limit = self.test_limit if self.test else None
        with db_session(self.engine) as session:
            existing_location_ids = {i[0] for i in
                                     session.query(Geographic.id).all()}
            new_locations = []
            for city, country, key in (session
                                       .query(self.city_col,
                                              self.country_col,
                                              self.location_key_col)
                                       .distinct(self.location_key_col)
                                       .limit(limit)):
                if key not in existing_location_ids and key is not None:
                    logging.info(f"new location {city}, {country}")
                    new_locations.append(dict(id=key, city=city, country=country))
                    existing_location_ids.add(key)
        if new_locations:
            logging.warning(f"Adding {len(new_locations)} new locations to database")
            insert_data(self.db_config_env, "mysqldb", self.database,
                        Base, Geographic, new_locations)

    def _insert_new_locations_no_id(self):
        """Checks for new city/country combinations and appends them to the
        geographic data table in mysql IF NO location_key_col IS PROVIDED.
""" limit = self.test_limit if self.test else None with db_session(self.engine) as session: existing_location_ids = { i[0] for i in session.query(Geographic.id).all() } new_locations = [] all_locations = {(city, country) for city, country in (session.query( self.city_col, self.country_col).limit(limit)) } nulls = [] for city, country in all_locations: if self.country_is_iso2: country = country_iso_code_to_name(country, iso2=True) if city is None or country is None: nulls.append((city, country)) continue key = generate_composite_key(city, country) if key not in existing_location_ids and key is not None: logging.info(f"new location {city}, {country}") new_locations.append( dict(id=key, city=city, country=country)) existing_location_ids.add(key) if len(nulls) > 0: logging.warning(f"{len(nulls)} locations had a null city or " "country, so won't be processed.") logging.warning(nulls) if new_locations: logging.warning( f"Adding {len(new_locations)} new locations to database") insert_data(self.db_config_env, "mysqldb", self.database, Base, Geographic, new_locations) def _get_uncoded(self): """Identifies all the locations in the geographic data table which have not previously been processed. If there are none to encode an empty list is returned. Returns: (:obj:`list` of :obj:`dict`) records to process """ with db_session(self.engine) as session: uncoded = session.query( Geographic.id, Geographic.city, Geographic.country).filter(Geographic.done == False) uncoded = [u._asdict() for u in uncoded] logging.info(f"{len(uncoded)} locations to geocode") return uncoded def _create_batches(self, uncoded_locations): """Generate batches of records. A small batch is generated if in test mode. Args: uncoded_locations (:obj:`list` of :obj:`dict`): all locations requiring coding Returns: (str): name of each file in the s3 bucket (key) """ batch_size = 50 if self.test else self.batch_size logging.info(f"batch size: {batch_size}") batch = [] for location in uncoded_locations: batch.append(location) if len(batch) == batch_size: yield self._put_batch(batch) batch.clear() # catch any remainder if len(batch) > 0: yield self._put_batch(batch) def _put_batch(self, data): """Writes out a batch of data to s3 as json, so it can be picked up by the batchable task. Args: data (:obj:`list` of :obj:`dict`): a batch of records Returns: (str): name of the file in the s3 bucket (key) """ timestamp = str(time.time()).replace('.', '') filename = ''.join(['geocoding_batch_', timestamp, '.json']) obj = self.s3.Object(self.intermediate_bucket, filename) obj.put(Body=json.dumps(data)) return filename def prepare(self): """Copies any new city/county combinations from the input table into the geographic_data table. All rows which have previously not been processed will be split into batches. 
        Returns:
            (:obj:`list` of :obj:`dict`) job parameters for each of the batch tasks
        """
        # set up database connectors
        self.database = 'dev' if self.test else 'production'
        self.engine = get_mysql_engine(self.db_config_env, "mysqldb", self.database)
        try_until_allowed(Base.metadata.create_all, self.engine)
        # s3 setup
        self.s3 = boto3.resource('s3')

        # identify new locations in the input table and copy them to the geographic table
        if self.location_key_col is not None:
            self._insert_new_locations()
        else:
            self._insert_new_locations_no_id()

        # create batches from all locations which have not previously been coded
        job_params = []
        uncoded_locations = self._get_uncoded()
        if uncoded_locations:
            for batch_file in self._create_batches(uncoded_locations):
                params = {"batch_file": batch_file,
                          "config": 'mysqldb.config',
                          "db_name": self.database,
                          "bucket": self.intermediate_bucket,
                          "done": False,
                          "outinfo": '',
                          "test": self.test}
                job_params.append(params)
                logging.info(params)
            logging.info(f"{len(job_params)} batches to run")
        else:
            logging.warning("no new locations to geocode")
        return job_params

    def combine(self, job_params):
        '''Touch the checkpoint'''
        self.output().touch()
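# A hedged sketch of wiring GeocodeBatchTask into a pipeline, mirroring the
# InvestorGeocodeTask and OrgGeocodeTask usage earlier in this section (which
# are examples of such subclasses). The ORM table MyTable and its columns are
# hypothetical:
def requires(self):
    yield GeocodeBatchTask(test=self.test,
                           _routine_id=self._routine_id,
                           db_config_env="MYSQLDB",
                           city_col=MyTable.city,  # hypothetical ORM columns
                           country_col=MyTable.country,
                           location_key_col=MyTable.location_id,
                           job_def="py36_amzn1_image",
                           job_name=f"Geocode-{self._routine_id}",
                           job_queue="HighPriority",
                           region_name="eu-west-2",
                           poll_time=10)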
                         doc_type=es_type,
                         id=row.pop('rcn'),
                         body=row)
        if not count % 1000:
            logging.info(f"{count} rows loaded to "
                         "elasticsearch")


if __name__ == "__main__":
    set_log_level()
    if 'BATCHPAR_outinfo' not in os.environ:
        from nesta.core.orms.orm_utils import setup_es
        from nesta.core.luigihacks.misctools import find_filepath_from_pathstub
        es, es_config = setup_es('dev', True, True, dataset='cordis-eu')
        environ = {'config': find_filepath_from_pathstub('mysqldb.config'),
                   'batch_file': ('cordis-eu_EURITO-ElasticsearchTask-'
                                  '2020-04-10-True-15865345336407135.json'),
                   'db_name': 'dev',
                   'bucket': 'nesta-production-intermediate',
                   'outinfo': es_config['host'],
                   'out_port': es_config['port'],
                   'out_index': es_config['index'],
                   'out_type': es_config['type'],
                   'aws_auth_region':