Example #1
    def requires(self):
        logging.getLogger().setLevel(logging.INFO)

        db_config = misctools.get_config("mysqldb.config", "mysqldb")
        db_config["database"] = "production" if self.production else "dev"
        db_config["table"] = "worldbank_countries"

        variable_codes = [
            "SP.RUR.TOTL.ZS", "SP.URB.TOTL.IN.ZS"
            "SP.POP.DPND", "SP.POP.TOTL", "SP.DYN.LE00.IN", "SP.DYN.IMRT.IN",
            "BAR.NOED.25UP.ZS", "BAR.TER.CMPT.25UP.ZS", "NYGDPMKTPSAKD",
            "SI.POV.NAHC", "SI.POV.GINI"
        ]

        job_name = (f"Worldbank-{self.date}-"
                    f"{'_'.join(variable_codes).replace('.','_')}-"
                    f"{self.production}")[0:120]

        yield WorldbankTask(
            date=self.date,
            db_config=db_config,
            variable_codes=variable_codes,
            batchable=find_filepath_from_pathstub(
                "core/batchables/collect_worldbank/"),
            env_files=[
                find_filepath_from_pathstub("/nesta/nesta"),
                find_filepath_from_pathstub("/config/mysqldb.config")
            ],
            job_def="py36_amzn1_image",
            job_name=job_name,
            job_queue="HighPriority",
            region_name="eu-west-2",
            poll_time=10,
            max_live_jobs=200,
            test=(not self.production))
Example #2
    def requires(self):
        '''Collects the database configurations and executes the central task.'''

        logging.getLogger().setLevel(logging.INFO)
        yield InvestorGeocodeTask(
            date=self.date,
            _routine_id=self._routine_id,
            test=self.test,
            db_config_env="MYSQLDB",
            city_col=Investor.city,
            country_col=Investor.country,
            location_key_col=Investor.location_id,
            insert_batch_size=self.insert_batch_size,
            env_files=[
                find_filepath_from_pathstub("nesta/nesta/"),
                find_filepath_from_pathstub("config/mysqldb.config"),
                find_filepath_from_pathstub("config/crunchbase.config")
            ],
            job_def="py36_amzn1_image",
            job_name=f"CrunchBaseInvestorGeocodeTask-{self._routine_id}",
            job_queue="HighPriority",
            region_name="eu-west-2",
            poll_time=10,
            memory=4096,
            max_live_jobs=2)
Example #3
    def requires(self):
        '''Collects the database configurations and executes the central task.'''
        logging.getLogger().setLevel(logging.INFO)
        _routine_id = f"{self.date}-{self.iso2}-{self.category}-{self.production}"

        engine = get_mysql_engine("MYSQLDB", "mysqldb",
                                  "production" if self.production else "dev")
        Base.metadata.create_all(engine)

        yield GroupDetailsTask(
            iso2=self.iso2,
            category=self.category,
            _routine_id=_routine_id,
            batchable=BATCHABLE.format("group_details"),
            env_files=[
                find_filepath_from_pathstub("/nesta/nesta"),
                find_filepath_from_pathstub("/config/mysqldb.config")
            ],
            job_def="py36_amzn1_image",
            job_name=f"GroupDetails-{_routine_id}",
            job_queue="HighPriority",
            region_name="eu-west-2",
            poll_time=10,
            max_live_jobs=100,
            test=(not self.production))
Example #4
 def requires(self):
     '''Collects the database configurations and executes the central task.'''
     yield GtrTask(date=self.date,
                   page_size=self.page_size,
                   batchable=find_filepath_from_pathstub("core/batchables/gtr/collect_gtr"),
                   env_files=[find_filepath_from_pathstub("/nesta/nesta"),
                              find_filepath_from_pathstub("/config/mysqldb.config")],
                   job_def="py36_amzn1_image",
                   job_name=f"GtR-{self.date}-{self.page_size}-{not self.test}",
                   job_queue="HighPriority",
                   region_name="eu-west-2",
                   poll_time=10,
                   test=self.test)
Example #5
def expand_pathstub(pathstub):
    """Expand the pathstub.

    Args:
        pathstub (list or str): A pathstub or list of pathstubs to expand.
    Returns:
        fullpath (list or str): A fullpath or list of fullpaths
    """
    # Expand from a list...
    if isinstance(pathstub, list):
        return [find_filepath_from_pathstub(_v) for _v in pathstub]
    # ...or from a string (assumed)
    else:
        return find_filepath_from_pathstub(pathstub)
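
A minimal usage sketch (the pathstubs below are hypothetical, assuming the nesta repository layout):

config_path = expand_pathstub("config/mysqldb.config")      # single stub -> str
env_files = expand_pathstub(["nesta/nesta",
                             "config/mysqldb.config"])      # list -> list of str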
Example #6
    def test_aliases(self):
        """Assert consistency between the aliases and schemas"""
        top_dir = find_filepath_from_pathstub("core/orms")
        all_fields = {}
        for filename in os.listdir(top_dir):
            if not filename.endswith(ES_CONF_SUFFIX):
                continue
            dataset = filename.replace(ES_CONF_SUFFIX, "")
            filename = os.path.join(top_dir, filename)
            with open(filename) as f:
                data = json.load(f)
            print(f'Found {filename}')
            fields = data["mappings"]["_doc"]["properties"].keys()
            all_fields[dataset] = fields

        cwd = os.path.dirname(__file__)
        path = os.path.join(cwd, '../aliases/')
        for filename in os.listdir(path):
            if not filename.endswith(".json"):
                continue
            filename = os.path.join(path, filename)
            for alias, dataset, field in alias_info(filename):
                print("\t", alias, dataset, field)
                assert dataset in all_fields
                assert field in all_fields[dataset]
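
The assertions above assume each mapping file under core/orms is an Elasticsearch mapping shaped roughly as follows (field names are illustrative):

data = {
    "mappings": {
        "_doc": {
            "properties": {
                "field_one": {"type": "text"},  # illustrative
                "field_two": {"type": "date"}   # illustrative
            }
        }
    }
}
fields = data["mappings"]["_doc"]["properties"].keys()  # as read by the test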
Example #7
 def requires(self):
     yield OrgGeocodeTask(date=self.date,
                          _routine_id=self._routine_id,
                          test=self.test,
                          db_config_env="MYSQLDB",
                          city_col=Organization.city,
                          country_col=Organization.country,
                          location_key_col=Organization.location_id,
                          insert_batch_size=self.insert_batch_size,
                          env_files=[find_filepath_from_pathstub("nesta/nesta/"),
                                     find_filepath_from_pathstub("config/mysqldb.config")],
                          job_def="py36_amzn1_image",
                          job_name=f"CrunchBaseOrgGeocodeTask-{self._routine_id}",
                          job_queue="HighPriority",
                          region_name="eu-west-2",
                          poll_time=10,
                          memory=4096,
                          max_live_jobs=2)
Example #8
 def requires(self):
     '''Collects the database configurations and executes the central task.'''
     logging.getLogger().setLevel(logging.INFO)        
     yield GtrTask(date=self.date,
                   page_size=self.page_size,
                   batchable=find_filepath_from_pathstub("core/batchables/gtr/collect_gtr"),
                   env_files=[find_filepath_from_pathstub("/nesta/nesta"),
                              find_filepath_from_pathstub("/config/mysqldb.config")],
                   job_def="py36_amzn1_image",
                   job_name=f"GtR-{self.date}-{self.page_size}-{self.production}",
                   #job_queue="HighPriority",
                   job_queue="MinimalCpus",
                   region_name="eu-west-2",
                   vcpus=2,
                   poll_time=10,
                   memory=2048,
                   max_live_jobs=50,
                   test=(not self.production))
Example #9
def get_gss_codes(test=False):
    """Get all UK geography codes.

    Returns:
        List of UK geography codes.
    """
    CONFIG = find_filepath_from_pathstub("fetch_geography_codes.sparql")
    n = 1 if test else None
    with open(CONFIG) as f:
        query = f.read().replace("\n", " ")
    data = sparql_query(ENDPOINT, query, batch_limit=n)
    return [row["area_code"] for batch in data for row in batch]
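
For reference, the final comprehension flattens the batched SPARQL results into a single list; a sketch with made-up data:

data = [[{"area_code": "E09000001"}],   # batch 1 (codes illustrative)
        [{"area_code": "W06000015"}]]   # batch 2
codes = [row["area_code"] for batch in data for row in batch]
assert codes == ["E09000001", "W06000015"]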
Example #10
def run():
    '''Gets the name and age of the muppet, and increments the age.
    The result is transferred to S3.'''

    # Get parameters for the batch job
    dataset = os.environ["BATCHPAR_dataset"]
    start_index = int(os.environ["BATCHPAR_start_index"])
    end_index = int(os.environ["BATCHPAR_end_index"])
    age_increment = int(os.environ["BATCHPAR_age_increment"])
    
    es_host = os.environ["BATCHPAR_outinfo"]
    es_port = os.environ["BATCHPAR_out_port"]
    es_index = os.environ["BATCHPAR_out_index"]
    es_type = os.environ["BATCHPAR_out_type"]
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # Get the input data and modify it based on the input parameters
    data = silo.get("example")[start_index:end_index]
    for row in data:
        row["age"] = row["age"] + age_increment
    
    # Set up the schema transformation and null-field mapping, then connect to ES
    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
                                                 "example.json")
    strans_fpath = find_filepath_from_pathstub("tier_1/schema_transformations/"
                                               "example.json")
    strans_kwargs = {'filename': strans_fpath,
                     'from_key': 'tier_0',
                     'to_key': 'tier_1',
                     'ignore': ['id']}

    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           field_null_mapping=field_null_mapping,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           country_detection=False,
                           caps_to_camel_case=True)

    for uid, row in enumerate(data, start=start_index):
        es.index(index=es_index,
                 doc_type=es_type, id=uid, body=row)

    # Also upload the data to S3
    silo.put(data, dataset)
Example #11
    def requires(self):
        '''Collects the database configurations and executes the central task.'''

        logging.getLogger().setLevel(logging.INFO)
        yield NonOrgCollectTask(
            date=self.date,
            _routine_id=self._routine_id,
            test=self.test,
            db_config_path=find_filepath_from_pathstub("mysqldb.config"),
            insert_batch_size=self.insert_batch_size,
            batchable=find_filepath_from_pathstub(
                "batchables/crunchbase/crunchbase_collect"),
            env_files=[
                find_filepath_from_pathstub("nesta/nesta/"),
                find_filepath_from_pathstub("config/mysqldb.config"),
                find_filepath_from_pathstub("config/crunchbase.config")
            ],
            job_def="py36_amzn1_image",
            job_name=f"CrunchBaseNonOrgCollectTask-{self._routine_id}",
            job_queue="HighPriority",
            region_name="eu-west-2",
            poll_time=10,
            memory=4096,
            max_live_jobs=20)
Example #12
File: orm_utils.py Project: yitzikc/nesta
def load_json_from_pathstub(pathstub, filename, sort_on_load=True):
    """Basic wrapper around :obj:`find_filepath_from_pathstub`
    which also opens the file (assumed to be json).

    Args:
        pathstub (str): Stub of filepath where the file should be found.
        filename (str): The filename.
        sort_on_load (bool): If :obj:`True`, sort the json keys on load,
            for a deterministic key ordering.
    Returns:
        The file contents as a json object.
    """
    _path = find_filepath_from_pathstub(pathstub)
    _path = os.path.join(_path, filename)
    with open(_path) as f:
        js = json.load(f)
    if sort_on_load:
        _js = json.dumps(js, sort_keys=True)
        js = json.loads(_js)
    return js
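
A minimal usage sketch (the pathstub and filename are illustrative):

mapping = load_json_from_pathstub("tier_1/field_null_mappings/", "example.json")
# With sort_on_load=True the object is round-tripped through
# json.dumps(..., sort_keys=True), so keys come back in sorted order.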
Example #13
# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('.'))
sys.path.insert(0, os.path.abspath('../../eurito_daps/'))
sys.path.insert(0, os.path.abspath('../../eurito_daps/core'))

from nesta.core.luigihacks.misctools import find_filepath_from_pathstub
try:
    find_filepath_from_pathstub('luigi.cfg')
except FileNotFoundError:
    config_dir = 'core/config'
    os.makedirs(config_dir, exist_ok=True)
    with open(os.path.join(config_dir, 'luigi.cfg'), 'w') as f:
        f.write('[worker]\nx=1')
    with open(os.path.join(config_dir, 'mysqldb.config'), 'w') as f:
        f.write('[mysqldb]\nx=1')
    print(os.listdir(config_dir))

# -- Project information -----------------------------------------------------

project = 'eurito'
copyright = '2019, EURITO'
author = 'EURITO'
Example #14
File: run.py Project: yitzikc/nesta
        for calls, project_calls in split_links(_calls, rcn):
            data['proposal_calls'].append(calls)
            data['project_proposal_calls'].append(project_calls)

    # Pipe the data to the db
    for table_prefix, rows in data.items():
        table_name = f'cordis_{table_prefix}'
        logging.info(table_name)
        _class = get_class_by_tablename(Base, table_name)
        insert_data(db_env,
                    db_section,
                    db_name,
                    Base,
                    _class,
                    rows,
                    low_memory=True)


if __name__ == "__main__":

    set_log_level(True)
    if 'BATCHPAR_config' not in os.environ:
        from nesta.core.luigihacks.misctools import find_filepath_from_pathstub
        os.environ['BATCHPAR_batch_file'] = (
            'Cordis-2020-04-12-True-1586709686976328.json')
        os.environ['BATCHPAR_db_name'] = 'production'
        os.environ["BATCHPAR_config"] = find_filepath_from_pathstub(
            'mysqldb.config'),
        os.environ["BATCHPAR_bucket"] = ('nesta-production' '-intermediate')
    run()
Example #15
def mock_response():
    test_file = find_filepath_from_pathstub('mocked_arxiv_response.xml')
    with open(test_file, mode='rb') as f:
        return f.read()
Example #16
 def test_find_filepath_from_pathstub(self):
     find_filepath_from_pathstub("nesta/packages")
     with self.assertRaises(FileNotFoundError):
         find_filepath_from_pathstub("nesta/package")
Example #17
from nesta.core.orms.orm_utils import get_mysql_engine
from nesta.core.luigihacks import autobatch
from nesta.core.luigihacks.misctools import find_filepath_from_pathstub
import luigi
import datetime
import json
import time
import logging
from botocore.errorfactory import ClientError
import boto3
import os

# Define these globally since they are shared resources
# TODO: consider bundling this into a Singleton
S3 = boto3.resource('s3')
_BUCKET = S3.Bucket("nesta-production-intermediate")
DONE_KEYS = set(obj.key for obj in _BUCKET.objects.all())
BATCHABLE = os.path.join(find_filepath_from_pathstub("core/batchables/meetup"),
                         "{}")


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
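# For illustration, chunks yields fixed-size slices plus a short remainder:
#     list(chunks([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]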


class CountryGroupsTask(autobatch.AutoBatchTask):
    '''Extract all groups with corresponding category for this country.

    Args:
        ...
    '''
Example #18
class GeocodeBatchTask(AutoBatchTask):
    """Appends various geographic codes to the geographic_data table using the
    `city` and `country` from the input table: lat/long, iso codes, continent.

    To implement this task, only the `output` and `combine` methods need to be defined
    when it is subclassed.

    Args:
        test (bool): in test or production mode
        db_config_env (str): environmental variable pointing to the db config file
        city_col (:obj:`sqlalchemy.Column`): column containing the city
        country_col (:obj:`sqlalchemy.Column`): column containing the full name of the country
        location_key_col (:obj:`sqlalchemy.Column`): column containing the generated composite key
        batch_size (int): number of locations to geocode in a batch
        intermediate_bucket (str): s3 bucket where the batch data will be stored
        batchable (str): location of the batchable run.py
        country_is_iso2 (bool): whether the country column holds ISO2 codes
            rather than full country names
        test_limit (int): maximum number of locations to process in test mode
    """
    test = luigi.BoolParameter()
    _routine_id = luigi.Parameter(default="DUMMY ROUTINE")
    db_config_env = luigi.Parameter()
    city_col = luigi.Parameter()
    country_col = luigi.Parameter()
    country_is_iso2 = luigi.BoolParameter(default=False)
    location_key_col = luigi.Parameter(default=None)
    batch_size = luigi.IntParameter(default=1000)
    intermediate_bucket = luigi.Parameter(
        default="nesta-production-intermediate")
    batchable = luigi.Parameter(
        default=find_filepath_from_pathstub("batchables/batchgeocode"))
    test_limit = luigi.IntParameter(default=100)

    def output(self):
        '''Points to the output database engine'''
        db_config = get_config(os.environ[self.db_config_env], "mysqldb")
        db_config["database"] = 'dev' if self.test else 'production'
        # Note: not a real table
        db_config["table"] = f"BatchGeocode{self._routine_id} <dummy>"
        return MySqlTarget(update_id=f"BatchGeocode-{self._routine_id}",
                           **db_config)

    def _insert_new_locations(self):
        """Checks for new city/country combinations and appends them to the geographic
        data table in mysql.
        """
        limit = self.test_limit if self.test else None
        with db_session(self.engine) as session:
            existing_location_ids = {i[0] for i in
                                     session.query(Geographic.id).all()}
            new_locations = []
            query = (session.query(self.city_col, self.country_col,
                                   self.location_key_col)
                     .distinct(self.location_key_col)
                     .limit(limit))
            for city, country, key in query:
                if key not in existing_location_ids and key is not None:
                    logging.info(f"new location {city}, {country}")
                    new_locations.append(
                        dict(id=key, city=city, country=country))
                    existing_location_ids.add(key)

        if new_locations:
            logging.warning(
                f"Adding {len(new_locations)} new locations to database")
            insert_data(self.db_config_env, "mysqldb", self.database, Base,
                        Geographic, new_locations)

    def _insert_new_locations_no_id(self):
        """Checks for new city/country combinations and appends them to the geographic
        data table in mysql IF NO location_key_col IS PROVIDED.
        """
        limit = self.test_limit if self.test else None
        with db_session(self.engine) as session:
            existing_location_ids = {i[0] for i in
                                     session.query(Geographic.id).all()}
            new_locations = []
            query = session.query(self.city_col, self.country_col).limit(limit)
            all_locations = {(city, country) for city, country in query}
            nulls = []
            for city, country in all_locations:
                if self.country_is_iso2:
                    country = country_iso_code_to_name(country, iso2=True)
                if city is None or country is None:
                    nulls.append((city, country))
                    continue
                key = generate_composite_key(city, country)
                if key not in existing_location_ids and key is not None:
                    logging.info(f"new location {city}, {country}")
                    new_locations.append(
                        dict(id=key, city=city, country=country))
                    existing_location_ids.add(key)

        if len(nulls) > 0:
            logging.warning(f"{len(nulls)} locations had a null city or "
                            "country, so won't be processed.")
            logging.warning(nulls)
        if new_locations:
            logging.warning(
                f"Adding {len(new_locations)} new locations to database")
            insert_data(self.db_config_env, "mysqldb", self.database, Base,
                        Geographic, new_locations)

    def _get_uncoded(self):
        """Identifies all the locations in the geographic data table which have not
        previously been processed. If there are none to encode an empty list is
        returned.

        Returns:
            (:obj:`list` of :obj:`dict`) records to process
        """
        with db_session(self.engine) as session:
            uncoded = session.query(
                Geographic.id, Geographic.city,
                Geographic.country).filter(Geographic.done == False)
            uncoded = [u._asdict() for u in uncoded]
        logging.info(f"{len(uncoded)} locations to geocode")
        return uncoded

    def _create_batches(self, uncoded_locations):
        """Generate batches of records. A small batch is generated if in test mode.

        Args:
            uncoded_locations (:obj:`list` of :obj:`dict`): all locations requiring coding

        Returns:
            (str): name of each file in the s3 bucket (key)
        """
        batch_size = 50 if self.test else self.batch_size
        logging.info(f"batch size: {batch_size}")
        batch = []
        for location in uncoded_locations:
            batch.append(location)
            if len(batch) == batch_size:
                yield self._put_batch(batch)
                batch.clear()
        # catch any remainder
        if len(batch) > 0:
            yield self._put_batch(batch)

    def _put_batch(self, data):
        """Writes out a batch of data to s3 as json, so it can be picked up by the
        batchable task.

        Args:
            data (:obj:`list` of :obj:`dict`): a batch of records

        Returns:
            (str): name of the file in the s3 bucket (key)
        """
        timestamp = str(time.time()).replace('.', '')
        filename = ''.join(['geocoding_batch_', timestamp, '.json'])
        obj = self.s3.Object(self.intermediate_bucket, filename)
        obj.put(Body=json.dumps(data))
        return filename

    def prepare(self):
        """Copies any new city/county combinations from the input table into the
        geographic_data table. All rows which have previously not been processed will
        be split into batches.

        Returns:
            (:obj:`list` of :obj:`dict`) job parameters for each of the batch tasks
        """
        # set up database connectors
        self.database = 'dev' if self.test else 'production'
        self.engine = get_mysql_engine(self.db_config_env, "mysqldb",
                                       self.database)
        try_until_allowed(Base.metadata.create_all, self.engine)

        # s3 setup
        self.s3 = boto3.resource('s3')

        # identify new locations in the input table and copy them to the geographic table
        if self.location_key_col is not None:
            self._insert_new_locations()
        else:
            self._insert_new_locations_no_id()

        # create batches from all locations which have not previously been coded
        job_params = []
        uncoded_locations = self._get_uncoded()
        if uncoded_locations:
            for batch_file in self._create_batches(uncoded_locations):
                params = {
                    "batch_file": batch_file,
                    "config": 'mysqldb.config',
                    "db_name": self.database,
                    "bucket": self.intermediate_bucket,
                    "done": False,
                    "outinfo": '',
                    "test": self.test
                }
                job_params.append(params)
                logging.info(params)
            logging.info(f"{len(job_params)} batches to run")
        else:
            logging.warning(f"no new locations to geocode")

        return job_params

    def combine(self, job_params):
        '''Touch the checkpoint'''
        self.output().touch()
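
For context, a hedged sketch of a batchable run.py consuming the parameters that prepare() emits; AutoBatchTask surfaces each job_params key as a BATCHPAR_* environment variable (the same pattern read in Examples #10 and #14), and the body below is illustrative rather than the actual batchable:

import json
import os

import boto3


def run():
    # Each key from job_params arrives as an environment variable
    batch_file = os.environ["BATCHPAR_batch_file"]
    bucket = os.environ["BATCHPAR_bucket"]
    db_name = os.environ["BATCHPAR_db_name"]

    # Fetch the batch of uncoded locations written by _put_batch()
    s3 = boto3.resource('s3')
    body = s3.Object(bucket, batch_file).get()['Body'].read()
    locations = json.loads(body)
    # ... geocode each location and write the results back to db_name ...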
Example #19
File: run.py Project: yitzikc/nesta
                     doc_type=es_type,
                     id=row.pop('rcn'),
                     body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to elasticsearch")


if __name__ == "__main__":
    set_log_level()
    if 'BATCHPAR_outinfo' not in os.environ:
        from nesta.core.orms.orm_utils import setup_es
        from nesta.core.luigihacks.misctools import find_filepath_from_pathstub
        es, es_config = setup_es('dev', True, True, dataset='cordis-eu')
        environ = {
            'config': find_filepath_from_pathstub('mysqldb.config'),
            'batch_file': ('cordis-eu_EURITO-ElasticsearchTask-'
                           '2020-04-10-True-15865345336407135.json'),
            'db_name': 'dev',
            'bucket': 'nesta-production-intermediate',
            'outinfo': es_config['host'],
            'out_port': es_config['port'],
            'out_index': es_config['index'],
            'out_type': es_config['type'],
            'aws_auth_region':