Example #1
    def run(self):
        """Apply health labels using model."""
        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        try_until_allowed(Base.metadata.create_all, self.engine)

        # collect and unpickle models from s3
        logging.info("Collecting models from S3")
        s3 = boto3.resource('s3')
        vectoriser_obj = s3.Object(self.bucket, self.vectoriser_key)
        vectoriser = pickle.loads(
            vectoriser_obj.get()['Body']._raw_stream.read())
        classifier_obj = s3.Object(self.bucket, self.classifier_key)
        classifier = pickle.loads(
            classifier_obj.get()['Body']._raw_stream.read())

        # retrieve organisations and categories
        nrows = 1000 if self.test else None
        logging.info("Collecting organisations from database")
        with db_session(self.engine) as session:
            orgs = (session.query(Organization.id).filter(
                Organization.is_health.is_(None)).limit(nrows).all())

        for batch_count, batch in enumerate(
                split_batches(orgs, self.insert_batch_size), 1):
            batch_orgs_with_cats = []
            for (org_id, ) in batch:
                with db_session(self.engine) as session:
                    categories = (session.query(
                        OrganizationCategory.category_name).filter(
                            OrganizationCategory.organization_id ==
                            org_id).all())
                # flatten this org's categories into a single comma separated string, e.g. 'cat1,cat2,cat3'
                categories = ','.join(cat_name for (cat_name, ) in categories)
                batch_orgs_with_cats.append({
                    'id': org_id,
                    'categories': categories
                })

            logging.debug(
                f"{len(batch_orgs_with_cats)} organisations retrieved from database"
            )

            logging.debug("Predicting health flags")
            batch_orgs_with_flag = predict_health_flag(batch_orgs_with_cats,
                                                       vectoriser, classifier)

            logging.debug(
                f"{len(batch_orgs_with_flag)} organisations to update")
            with db_session(self.engine) as session:
                session.bulk_update_mappings(Organization,
                                             batch_orgs_with_flag)
            logging.info(
                f"{batch_count} batches health labeled and written to db")

        # mark as done
        logging.warning("Task complete")
        self.output().touch()
Example #2
    def run(self):
        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

        # collect mesh terms from S3
        bucket = 'innovation-mapping-general'
        key = 'crunchbase_descriptions/crunchbase_descriptions_mesh.txt'
        mesh_terms = retrieve_mesh_terms(bucket, key)
        mesh_terms = format_mesh_terms(
            mesh_terms)  # {'org_id': ['term1', 'term2'], ...}
        logging.info(f"File contains {len(mesh_terms)} orgs with mesh terms")

        logging.info("Extracting previously processed orgs")
        with db_session(self.engine) as session:
            all_orgs = session.query(Organization.id,
                                     Organization.mesh_terms).all()
        processed_orgs = {
            org_id
            for (org_id, mesh_terms) in all_orgs if mesh_terms is not None
        }
        all_orgs = {org_id for (org_id, _) in all_orgs}
        logging.info(f"{len(all_orgs)} total orgs in database")
        logging.info(f"{len(processed_orgs)} previously processed orgs")

        # reformat for batch insert, removing orgs that were not found or already processed
        meshed_orgs = [{
            'id': org_id,
            'mesh_terms': '|'.join(terms)
        } for org_id, terms in mesh_terms.items()
                       if org_id in all_orgs and org_id not in processed_orgs]

        logging.info(f"{len(meshed_orgs)} organisations to update in database")

        for count, batch in enumerate(
                split_batches(meshed_orgs, self.insert_batch_size), 1):
            with db_session(self.engine) as session:
                session.bulk_update_mappings(Organization, batch)
            logging.info(
                f"{count} batch{'es' if count > 1 else ''} written to db")
            if self.test and count > 1:
                logging.info("Breaking after 2 batches while in test mode")
                break

        # mark as done
        logging.warning("Task complete")
        self.output().touch()
Example #3
def update_field_of_study_ids_sparql(engine, fos_ids):
    """Queries MAG via the sparql api for fields of study and if found, adds them to the
    database. Only ids of missing fields of study should be supplied, no check is done
    here to determine if it already exists.

    Args:
        engine (:obj:`sqlalchemy.engine`): database connection
        fos_ids (list): ids to search and update

    Returns:
        (set): ids which could not be found in MAG
    """
    logging.info(f"Querying MAG for {len(fos_ids)} missing fields of study")
    new_fos_to_import = [
        FieldOfStudy(**fos) for fos in query_fields_of_study_sparql(fos_ids)
    ]

    logging.info(
        f"Retrieved {len(new_fos_to_import)} new fields of study from MAG")
    fos_not_found = set(fos_ids) - {fos.id for fos in new_fos_to_import}
    if fos_not_found:
        logging.warning(
            f"Fields of study present in articles but could not be found in MAG Fields of Study database: {fos_not_found}"
        )
    with db_session(engine) as session:
        session.add_all(new_fos_to_import)
        session.commit()
    logging.info("Added new fields of study to database")
    return fos_not_found
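
A minimal usage sketch for update_field_of_study_ids_sparql; the config environment name passed to get_mysql_engine and the example ids are assumptions, not part of the original code.

# hypothetical usage: try to resolve two missing field-of-study ids (example values)
engine = get_mysql_engine('MYSQLDB', 'mysqldb', 'dev')  # assumed config env
not_found = update_field_of_study_ids_sparql(engine, fos_ids=[2908647359, 3020595256])
if not_found:
    logging.warning(f"{len(not_found)} ids still missing after the MAG query")
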
Example #4
def get_article_ids_by_term(engine, term, min_weight):
    """Identifies articles related to a term.

    The topic id is collected from sql and then a list of article ids is returned.

    Args:
        engine (:code:`sqlalchemy.engine`): connection to the database
        term (str): term to search for
        min_weight (float): minimum acceptable weight for matches

    Returns:
        (set): ids of articles which are in that topic, at or above the specified weight
    """
    with db_session(engine) as session:
        topic_id = (session.query(CorExTopic.id).filter(
            sqlalchemy.func.json_contains(CorExTopic.terms,
                                          f'["{term}"]')).scalar())
        if topic_id is None:
            raise ValueError(f'{term} not found in any topics')
        logging.info(f"Identified {term} topic with id {topic_id}")

        articles = (session.query(ArticleTopic.article_id).filter(
            (ArticleTopic.topic_id == topic_id)
            & (ArticleTopic.topic_weight >= min_weight)).all())

    article_ids = {a.article_id for a in articles}
    logging.info(
        f"Identified {len(article_ids)} articles in the '{term}' topic")

    return article_ids
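
A minimal usage sketch for get_article_ids_by_term; the config environment name, the term and the weight threshold below are assumed example values.

# hypothetical usage: articles strongly associated with an assumed topic term
engine = get_mysql_engine('MYSQLDB', 'mysqldb', 'production')  # assumed config env
article_ids = get_article_ids_by_term(engine, term='deep_learning', min_weight=0.3)
logging.info(f"{len(article_ids)} article ids returned")
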
Example #5
    def _insert_new_locations(self):
        """Checks for new city/country combinations and appends them to the geographic
        data table in mysql.
        """
        limit = self.test_limit if self.test else None
        with db_session(self.engine) as session:
            existing_location_ids = {
                i[0]
                for i in session.query(Geographic.id).all()
            }
            new_locations = []
            for city, country, key in (session.query(
                    self.city_col, self.country_col,
                    self.location_key_col).distinct(
                        self.location_key_col).limit(limit)):
                if key not in existing_location_ids and key is not None:
                    logging.info(f"new location {city}, {country}")
                    new_locations.append(
                        dict(id=key, city=city, country=country))
                    existing_location_ids.add(key)

        if new_locations:
            logging.warning(
                f"Adding {len(new_locations)} new locations to database")
            insert_data(self.db_config_env, "mysqldb", self.database, Base,
                        Geographic, new_locations)
Example #6
def run():
    batch_file = os.environ['BATCHPAR_batch_file']
    db = os.environ['BATCHPAR_db_name']
    bucket = os.environ['BATCHPAR_bucket']

    # database setup
    engine = get_mysql_engine('BATCHPAR_config', 'mysqldb', db)

    # collect data
    target = f"s3://{bucket}/{batch_file}"
    df = pd.read_json(target, orient='records')
    logging.info(f"{len(df)} locations to geocode")

    # append country iso codes and continent
    df = country_iso_code_dataframe(df)
    logging.info("Country ISO codes appended")

    # geocode, appending latitude and longitude columns, using the q= query method
    df = geocode_batch_dataframe(df, query_method='query_only')
    logging.info("Geocoding complete")

    # remove city and country columns and append done column
    df = df.drop(['city', 'country'], axis=1)
    df['done'] = True

    # convert to list of dict and output to database
    rows = df.to_dict(orient='records')

    logging.info(f"Writing {len(rows)} rows to database")
    with db_session(engine) as session:
        session.bulk_update_mappings(Geographic, rows)
    logging.warning("Batch task complete")
Example #7
def get_objects(from_date):
    """Get all arXiv articles from a given start date.

    Args:
        from_date (str, optional): Min article creation date.

    Returns:
        articles (list): List of arXiv article data.
    """
    logging.info(f"Retrieving projects from at least {from_date}")
    engine = get_mysql_engine()
    with db_session(engine) as session:
        query = session.query(
            Project.application_id,
            Project.phr,
            Project.abstract_text,
            Project.project_title,
            Project.project_start,
            Project.total_cost,
        )
        query = query.filter(Project.project_start > from_date)
        projects = [
            dict(
                id=id,
                text=join_text(phr, abstract),
                title=title,
                created=start_date,
                funding=funding,
            )
            for id, phr, abstract, title, start_date, funding in query.all()
            if not ((phr is None) and (abstract is None))
        ]
    return projects
Example #8
    def run(self):
        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

        # collect file
        logging.info(f"Collecting org_parents from crunchbase tar")
        org_parents = get_files_from_tar(['org_parents'])[0]
        logging.info(f"{len(org_parents)} parent ids in crunchbase export")

        # collect previously processed orgs
        logging.info("Extracting previously processed organisations")
        with db_session(self.engine) as session:
            processed_orgs = session.query(Organization.id,
                                           Organization.parent_id).all()
        all_orgs = {org for (org, _) in processed_orgs}
        logging.info(f"{len(all_orgs)} total orgs in database")
        processed_orgs = {
            org
            for (org, parent_id) in processed_orgs if parent_id is not None
        }
        logging.info(f"{len(processed_orgs)} previously processed orgs")

        # reformat into a list of dicts, removing orgs that already have a parent_id
        # or are missing from the database
        org_parents = org_parents[['uuid', 'parent_uuid']]
        org_parents.columns = ['id', 'parent_id']
        org_parents = org_parents[org_parents['id'].isin(all_orgs)]
        org_parents = org_parents[~org_parents['id'].isin(processed_orgs)]
        org_parents = org_parents.to_dict(orient='records')
        logging.info(f"{len(org_parents)} organisations to update in MYSQL")

        # insert parent_ids into db in batches
        for count, batch in enumerate(
                split_batches(org_parents, self.insert_batch_size), 1):
            with db_session(self.engine) as session:
                session.bulk_update_mappings(Organization, batch)
            logging.info(
                f"{count} batch{'es' if count > 1 else ''} written to db")
            if self.test and count > 1:
                logging.info("Breaking after 2 batches while in test mode")
                break

        # mark as done
        logging.warning("Task complete")
        self.output().touch()
Example #9
def get_projects():
    logging.info("Retrieving all projects geography")
    engine = get_mysql_engine()
    with db_session(engine) as session:
        query = session.query(
            Project.application_id, Project.coordinates, Project.is_eu, Project.iso2
        )
        return query.all()
Example #10
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']

    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup
    logging.info('Retrieving engine connection')
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)

    # es setup
    logging.info('Connecting to ES')
    strans_kwargs = {
        'filename': 'eurito/cordis-eu.json',
        'from_key': 'tier_0',
        'to_key': 'tier_1',
        'ignore': ['id']
    }
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           listify_terms=True,
                           do_sort=False,
                           ngram_fields=['textBody_description_project'])

    # collect file
    logging.info('Retrieving project ids')
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    project_ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(project_ids)} project IDs " "retrieved from s3")

    # process each project and index it into Elasticsearch
    logging.info('Processing rows')
    with db_session(engine) as session:
        for count, obj in enumerate((session.query(Project).filter(
                Project.rcn.in_(project_ids)).all())):
            row = object_to_dict(obj)
            row = reformat_row(row)
            es.index(index=es_index,
                     doc_type=es_type,
                     id=row.pop('rcn'),
                     body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to " "elasticsearch")
Example #11
    def run(self):
        # database setup
        database = 'dev' if self.test else 'production'
        logging.info(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

        # s3 setup
        s3 = boto3.resource('s3')
        intermediate_file = s3.Object(BUCKET, f"mag_estimate_{database}.json")

        eu = get_eu_countries()

        with db_session(self.engine) as session:
            eu_grid_ids = {
                i.id
                for i in (session.query(Institute.id).filter(
                    Institute.country.in_(eu)).all())
            }
            logging.info(f"{len(eu_grid_ids):,} EU institutes in GRID")

        # collect previous and exclude
        try:
            previous = json.loads(
                intermediate_file.get()['Body']._raw_stream.read())

            done_institutes = set(previous['institutes'])
            logging.info(
                f"{len(done_institutes)} previously processed institutes retrieved"
            )
            eu_grid_ids = eu_grid_ids - done_institutes
            logging.info(f"{len(eu_grid_ids)} to process")

            paper_ids = set(previous['paper_ids'])
            logging.info(
                f"{len(paper_ids)} previously processed papers retrieved")
        except ClientError:
            logging.info("Unable to load previous file, starting from scratch")
            done_institutes = set()
            paper_ids = set()

        limit = 100 if self.test else None
        save_every = 50 if self.test else 1000000

        total = count_papers(eu_grid_ids,
                             done_institutes,
                             paper_ids,
                             intermediate_file,
                             save_every=save_every,
                             limit=limit)

        # mark as done
        logging.info("Task complete")
        logging.info(f"Total EU papers found: {total:,}")
        self.output().touch()
Example #12
def get_iso2_to_id():
    """
    Fetch and curate a lookup table of ISO2 code to
    the set of project ids (rcn) for that ISO2 code.
    """
    logging.info("Retrieving projects iso2 lookup")
    engine = get_mysql_engine()
    with db_session(engine) as session:
        query = session.query(Link.project_rcn, Org.country_code)
        query = query.join(Org, Link.organization_id == Org.id)
        query = query.filter(Org.country_code != "")
        return list(query.all())
Example #13
    def run(self):
        """Collect and process organizations, categories and long descriptions."""

        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        try_until_allowed(Base.metadata.create_all, self.engine)
        limit = 2000 if self.test else None
        batch_size = 30 if self.test else 1000

        with db_session(self.engine) as session:
            all_orgs = session.query(
                Organisation.id, Organisation.addresses).limit(limit).all()
            existing_org_location_ids = session.query(
                OrganisationLocation.id).all()
        logging.info(f"{len(all_orgs)} organisations retrieved from database")
        logging.info(
            f"{len(existing_org_location_ids)} organisations have previously been processed"
        )

        # convert to a list of dictionaries with the nested addresses unpacked
        orgs = get_orgs_to_process(all_orgs, existing_org_location_ids)
        logging.info(f"{len(orgs)} new organisations to geocode")

        total_batches = ceil(len(orgs) / batch_size)
        logging.info(f"{total_batches} batches")
        completed_batches = 0
        for batch in split_batches(orgs, batch_size=batch_size):
            # geocode first to add missing country for UK
            batch = map(geocode_uk_with_postcode, batch)
            batch = map(add_country_details, batch)

            # remove data not in OrganisationLocation columns
            org_location_cols = OrganisationLocation.__table__.columns.keys()
            batch = [{k: v
                      for k, v in org.items() if k in org_location_cols}
                     for org in batch]

            insert_data(self.db_config_env, 'mysqldb', database, Base,
                        OrganisationLocation, batch)
            completed_batches += 1
            logging.info(
                f"Completed {completed_batches} of {total_batches} batches")

            if self.test and completed_batches > 1:
                logging.warning("Breaking after 2 batches in test mode")
                break

        # mark as done
        logging.warning("Finished task")
        self.output().touch()
Example #14
    def run(self):
        db = 'production' if not self.test else 'dev'

        keys = self.get_abstract_file_keys(bucket, key_prefix)
        
        engine = get_mysql_engine(self.db_config_env, 'mysqldb', db)
        with db_session(engine) as session:
            
            existing_projects = set()
            projects = session.query(Projects.application_id).distinct()
            for p in projects:
                existing_projects.add(int(p.application_id))

            projects_done = set()
            projects_mesh = session.query(ProjectMeshTerms.project_id).distinct()
            for p in projects_mesh:
                projects_done.add(int(p.project_id))
            
            mesh_term_ids = {int(m.id) for m in session.query(MeshTerms.id).all()}

        logging.info('Inserting associations')
        
        for key_count, key in enumerate(keys):
            if self.test and (key_count > 2):
                continue
            # collect mesh results from the s3 file and group by project id
            # each project id has a set of mesh terms and corresponding term ids
            df_mesh = retrieve_mesh_terms(bucket, key)
            project_terms = self.format_mesh_terms(df_mesh)
            # go through documents
            for project_count, (project_id, terms) in enumerate(project_terms.items()):
                rows = []
                if self.test and (project_count > 2):
                    continue
                if (project_id in projects_done) or (project_id not in existing_projects):
                    continue

                for term, term_id in zip(terms['terms'], terms['ids']):
                    term_id = int(term_id)
                    # add term to mesh term table if not present
                    if term_id not in mesh_term_ids:
                        objs = insert_data(
                                self.db_config_env, 'mysqldb', db, Base, MeshTerms, 
                                [{'id': term_id, 'term': term}], low_memory=True)
                        mesh_term_ids.update({term_id})
                    # prepare row to be added to project-mesh_term link table
                    rows.append({'project_id': project_id, 'mesh_term_id': term_id})
                # insert rows into the project-mesh_term link table
                insert_data(self.db_config_env, 'mysqldb', db, Base,
                            ProjectMeshTerms, rows, low_memory=True)
        self.output().touch()  # mark the task as done
Example #15
def all_article_ids(engine, limit=None):
    """Retrieve the id of every article from MYSQL.

    Args:
        engine (:obj:`sqlalchemy.engine.Base.Engine`): db connectable.
        limit (int): row limit to apply to query (e.g. for testing)

    Returns:
        (set): all article ids
    """
    with db_session(engine) as session:
        arts = session.query(Article.id).limit(limit)
        return {art.id for art in arts}
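
A minimal usage sketch for all_article_ids; the config environment name and the limit are assumptions, useful when testing against the dev database.

# hypothetical usage: grab a capped sample of article ids while testing
engine = get_mysql_engine('MYSQLDB', 'mysqldb', 'dev')  # assumed config env
sample_ids = all_article_ids(engine, limit=100)
assert len(sample_ids) <= 100
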
Example #16
def get_iso2_to_id():
    """
    Fetch and curate a lookup table of ISO2 code to
    the set of article ids for that ISO2 code.
    """
    logging.info("Retrieving articles iso2 lookup")
    engine = get_mysql_engine()
    with db_session(engine) as session:
        q = session.query(Link.article_id, Inst.country_code)
        q = q.join(Inst, Link.institute_id == Inst.id, isouter=True)
        for field in (Inst.id, Link.article_id):
            q = q.filter(field.isnot(None))
        return list(q.all())
Example #17
def get_members_by_percentile(engine, perc=10):
    """Get the number of meetup group members for a given percentile
    from the database.
    
    Args:
        engine: A SQL alchemy connectable.
        perc (int): A percentile to evaluate.
    Returns:
        members (float): The number of members corresponding to this percentile.
    """
    with db_session(engine) as session:
        rows = (session.query(Group.members).all())
        rows = [r.members for r in rows]
    return float(np.percentile(rows, perc))
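
A minimal usage sketch for get_members_by_percentile; the engine setup is assumed, and perc=50 simply asks for the median group size.

# hypothetical usage: the median number of members across all meetup groups
engine = get_mysql_engine('MYSQLDB', 'mysqldb', 'production')  # assumed config env
median_members = get_members_by_percentile(engine, perc=50)
logging.info(f"Median group size: {median_members}")
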
Example #18
    def _get_uncoded(self):
        """Identifies all the locations in the geographic data table which have not
        previously been processed. If there are none to encode an empty list is
        returned.

        Returns:
            (:obj:`list` of :obj:`dict`) records to process
        """
        with db_session(self.engine) as session:
            uncoded = session.query(
                Geographic.id, Geographic.city,
                Geographic.country).filter(Geographic.done == False)
            uncoded = [u._asdict() for u in uncoded]
        logging.info(f"{len(uncoded)} locations to geocode")
        return uncoded
Example #19
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    db_name = os.environ["BATCHPAR_db_name"]
    batch_size = int(os.environ["BATCHPAR_batch_size"])  # example parameter
    s3_path = os.environ["BATCHPAR_outinfo"]
    start_string = os.environ["BATCHPAR_start_string"]  # example parameter
    offset = int(os.environ["BATCHPAR_offset"])

    # reduce records in test mode
    if test:
        limit = 50
        logging.info(f"Limiting to {limit} rows in test mode")
    else:
        limit = batch_size

    logging.info(f"Processing {offset} - {offset + limit}")

    # database setup
    logging.info(f"Using {db_name} database")
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    try_until_allowed(Base.metadata.create_all, engine)

    with db_session(engine) as session:
        # consider moving this query and the one from the prepare step into a package
        batch_records = (session.query(MyTable.id, MyTable.name).filter(
            MyTable.founded_on > '2007-01-01').offset(offset).limit(limit).all())

    # process and insert data
    processed_batch = []
    for row in batch_records:
        processed_row = some_func(start_string=start_string, row=row)
        processed_batch.append(processed_row)

    logging.info(f"Inserting {len(processed_batch)} rows")
    insert_data("BATCHPAR_config",
                'mysqldb',
                db_name,
                Base,
                MyOtherTable,
                processed_batch,
                low_memory=True)

    logging.info(f"Marking task as done to {s3_path}")
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")

    logging.info("Batch job complete.")
Example #20
    def test_object_to_dict(self):
        parents = [{
            "_id": 10,
            "_another_id": 2,
            "some_field": 20
        }, {
            "_id": 20,
            "_another_id": 2,
            "some_field": 20
        }]
        _parents = insert_data("MYSQLDBCONF", "mysqldb", "production_tests",
                               Base, DummyModel, parents)
        assert len(parents) == len(_parents)

        children = [{
            "_id": 10,
            "parent_id": 10
        }, {
            "_id": 10,
            "parent_id": 20
        }, {
            "_id": 20,
            "parent_id": 20
        }, {
            "_id": 30,
            "parent_id": 20
        }]
        _children = insert_data("MYSQLDBCONF", "mysqldb", "production_tests",
                                Base, DummyChild, children)
        assert len(children) == len(_children)

        # Re-retrieve parents from the database
        found_children = set()
        engine = get_mysql_engine("MYSQLDBCONF", "mysqldb")
        with db_session(engine) as session:
            for p in session.query(DummyModel).all():
                row = object_to_dict(p)
                assert type(row) is dict
                assert len(row['children']) > 0
                _found_children = set(
                    (c['_id'], c['parent_id']) for c in row['children'])
                found_children = found_children.union(_found_children)
                _row = object_to_dict(p, shallow=True)
                assert 'children' not in _row
                del row['children']
                assert row == _row
            assert len(found_children) == len(children) == len(_children)
Example #21
    def prepare(self):
        """Prepare the batch job parameters"""
        # database setup
        database = 'dev' if self.test else 'production'
        logging.info(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        if self.test and database == 'dev':
            logging.warning('Dropping tables')
            Base.metadata.drop_all(self.engine)
        try_until_allowed(Base.metadata.create_all, self.engine)

        with db_session(self.engine) as session:
            if self.test:
                logging.info("Adding test data")
                test_data = []
                for i in range(1000):
                    test_data.append(MyTable(id=i, founded_on='2009-01-01'))
                session.add_all(test_data)
                session.commit()

            logging.info('Retrieving list of records to process')
            total_records = (session
                             .query(MyTable.id)
                             .filter(MyTable.founded_on > '2007-01-01')
                             .count())

        job_params = []  # dictionaries of environmental variables for each batch
        # potential method of generating batches:
        for count, offset in enumerate(range(0, total_records, self.batch_size)):
            key = f"{self.date}_batch_{offset}_{database}"
            done = key in DONE_KEYS
            params = {"config": "mysqldb.config",
                      "db_name": database,
                      "test": self.test,
                      "outinfo": f"s3://{self.intermediate_bucket}/{key}",
                      "done": done,
                      "batch_size": self.batch_size,  # example parameter
                      "start_string": self.start_string,  # example parameter
                      "offset": offset}
            job_params.append(params)
            logging.info(params)

            if self.test and count == TEST_BATCHES:
                logging.info(f"Only {TEST_BATCHES} batches created in test mode")
                break

        return job_params
Example #22
def build_fos_lookup(engine, max_lvl=2):
    """A FoS lookup table of IDs to names
    
    Args:
        engine (sqlalchemy.engine): Database engine.
        max_lvl (int): Maximum FoS level to consider
    Returns:
        fos_lookup (dict): Lookup of IDs to names, where keys and values are (parent, child) pairs.
    """
    with db_session(engine) as session:
        fos = [
            f.__dict__
            for f in (session.query(FoS).filter(FoS.level <= max_lvl).all())
        ]
    fos_children = {f['id']: split_ids(f['child_ids']) for f in fos}
    fos_names = {f['id']: f['name'] for f in fos}
    return {(pid, cid): [fos_names[pid], fos_names[cid]]
            for pid, children in fos_children.items() for cid in children
            if cid in fos_children}
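
A minimal usage sketch for build_fos_lookup; the engine setup is assumed, and the loop just prints each (parent, child) name pair.

# hypothetical usage: print parent -> child field-of-study names up to level 1
engine = get_mysql_engine('MYSQLDB', 'mysqldb', 'dev')  # assumed config env
fos_lookup = build_fos_lookup(engine, max_lvl=1)
for (parent_id, child_id), (parent_name, child_name) in fos_lookup.items():
    print(f"{parent_name} -> {child_name}")
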
Example #23
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    db_name = os.environ["BATCHPAR_db_name"]
    table = os.environ["BATCHPAR_table"]
    batch_size = int(os.environ["BATCHPAR_batch_size"])
    s3_path = os.environ["BATCHPAR_outinfo"]

    logging.warning(f"Processing {table} file")

    # database setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    try_until_allowed(Base.metadata.create_all, engine)
    table_name = f"crunchbase_{table}"
    table_class = get_class_by_tablename(Base, table_name)

    # collect file
    nrows = 1000 if test else None
    df = get_files_from_tar([table], nrows=nrows)[0]
    logging.warning(f"{len(df)} rows in file")

    # get primary key fields and set of all those already existing in the db
    pk_cols = list(table_class.__table__.primary_key.columns)
    pk_names = [pk.name for pk in pk_cols]
    with db_session(engine) as session:
        existing_rows = set(session.query(*pk_cols).all())

    # process and insert data
    processed_rows = process_non_orgs(df, existing_rows, pk_names)
    for batch in split_batches(processed_rows, batch_size):
        insert_data("BATCHPAR_config",
                    'mysqldb',
                    db_name,
                    Base,
                    table_class,
                    batch,
                    low_memory=True)

    logging.warning(f"Marking task as done to {s3_path}")
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")

    logging.warning("Batch job complete.")
Example #24
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    db_name = os.environ["BATCHPAR_db_name"]
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']
    outinfo = os.environ["BATCHPAR_outinfo"]
    output_bucket = 'clio-text2vec'

    # reduce records in test mode
    if test:
        limit = 50
        logging.info(f"Limiting to {limit} rows in test mode")
    else:
        limit = None
    # database setup
    logging.info(f"Using {db_name} database")

    # Get IDs from S3
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(ids)} article IDs retrieved from s3")

    # Connect to SQL
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    with db_session(engine) as session:
        batch_records = (
            session.query(Projects.abstractText).filter(Projects.id.in_(ids))
            # .limit(limit)
            .all())

    # Process the abstracts into vectors
    vectors = docs2vectors([batch.abstractText for batch in batch_records])
    processed_batch = {
        id_: vector.tolist()
        for id_, vector in zip(ids, vectors)
    }
    logging.info(f"Inserting {len(processed_batch)} rows")

    # Store batched vectors in S3
    s3 = boto3.resource('s3')
    obj = s3.Object(output_bucket, f'{outinfo}.json')
    obj.put(Body=json.dumps(processed_batch))
Example #25
def get_objects(from_date):
    """Get all arXiv articles from a given start date.

    Args:
        from_date (str, optional): Min article creation date.

    Returns:
        articles (list): List of arXiv article data.
    """
    logging.info(f"Retrieving articles from at least '{from_date}'")
    engine = get_mysql_engine()
    with db_session(engine) as session:
        query = session.query(Art.id, Art.abstract, Art.title, Art.created)
        query = query.filter(Art.created >= from_date)
        query = query.filter(Art.abstract.isnot(None))
        articles = [
            dict(id=id, text=abstract, title=title, created=created)
            for id, abstract, title, created in query.all()
        ]
    return articles
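
A minimal usage sketch for get_objects; the date is an assumed example value, and the function connects via the no-argument get_mysql_engine call shown above.

# hypothetical usage: collect every article created since the start of 2020
articles = get_objects(from_date='2020-01-01')
logging.info(f"{len(articles)} articles retrieved")
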
Example #26
    def requires(self):
        """
        Collects the last date of successful update from the database and launches the
        iterative data collection task.
        """
        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

        if self.articles_from_date is None:
            logging.info("Extracting latest update date from database")
            query = text("SELECT update_id FROM luigi_table_updates "
                         f"WHERE update_id LIKE '{UPDATE_PREFIX}%'")
            with db_session(self.engine) as session:
                previous_updates = session.execute(query).fetchall()
            previous_updates = [
                update_id for (update_id, ) in previous_updates
            ]
            try:
                latest_update = extract_last_update_date(
                    UPDATE_PREFIX, previous_updates)
            except ValueError:
                raise ValueError(
                    "Date for iterative data collection could not be determined. Set the date manually with --articles-from-date"
                )
            self.articles_from_date = datetime.strftime(
                latest_update, '%Y-%m-%d')

        logging.info(
            f"Updating arxiv data from date: {self.articles_from_date}")

        yield CollectNewTask(date=self.date,
                             _routine_id=self._routine_id,
                             db_config_path=self.db_config_path,
                             db_config_env=self.db_config_env,
                             test=self.test,
                             insert_batch_size=self.insert_batch_size,
                             articles_from_date=self.articles_from_date)
Example #27
def get_core_topics(engine, core_categories, members_limit, perc=99):
    """Get the most frequent topics from a selection of meetup categories,
    from the database.
    
    Args:
        engine: A SQL alchemy connectable.
        core_categories (list): A list of category_shortnames.
        members_limit (int): Minimum number of members required in a group 
                             for it to be considered.
        perc (int): A percentile to evaluate the most frequent topics.
    Returns:
        topics (set): The set of most frequent topics.
    """
    with db_session(engine) as session:
        rows = (session.query(
            Group.topics).filter(Group.members >= members_limit).filter(
                Group.category_shortname.in_(core_categories)).all())
        rows = [r.topics for r in rows]
    topic_counts = Counter(t['name'] for topics in rows for t in topics)
    topic_cutoff = np.percentile(list(float(v) for v in topic_counts.values()),
                                 perc)
    return set(k for k, v in topic_counts.items() if v >= topic_cutoff)
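
A minimal usage sketch for get_core_topics, combined with get_members_by_percentile from Example #17; the category shortnames and percentiles are assumed example values.

# hypothetical usage: frequent topics for two assumed category shortnames
engine = get_mysql_engine('MYSQLDB', 'mysqldb', 'production')  # assumed config env
members_limit = get_members_by_percentile(engine, perc=10)
topics = get_core_topics(engine,
                         core_categories=['tech', 'business'],
                         members_limit=members_limit,
                         perc=99)
logging.info(f"{len(topics)} core topics identified")
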
Example #28
    def prepare(self):
        if self.test:
            self.process_batch_size = 100
        # MySQL setup
        database = 'dev' if self.test else 'production'
        engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

        # Subtract off all done ids
        Base.metadata.create_all(engine)
        with db_session(engine) as session:
            result = session.query(Project.rcn).all()
            done_rcn = {r[0] for r in result}

        # Get all possible ids (or "RCN" in Cordis-speak)
        nrows = 1000 if self.test else None
        all_rcn = set(
            get_framework_ids('fp7', nrows=nrows) +
            get_framework_ids('h2020', nrows=nrows))
        all_rcn = all_rcn - done_rcn

        # Generate the job params
        batches = split_batches(all_rcn, self.process_batch_size)
        params = [{
            "batch_file": put_s3_batch(batch, self.intermediate_bucket,
                                       self.routine_id),
            "config": 'mysqldb.config',
            "db_name": database,
            "bucket": self.intermediate_bucket,
            "outinfo": 'dummy',
            "done": False,
            'test': self.test
        } for batch in batches]
        return params
Example #29
def get_lat_lon():
    """Get all institutes in arXiv which are in Europe

    Returns:
       institutes (tuple): (institute_id, latitude, longitude)
    """
    logging.info("Retrieving articles lat lon lookup")
    engine = get_mysql_engine()
    with db_session(engine) as session:
        # Join institutes to articles via the link table
        q = session.query(Inst.id, Inst.latitude, Inst.longitude)
        q = q.join(Link, Link.institute_id == Inst.id, isouter=True)
        q = q.join(Art, Link.article_id == Art.id, isouter=True)
        # Skip institutes without geo info
        for field in (Inst.id, Art.id, Inst.latitude, Inst.longitude):
            q = q.filter(field.isnot(None))
        # Skip institutes outside of Europe
        q = q.filter(Inst.country_code.in_(EU_COUNTRIES))
        # Group by, in order to deduplicate institutes
        q = q.group_by(Inst.id)
        # Make the request
        return list(q.all())
Example #30
def grid_name_lookup(engine):
    """Constructs a lookup table of Institute names to ids by combining names with
    aliases and cleaned names containing country names in brackets. Multinationals are
    detected.

    Args:
        engine (:obj:`sqlalchemy.engine.base.Engine`): connection to the database

    Returns:
        (dict): lookup table {name: [id1, id2, id3]},
                where the ids are different country entities for multinational institutes.
                Most entities just have a single [id1].
    """
    with db_session(engine) as session:
        institute_name_id_lookup = {institute.name.lower(): [institute.id]
                                    for institute in session.query(Institute).all()}
        logging.info(f"{len(institute_name_id_lookup)} institutes in GRID")

        for alias in session.query(Alias).all():
            institute_name_id_lookup.update({alias.alias.lower(): [alias.grid_id]})
        logging.info(f"{len(institute_name_id_lookup)} institutes after adding aliases")

        # look for institute names containing brackets: IBM (United Kingdom)
        with_country = defaultdict(list)
        for bracketed in (session
                          .query(Institute)
                          .filter(Institute.name.contains('(') & Institute.name.contains(')'))
                          .all()):
            found = re.match(r'(.*) \((.*)\)', bracketed.name)
            if found:
                # combine all matches to a cleaned and lowered country name {IBM : [grid_id1, grid_id2]}
                with_country[found.groups()[0].lower()].append(bracketed.id)
        logging.info(f"{len(with_country)} institutes with country name in the title")

        # append cleaned names to the lookup table
        institute_name_id_lookup.update(with_country)
        logging.info(f"{len(institute_name_id_lookup)} total institute names in lookup")

    return institute_name_id_lookup
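
A minimal usage sketch for grid_name_lookup; the engine setup and the institute name are assumptions, and names are looked up in lower case as built above.

# hypothetical usage: resolve an institute name (lower case) to its GRID id(s)
engine = get_mysql_engine('MYSQLDB', 'mysqldb', 'production')  # assumed config env
name_to_ids = grid_name_lookup(engine)
ibm_ids = name_to_ids.get('ibm', [])
logging.info(f"'ibm' resolves to {len(ibm_ids)} GRID entities")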