Example 1
    def run(self):
        db = 'production' if not self.test else 'dev'

        keys = self.get_abstract_file_keys(bucket, key_prefix)
        
        engine = get_mysql_engine(self.db_config_env, 'mysqldb', db)
        with db_session(engine) as session:
            
            # projects already present in the database; needed in both test and
            # production mode for the membership check further down
            existing_projects = set()
            projects = session.query(Projects.application_id).distinct()
            for p in projects:
                existing_projects.add(int(p.application_id))
            
            projects_done = set()
            projects_mesh = session.query(ProjectMeshTerms.project_id).distinct()
            for p in projects_mesh:
                projects_done.add(int(p.project_id))
            
            mesh_term_ids = {int(m.id) for m in session.query(MeshTerms.id).all()}

        logging.info('Inserting associations')
        
        for key_count, key in enumerate(keys):
            if self.test and (key_count > 2):
                continue
            # collect mesh results from the s3 file and group them by project id;
            # each project id maps to a set of mesh terms and corresponding term ids
            df_mesh = retrieve_mesh_terms(bucket, key)
            project_terms = self.format_mesh_terms(df_mesh)
            # go through documents
            for project_count, (project_id, terms) in enumerate(project_terms.items()):
                rows = []
                if self.test and (project_count > 2):
                    continue
                if (project_id in projects_done) or (project_id not in existing_projects):
                    continue

                for term, term_id in zip(terms['terms'], terms['ids']):
                    term_id = int(term_id)
                    # add term to mesh term table if not present
                    if term_id not in mesh_term_ids:
                        insert_data(self.db_config_env, 'mysqldb', db, Base,
                                    MeshTerms, [{'id': term_id, 'term': term}],
                                    low_memory=True)
                        mesh_term_ids.add(term_id)
                    # prepare row to be added to project-mesh_term link table
                    rows.append({'project_id': project_id, 'mesh_term_id': term_id})
                # insert rows into the link table
                insert_data(self.db_config_env, 'mysqldb', db, Base, 
                        ProjectMeshTerms, rows, low_memory=True)
        self.output().touch()  # mark the project-mesh_term task as complete
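
The loop above consumes the output of `format_mesh_terms` through `terms['terms']` and `terms['ids']`, so the intermediate structure is a dict keyed by project id. A minimal, self-contained sketch of the dedup-and-link step with hand-built values (the ids and terms here are invented; only the shape matches the code above):

# Illustrative stand-in for the project_terms structure used in Example 1.
project_terms = {
    12345: {'terms': ['Humans', 'Neoplasms'], 'ids': [1, 2]},
    67890: {'terms': ['Humans'], 'ids': [1]},
}
mesh_term_ids = {1}       # mesh term ids already in the MeshTerms table
projects_done = {67890}   # projects already linked to mesh terms

rows = []
for project_id, terms in project_terms.items():
    if project_id in projects_done:
        continue
    for term, term_id in zip(terms['terms'], terms['ids']):
        if term_id not in mesh_term_ids:
            mesh_term_ids.add(term_id)   # the task calls insert_data here
        rows.append({'project_id': project_id, 'mesh_term_id': term_id})

print(rows)
# [{'project_id': 12345, 'mesh_term_id': 1}, {'project_id': 12345, 'mesh_term_id': 2}]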
Example 2
    def run(self):
        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

        # collect mesh terms from S3
        bucket = 'innovation-mapping-general'
        key = 'crunchbase_descriptions/crunchbase_descriptions_mesh.txt'
        mesh_terms = retrieve_mesh_terms(bucket, key)
        mesh_terms = format_mesh_terms(
            mesh_terms)  # {org_id: ['term1', 'term2'], ...}
        logging.info(f"File contains {len(mesh_terms)} orgs with mesh terms")

        logging.info("Extracting previously processed orgs")
        with db_session(self.engine) as session:
            all_orgs = session.query(Organization.id,
                                     Organization.mesh_terms).all()
        processed_orgs = {
            org_id
            for (org_id, mesh_terms) in all_orgs if mesh_terms is not None
        }
        all_orgs = {org_id for (org_id, _) in all_orgs}
        logging.info(f"{len(all_orgs)} total orgs in database")
        logging.info(f"{len(processed_orgs)} previously processed orgs")

        # reformat for bulk update, dropping orgs not in the database or already processed
        meshed_orgs = [{'id': org_id, 'mesh_terms': '|'.join(terms)}
                       for org_id, terms in mesh_terms.items()
                       if org_id in all_orgs and org_id not in processed_orgs]

        logging.info(f"{len(meshed_orgs)} organisations to update in database")

        for count, batch in enumerate(
                split_batches(meshed_orgs, self.insert_batch_size), 1):
            with db_session(self.engine) as session:
                session.bulk_update_mappings(Organization, batch)
            logging.info(
                f"{count} batch{'es' if count > 1 else ''} written to db")
            if self.test and count > 1:
                logging.info("Breaking after 2 batches while in test mode")
                break

        # mark as done
        logging.warning("Task complete")
        self.output().touch()
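
Two pieces do the heavy lifting above: `split_batches`, which chunks `meshed_orgs` into lists of at most `insert_batch_size` dicts, and SQLAlchemy's `bulk_update_mappings`, which matches each dict to an `Organization` row by its primary key ('id') and updates the remaining columns. The chunking helper is not shown in this example; a plausible stand-in, assuming it simply slices the iterable, looks like this:

def split_batches(iterable, batch_size):
    """Yield successive lists of at most batch_size items (illustrative stand-in)."""
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

# e.g. 5 orgs split into batches of 2 -> batch sizes [2, 2, 1]
orgs = [{'id': i, 'mesh_terms': 'Humans'} for i in range(5)]
assert [len(b) for b in split_batches(orgs, 2)] == [2, 2, 1]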
Example 3
def run():

    # Fetch the input parameters
    s3_bucket = os.environ["BATCHPAR_bucket"]
    batch_file = os.environ["BATCHPAR_batch_file"]
    members_perc = int(os.environ["BATCHPAR_members_perc"])
    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]
    routine_id = os.environ["BATCHPAR_routine_id"]

    # Get continent lookup
    url = ("https://nesta-open-data.s3.eu-west-2"
           ".amazonaws.com/rwjf-viz/continent_codes_names.json")
    continent_lookup = {row["Code"]: row["Name"] 
                        for row in requests.get(url).json()}
    continent_lookup[None] = None

    # Extract the core topics
    logging.debug('Getting topics')
    s3 = boto3.resource('s3')
    topics_key = f'meetup-topics-{routine_id}.json'
    topics_obj = s3.Object(s3_bucket, topics_key)
    core_topics = set(json.loads(topics_obj.get()['Body'].read()))

    # Extract the group ids for this task
    ids_obj = s3.Object(s3_bucket, batch_file)
    group_ids = set(json.loads(ids_obj.get()['Body'].read()))

    # Extract the mesh terms for this task
    df_mesh = retrieve_mesh_terms('innovation-mapping-general',
                                  'meetup_mesh/meetup_mesh_processed.txt')
    mesh_terms = format_mesh_terms(df_mesh)

    # Setup ES+
    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
                                                 "health_scanner.json")
    strans_kwargs = {'filename': 'meetup.json',
                     'from_key': 'tier_0',
                     'to_key': 'tier_1',
                     'ignore': []}
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           field_null_mapping=field_null_mapping,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           country_detection=True,
                           auto_translate=True)

    # Generate the lookup for geographies
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    geo_lookup = {}
    with db_session(engine) as session:
        query_result = session.query(Geographic).all()
        for geography in query_result:
            geo_lookup[geography.id] = {k: v for k, v in 
                                        geography.__dict__.items()
                                        if k in geography.__table__.columns}

    # Pipe the groups
    members_limit = get_members_by_percentile(engine, perc=members_perc)
    with db_session(engine) as session:
        query_result = (session
                        .query(Group)
                        .filter(Group.members >= members_limit)
                        .filter(Group.id.in_(group_ids))
                        .all())
        for count, group in enumerate(query_result, 1):
            row = {k: v for k, v in group.__dict__.items()
                   if k in group.__table__.columns}

            # Filter groups without the required topics
            topics = [topic['name'] for topic in group.topics
                      if topic['name'] in core_topics]
            if len(topics) == 0:
                continue

            # Assign mesh terms
            mesh_id = f'{row["id"]}'.zfill(8)
            row['mesh_terms'] = None
            if mesh_id in mesh_terms:
                row['mesh_terms'] = mesh_terms[mesh_id]

            # Get the geographic data for this row
            country_name = country_iso_code_to_name(row['country'], iso2=True)
            geo_key = generate_composite_key(row['city'], country_name)
            geo = geo_lookup[geo_key]

            # Clean up the input data
            row['topics'] = topics
            row['urlname'] = f"https://www.meetup.com/{row['urlname']}"
            row['coordinate'] = dict(lat=geo['latitude'], lon=geo['longitude'])
            # 'created' arrives as epoch milliseconds
            row['created'] = dt.fromtimestamp(row['created'] / 1000).strftime("%Y-%m-%d")
            if row['description'] is not None:
                row['description'] = BeautifulSoup(row['description'], 'lxml').text                
            row['continent'] = continent_lookup[geo['continent']]
            row['country_name'] = geo['country']
            row['continent_id'] = geo['continent']
            row['country'] = geo['country_alpha_2']
            row['iso3'] = geo['country_alpha_3']
            row['isoNumeric'] = geo['country_numeric']

            # Insert to ES
            es.index(index=es_index, doc_type=es_type,
                     id=row['id'], body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to elasticsearch")

    logging.info("Batch job complete.")
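
Two of the row transformations above are easy to check in isolation: Meetup's `created` field is an epoch timestamp in milliseconds that gets reduced to a date string, and the mesh lookup key is the group id zero-padded to eight characters. A quick, self-contained check with invented values:

from datetime import datetime as dt

created_ms = 1262304000000                    # 2010-01-01T00:00:00Z as epoch milliseconds
created = dt.fromtimestamp(created_ms / 1000).strftime("%Y-%m-%d")
print(created)                                # '2010-01-01' (depends on local timezone)

mesh_id = f'{1234}'.zfill(8)                  # group ids are zero-padded to 8 characters
assert mesh_id == '00001234'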
Example 4
def run():
    bucket = os.environ["BATCHPAR_s3_bucket"]
    abstract_file = os.environ["BATCHPAR_s3_key"]
    dupe_file = os.environ["BATCHPAR_dupe_file"]
    es_config = literal_eval(os.environ["BATCHPAR_outinfo"])
    db = os.environ["BATCHPAR_db"]
    entity_type = os.environ["BATCHPAR_entity_type"]

    # mysql setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db)
    Session = sessionmaker(bind=engine)
    session = Session()

    # retrieve a batch of meshed terms
    mesh_terms = retrieve_mesh_terms(bucket, abstract_file)
    mesh_terms = format_mesh_terms(mesh_terms)
    logging.info(f'batch {abstract_file} contains '
                 f'{len(mesh_terms)} meshed abstracts')

    # retrieve duplicate map
    dupes = retrieve_duplicate_map(bucket, dupe_file)
    dupes = format_duplicate_map(dupes)

    # Set up elastic search connection
    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
                                                 "health_scanner.json")
    es = ElasticsearchPlus(hosts=es_config['host'],
                           port=es_config['port'],
                           aws_auth_region=es_config['region'],
                           use_ssl=True,
                           entity_type=entity_type,
                           strans_kwargs=None,
                           field_null_mapping=field_null_mapping,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           country_detection=True,
                           listify_terms=True)
    all_es_ids = get_es_ids(es, es_config)

    docs = []
    for doc_id, terms in mesh_terms.items():
        if doc_id not in all_es_ids:
            continue
        try:
            _filter = Abstracts.application_id == doc_id
            abstract = session.query(Abstracts).filter(_filter).one()
        except NoResultFound:
            logging.warning(f'{doc_id} not found in database')
            raise NoResultFound(doc_id)
        clean_abstract_text = clean_abstract(abstract.abstract_text)
        docs.append({
            'doc_id': doc_id,
            'terms_mesh_abstract': terms,
            'textBody_abstract_project': clean_abstract_text
        })
        duped_docs = dupes.get(doc_id, [])
        if len(duped_docs) > 0:
            logging.info(f'Found {len(duped_docs)} duplicates')
        for duped_doc in duped_docs:
            docs.append({
                'doc_id': duped_doc,
                'terms_mesh_abstract': terms,
                'textBody_abstract_project': clean_abstract_text,
                'booleanFlag_duplicate_abstract': True
            })

    # output to elasticsearch
    logging.warning(f'Writing {len(docs)} documents to elasticsearch')
    for doc in docs:
        uid = doc.pop("doc_id")
        # Extract existing info
        existing = es.get(es_config['index'],
                          doc_type=es_config['type'],
                          id=uid)['_source']
        # Merge existing info into new doc
        doc = {**existing, **doc}
        es.index(index=es_config['index'],
                 doc_type=es_config['type'],
                 id=uid,
                 body=doc)
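
The final loop is a read-merge-write update: the existing `_source` is fetched from the index, the new fields are layered on top with a dict merge, and the merged body is re-indexed under the same id. The merge itself is plain Python, and the right-hand operand wins on a key collision; a small sketch with invented field values:

existing = {'terms_mesh_abstract': ['Humans'],
            'year': 2018}
new_fields = {'terms_mesh_abstract': ['Humans', 'Neoplasms'],
              'textBody_abstract_project': 'Cleaned abstract text'}

merged = {**existing, **new_fields}   # keys in new_fields override existing ones
assert merged['terms_mesh_abstract'] == ['Humans', 'Neoplasms']
assert merged['year'] == 2018         # untouched fields are preserved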