def test_get_files_from_tar_limits_rows(mocked_crunchbase_tar, crunchbase_tarfile):
    """The nrows argument should truncate each extracted table to that many rows."""
    mocked_crunchbase_tar.return_value = tarfile.open(crunchbase_tarfile.name)

    dfs = get_files_from_tar(['test_0'], nrows=1)  # only return 1 row

    expected_result = pd.DataFrame({'id': [111], 'data': ['aaa']})
    assert_frame_equal(dfs[0], expected_result, check_like=True)
def test_get_files_from_tar(mocked_crunchbase_tar, crunchbase_tarfile):
    """Extracting a table from the tar should yield a list of dataframes."""
    mocked_crunchbase_tar.return_value = tarfile.open(crunchbase_tarfile.name)

    dfs = get_files_from_tar(['test_0'])
    assert type(dfs) == list

    expected_result = pd.DataFrame({'id': [111, 222], 'data': ['aaa', 'bbb']})
    assert_frame_equal(dfs[0], expected_result, check_like=True)
def run(self):
    """Fill in Organization.parent_id from the crunchbase org_parents export.

    Only organisations that exist in the database and do not yet have a
    parent_id are updated.
    """
    # database setup
    db = 'dev' if self.test else 'production'
    logging.warning(f"Using {db} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', db)

    # collect file
    logging.info(f"Collecting org_parents from crunchbase tar")
    org_parents = get_files_from_tar(['org_parents'])[0]
    logging.info(f"{len(org_parents)} parent ids in crunchbase export")

    # collect previously processed orgs
    logging.info("Extracting previously processed organisations")
    with db_session(self.engine) as session:
        rows = session.query(Organization.id, Organization.parent_id).all()
    all_orgs = {org_id for org_id, _ in rows}
    logging.info(f"{len(all_orgs)} total orgs in database")
    # an org counts as "processed" once its parent_id has been written
    processed_orgs = {org_id for org_id, parent_id in rows
                      if parent_id is not None}
    logging.info(f"{len(processed_orgs)} previously processed orgs")

    # reformat into a list of dicts, removing orgs that already have a
    # parent_id or are missing from the database
    org_parents = org_parents[['uuid', 'parent_uuid']]
    org_parents.columns = ['id', 'parent_id']
    in_db = org_parents['id'].isin(all_orgs)
    already_done = org_parents['id'].isin(processed_orgs)
    org_parents = org_parents[in_db & ~already_done].to_dict(orient='records')
    logging.info(f"{len(org_parents)} organisations to update in MYSQL")

    # insert parent_ids into db in batches
    batches = split_batches(org_parents, self.insert_batch_size)
    for count, batch in enumerate(batches, 1):
        with db_session(self.engine) as session:
            session.bulk_update_mappings(Organization, batch)
        logging.info(f"{count} batch{'es' if count > 1 else ''} written to db")
        if self.test and count > 1:
            logging.info("Breaking after 2 batches while in test mode")
            break

    # mark as done
    logging.warning("Task complete")
    self.output().touch()
def run():
    """Batch-job entry point: load one crunchbase table from the tar export
    into MySQL.

    Reads its configuration from BATCHPAR_* environment variables, skips rows
    whose primary keys already exist in the target table, and touches an S3
    object to mark completion.
    """
    test = literal_eval(os.environ["BATCHPAR_test"])
    db_name = os.environ["BATCHPAR_db_name"]
    table = os.environ["BATCHPAR_table"]
    batch_size = int(os.environ["BATCHPAR_batch_size"])
    s3_path = os.environ["BATCHPAR_outinfo"]

    logging.warning(f"Processing {table} file")

    # database setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    try_until_allowed(Base.metadata.create_all, engine)
    table_name = f"crunchbase_{table}"
    table_class = get_class_by_tablename(Base, table_name)

    # collect file (cap the row count while in test mode)
    nrows = 1000 if test else None
    df = get_files_from_tar([table], nrows=nrows)[0]
    logging.warning(f"{len(df)} rows in file")

    # get primary key fields and set of all those already existing in the db
    pk_cols = list(table_class.__table__.primary_key.columns)
    pk_names = [pk.name for pk in pk_cols]
    with db_session(engine) as session:
        existing_rows = set(session.query(*pk_cols).all())

    # process and insert data
    processed_rows = process_non_orgs(df, existing_rows, pk_names)
    for batch in split_batches(processed_rows, batch_size):
        # Bug fix: insert the current batch, not the whole processed_rows
        # list (the original re-inserted every row once per batch).
        insert_data("BATCHPAR_config", 'mysqldb', db_name, Base,
                    table_class, batch, low_memory=True)

    logging.warning(f"Marking task as done to {s3_path}")
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")
    logging.warning("Batch job complete.")
def run(self):
    """Collect and process organizations, categories and long descriptions."""
    # database setup — use the dev database while in test mode
    database = 'dev' if self.test else 'production'
    logging.warning(f"Using {database} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
    try_until_allowed(Base.metadata.create_all, self.engine)

    # collect files (row cap applies per-table while in test mode)
    nrows = 200 if self.test else None
    cat_groups, orgs, org_descriptions = get_files_from_tar(
        ['category_groups', 'organizations', 'organization_descriptions'],
        nrows=nrows)

    # process category_groups
    cat_groups = rename_uuid_columns(cat_groups)
    insert_data(self.db_config_env, 'mysqldb', database, Base, CategoryGroup,
                cat_groups.to_dict(orient='records'), low_memory=True)

    # process organizations and categories: collect the ids already in the
    # database so they can be excluded from processing
    with db_session(self.engine) as session:
        existing_orgs = session.query(Organization.id).all()
        existing_orgs = {org[0] for org in existing_orgs}

    logging.info("Summary of organisation data:")
    logging.info(f"Total number of organisations:\t {len(orgs)}")
    logging.info(
        f"Number of organisations already in the database:\t {len(existing_orgs)}"
    )
    logging.info(f"Number of category groups and text descriptions:\t"
                 f"{len(cat_groups)}, {len(org_descriptions)}")

    processed_orgs, org_cats, missing_cat_groups = process_orgs(
        orgs, existing_orgs, cat_groups, org_descriptions)

    # Insert CatGroups that process_orgs found to be missing
    insert_data(self.db_config_env, 'mysqldb', database, Base, CategoryGroup,
                missing_cat_groups)

    # Insert orgs in batches
    # NOTE(review): round() can under-count by one batch (e.g. 10 rows at
    # batch size 20 rounds to 0) — only affects the log message, not inserts
    n_batches = round(len(processed_orgs) / self.insert_batch_size)
    logging.info(
        f"Inserting {n_batches} batches of size {self.insert_batch_size}")
    for i, batch in enumerate(
            split_batches(processed_orgs, self.insert_batch_size)):
        if i % 100 == 0:  # throttle progress logging to every 100th batch
            logging.info(f"Inserting batch {i} of {n_batches}")
        insert_data(self.db_config_env, 'mysqldb', database, Base,
                    Organization, batch, low_memory=True)

    # link table needs to be inserted via non-bulk method to enforce relationship
    logging.info("Filtering duplicates...")
    org_cats, existing_org_cats, failed_org_cats = filter_out_duplicates(
        self.db_config_env, 'mysqldb', database, Base,
        OrganizationCategory, org_cats, low_memory=True)
    logging.info(
        f"Inserting {len(org_cats)} org categories "
        f"({len(existing_org_cats)} already existed and {len(failed_org_cats)} failed)"
    )
    # presumably filter_out_duplicates returns session-addable objects here,
    # given the commented-out conversion below — TODO confirm
    #org_cats = [OrganizationCategory(**org_cat) for org_cat in org_cats]
    with db_session(self.engine) as session:
        session.add_all(org_cats)

    # mark as done
    self.output().touch()