def test_process_orgs_generates_org_cats_link_table(
        self, valid_org_data, no_existing_orgs, valid_cat_groups,
        valid_org_descs):
    """One link-table row is emitted per organisation/category pair."""
    _, org_cats, _ = process_orgs(valid_org_data, no_existing_orgs,
                                  valid_cat_groups, valid_org_descs)
    pairs = [('1-1', 'data'), ('1-1', 'digital'), ('1-1', 'cats'),
             ('2-2', 'science'), ('2-2', 'cats'), ('3-3', 'data')]
    expected_result = [{'organization_id': org_id, 'category_name': cat}
                       for org_id, cat in pairs]
    assert org_cats == expected_result
def test_process_orgs_returns_missing_cat_groups(
        self, invalid_org_data, no_existing_orgs, valid_cat_groups,
        valid_org_descs):
    """Categories absent from the supplied groups are reported back."""
    _, org_cats, missing_cat_groups = process_orgs(
        invalid_org_data, no_existing_orgs, valid_cat_groups, valid_org_descs)
    pairs = [('1-1', 'data'), ('1-1', 'digital'), ('1-1', 'dogs'),
             ('2-2', 'science'), ('2-2', 'cats'), ('2-2', 'goats')]
    expected_org_cats = [{'organization_id': org_id, 'category_name': cat}
                         for org_id, cat in pairs]
    assert org_cats == expected_org_cats
    # only the names matter here, not the rest of the group records
    found_missing = {group['category_name'] for group in missing_cat_groups}
    assert found_missing == {'dogs', 'goats'}
def test_process_orgs_removes_redundant_category_columns(
        self, valid_org_data, no_existing_orgs, valid_cat_groups,
        valid_org_descs):
    """Raw category columns are dropped from the processed organisations."""
    processed_orgs, _, _ = process_orgs(valid_org_data, no_existing_orgs,
                                        valid_cat_groups, valid_org_descs)
    frame = pd.DataFrame(processed_orgs)
    for dropped_column in ('category_list', 'category_group_list'):
        assert dropped_column not in frame
def test_process_orgs_removes_country_code_column(
        self, valid_org_data, no_existing_orgs, valid_cat_groups,
        valid_org_descs):
    """The raw country_code column does not survive processing."""
    processed_orgs, _, _ = process_orgs(valid_org_data, no_existing_orgs,
                                        valid_cat_groups, valid_org_descs)
    assert 'country_code' not in pd.DataFrame(processed_orgs)
def test_process_orgs_renames_uuid_column(
        self, valid_org_data, no_existing_orgs, valid_cat_groups,
        valid_org_descs):
    """The uuid column is renamed to id."""
    processed_orgs, _, _ = process_orgs(valid_org_data, no_existing_orgs,
                                        valid_cat_groups, valid_org_descs)
    frame = pd.DataFrame(processed_orgs)
    assert 'id' in frame
    assert 'uuid' not in frame
def test_process_orgs_correctly_applies_country_name(
        self, valid_org_data, no_existing_orgs, valid_cat_groups,
        valid_org_descs):
    """Country codes are translated into full country names."""
    processed_orgs, _, _ = process_orgs(valid_org_data, no_existing_orgs,
                                        valid_cat_groups, valid_org_descs)
    frame = pd.DataFrame(processed_orgs)
    expected_countries = pd.Series(['France', 'Germany', 'United Kingdom'])
    assert_series_equal(frame['country'], expected_countries,
                        check_names=False)
def test_process_orgs_removes_existing_orgs(
        self, valid_org_data, existing_orgs, valid_cat_groups,
        valid_org_descs):
    """Organisations already in the database are filtered out."""
    processed_orgs, _, _ = process_orgs(valid_org_data, existing_orgs,
                                        valid_cat_groups, valid_org_descs)
    frame = pd.DataFrame(processed_orgs)
    # only the org not present in existing_orgs should remain
    assert_series_equal(frame['id'], pd.Series(['1-1']), check_names=False)
def test_process_orgs_inserts_none_if_composite_key_fails(
        self, invalid_org_data, no_existing_orgs, valid_cat_groups,
        valid_org_descs):
    """A failed city/country composite key yields None for location_id."""
    processed_orgs, _, _ = process_orgs(invalid_org_data, no_existing_orgs,
                                        valid_cat_groups, valid_org_descs)
    frame = pd.DataFrame(processed_orgs)
    expected_keys = pd.Series([None, 'berlin_germany',
                               'london_united-kingdom'])
    assert_series_equal(frame['location_id'], expected_keys,
                        check_names=False)
def test_process_orgs_generates_location_id_composite_keys(
        self, valid_org_data, no_existing_orgs, valid_cat_groups,
        valid_org_descs):
    """location_id is built as a city_country composite key."""
    processed_orgs, _, _ = process_orgs(valid_org_data, no_existing_orgs,
                                        valid_cat_groups, valid_org_descs)
    frame = pd.DataFrame(processed_orgs)
    expected_keys = pd.Series(['paris_france', 'berlin_germany',
                               'london_united-kingdom'])
    assert_series_equal(frame['location_id'], expected_keys,
                        check_names=False)
def test_process_orgs_inserts_none_for_unfound_long_descriptions(
        self, valid_org_data, no_existing_orgs, valid_cat_groups,
        invalid_org_descs):
    """Organisations without a matching description get None."""
    processed_orgs, _, _ = process_orgs(valid_org_data, no_existing_orgs,
                                        valid_cat_groups, invalid_org_descs)
    frame = pd.DataFrame(processed_orgs)
    expected_descs = pd.DataFrame({
        'id': ['1-1', '2-2', '3-3'],
        'long_description': [None, 'org two', 'org three'],
    })
    assert_frame_equal(frame[['id', 'long_description']], expected_descs,
                       check_like=True)
def run(self):
    """Collect and process organizations, categories and long descriptions.

    Creates any missing tables, loads the raw files, inserts category
    groups, then organisations in batches, then the org-category link
    table, and finally marks the task as done.
    """
    # database setup
    database = 'dev' if self.test else 'production'
    logging.warning(f"Using {database} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
    try_until_allowed(Base.metadata.create_all, self.engine)

    # collect files; cap the row count when running in test mode
    nrows = 200 if self.test else None
    cat_groups, orgs, org_descriptions = get_files_from_tar(
        ['category_groups', 'organizations', 'organization_descriptions'],
        nrows=nrows)

    # process category_groups
    cat_groups = rename_uuid_columns(cat_groups)
    insert_data(self.db_config_env, 'mysqldb', database, Base, CategoryGroup,
                cat_groups.to_dict(orient='records'), low_memory=True)

    # process organizations and categories
    with db_session(self.engine) as session:
        existing_orgs = session.query(Organization.id).all()
        existing_orgs = {org[0] for org in existing_orgs}

    logging.info("Summary of organisation data:")
    logging.info(f"Total number of organisations:\t {len(orgs)}")
    logging.info(
        f"Number of organisations already in the database:\t {len(existing_orgs)}"
    )
    logging.info(f"Number of category groups and text descriptions:\t"
                 f"{len(cat_groups)}, {len(org_descriptions)}")

    processed_orgs, org_cats, missing_cat_groups = process_orgs(
        orgs, existing_orgs, cat_groups, org_descriptions)

    # Insert any newly discovered CategoryGroups before the link table
    # references them
    insert_data(self.db_config_env, 'mysqldb', database, Base, CategoryGroup,
                missing_cat_groups)

    # Insert orgs in batches. Use ceiling division so a trailing partial
    # batch is counted: round() under-reported the total (e.g. it logged
    # 0 batches when the data was smaller than half a batch size).
    n_batches = -(-len(processed_orgs) // self.insert_batch_size)
    logging.info(
        f"Inserting {n_batches} batches of size {self.insert_batch_size}")
    for i, batch in enumerate(
            split_batches(processed_orgs, self.insert_batch_size)):
        if i % 100 == 0:
            logging.info(f"Inserting batch {i} of {n_batches}")
        insert_data(self.db_config_env, 'mysqldb', database, Base,
                    Organization, batch, low_memory=True)

    # link table needs to be inserted via non-bulk method to enforce relationship
    logging.info("Filtering duplicates...")
    org_cats, existing_org_cats, failed_org_cats = filter_out_duplicates(
        self.db_config_env, 'mysqldb', database, Base,
        OrganizationCategory, org_cats, low_memory=True)
    logging.info(
        f"Inserting {len(org_cats)} org categories "
        f"({len(existing_org_cats)} already existed and {len(failed_org_cats)} failed)"
    )
    # NOTE(review): session.add_all implies filter_out_duplicates returns
    # ORM instances, not dicts — confirm against its implementation
    with db_session(self.engine) as session:
        session.add_all(org_cats)

    # mark as done
    self.output().touch()