예제 #1
0
    def test_process_orgs_generates_org_cats_link_table(
            self, valid_org_data, no_existing_orgs, valid_cat_groups,
            valid_org_descs):
        """Check the organization/category link rows built from valid data."""
        _, link_rows, _ = process_orgs(valid_org_data, no_existing_orgs,
                                       valid_cat_groups, valid_org_descs)

        # (organization_id, category_name) pairs, in the order they are
        # expected to come back.
        expected_pairs = [
            ('1-1', 'data'),
            ('1-1', 'digital'),
            ('1-1', 'cats'),
            ('2-2', 'science'),
            ('2-2', 'cats'),
            ('3-3', 'data'),
        ]
        expected_rows = [
            {'organization_id': org_id, 'category_name': category}
            for org_id, category in expected_pairs
        ]

        assert link_rows == expected_rows
예제 #2
0
    def test_process_orgs_returns_missing_cat_groups(self, invalid_org_data,
                                                     no_existing_orgs,
                                                     valid_cat_groups,
                                                     valid_org_descs):
        """Check link rows and reported missing category groups.

        With the invalid fixture data, the categories 'dogs' and 'goats'
        are expected both in the link rows and in the missing groups.
        """
        _, link_rows, missing_groups = process_orgs(invalid_org_data,
                                                    no_existing_orgs,
                                                    valid_cat_groups,
                                                    valid_org_descs)

        # (organization_id, category_name) pairs in expected order.
        expected_pairs = [
            ('1-1', 'data'),
            ('1-1', 'digital'),
            ('1-1', 'dogs'),
            ('2-2', 'science'),
            ('2-2', 'cats'),
            ('2-2', 'goats'),
        ]
        expected_rows = [
            {'organization_id': org_id, 'category_name': category}
            for org_id, category in expected_pairs
        ]

        # Only the names matter for the missing groups, not their order.
        missing_names = set(
            group['category_name'] for group in missing_groups)
        expected_missing_names = {'dogs', 'goats'}

        assert link_rows == expected_rows
        assert missing_names == expected_missing_names
예제 #3
0
    def test_process_orgs_removes_redundant_category_columns(
            self, valid_org_data, no_existing_orgs, valid_cat_groups,
            valid_org_descs):
        """Check the category list columns are absent from processed orgs."""
        org_rows, _, _ = process_orgs(valid_org_data, no_existing_orgs,
                                      valid_cat_groups, valid_org_descs)
        org_frame = pd.DataFrame(org_rows)

        # Membership on a DataFrame tests its column labels.
        for redundant_column in ('category_list', 'category_group_list'):
            assert redundant_column not in org_frame
예제 #4
0
    def test_process_orgs_removes_country_code_column(self, valid_org_data,
                                                      no_existing_orgs,
                                                      valid_cat_groups,
                                                      valid_org_descs):
        """Check 'country_code' is not a column of the processed orgs."""
        org_rows, _, _ = process_orgs(valid_org_data, no_existing_orgs,
                                      valid_cat_groups, valid_org_descs)
        org_frame = pd.DataFrame(org_rows)

        column_present = 'country_code' in org_frame
        assert not column_present
예제 #5
0
    def test_process_orgs_renames_uuid_column(self, valid_org_data,
                                              no_existing_orgs,
                                              valid_cat_groups,
                                              valid_org_descs):
        """Check processed orgs expose an 'id' column and no 'uuid' column."""
        org_rows, _, _ = process_orgs(valid_org_data, no_existing_orgs,
                                      valid_cat_groups, valid_org_descs)
        org_frame = pd.DataFrame(org_rows)

        has_id = 'id' in org_frame
        has_uuid = 'uuid' in org_frame

        assert has_id
        assert not has_uuid
예제 #6
0
    def test_process_orgs_correctly_applies_country_name(
            self, valid_org_data, no_existing_orgs, valid_cat_groups,
            valid_org_descs):
        """Check the 'country' column holds the expected country names."""
        org_rows, _, _ = process_orgs(valid_org_data, no_existing_orgs,
                                      valid_cat_groups, valid_org_descs)
        org_frame = pd.DataFrame(org_rows)

        expected_names = ['France', 'Germany', 'United Kingdom']
        expected_countries = pd.Series(expected_names)
        actual_countries = org_frame['country']

        # Series names are ignored; only values and index are compared.
        assert_series_equal(actual_countries,
                            expected_countries,
                            check_names=False)
예제 #7
0
    def test_process_orgs_removes_existing_orgs(self, valid_org_data,
                                                existing_orgs,
                                                valid_cat_groups,
                                                valid_org_descs):
        """Check only id '1-1' remains when existing_orgs is supplied."""
        org_rows, _, _ = process_orgs(valid_org_data, existing_orgs,
                                      valid_cat_groups, valid_org_descs)
        org_frame = pd.DataFrame(org_rows)

        remaining_ids = org_frame['id']
        expected_ids = pd.Series(['1-1'])

        # Series names are ignored; only values and index are compared.
        assert_series_equal(remaining_ids, expected_ids, check_names=False)
예제 #8
0
    def test_process_orgs_inserts_none_if_composite_key_fails(
            self, invalid_org_data, no_existing_orgs, valid_cat_groups,
            valid_org_descs):
        """Check location_id is None for the first invalid-fixture row."""
        org_rows, _, _ = process_orgs(invalid_org_data, no_existing_orgs,
                                      valid_cat_groups, valid_org_descs)
        org_frame = pd.DataFrame(org_rows)

        expected_keys = [None, 'berlin_germany', 'london_united-kingdom']
        expected_location_ids = pd.Series(expected_keys)
        actual_location_ids = org_frame['location_id']

        # Series names are ignored; only values and index are compared.
        assert_series_equal(actual_location_ids,
                            expected_location_ids,
                            check_names=False)
예제 #9
0
    def test_process_orgs_generates_location_id_composite_keys(
            self, valid_org_data, no_existing_orgs, valid_cat_groups,
            valid_org_descs):
        """Check location_id holds the expected keys for valid data."""
        org_rows, _, _ = process_orgs(valid_org_data, no_existing_orgs,
                                      valid_cat_groups, valid_org_descs)
        org_frame = pd.DataFrame(org_rows)

        expected_keys = [
            'paris_france',
            'berlin_germany',
            'london_united-kingdom',
        ]
        expected_location_ids = pd.Series(expected_keys)
        actual_location_ids = org_frame['location_id']

        # Series names are ignored; only values and index are compared.
        assert_series_equal(actual_location_ids,
                            expected_location_ids,
                            check_names=False)
예제 #10
0
    def test_process_orgs_inserts_none_for_unfound_long_descriptions(
            self, valid_org_data, no_existing_orgs, valid_cat_groups,
            invalid_org_descs):
        """Check long_description is None for org '1-1' with invalid descs."""
        org_rows, _, _ = process_orgs(valid_org_data, no_existing_orgs,
                                      valid_cat_groups,
                                      invalid_org_descs)
        org_frame = pd.DataFrame(org_rows)

        expected_ids = ['1-1', '2-2', '3-3']
        expected_descriptions = [None, 'org two', 'org three']
        expected_frame = pd.DataFrame({
            'id': expected_ids,
            'long_description': expected_descriptions,
        })
        compared_columns = ['id', 'long_description']

        # check_like=True makes the comparison ignore index/column order.
        assert_frame_equal(org_frame[compared_columns],
                           expected_frame,
                           check_like=True)
예제 #11
0
    def run(self):
        """Collect and process organizations, categories and long descriptions.

        Works against the 'dev' database when ``self.test`` is truthy and
        against 'production' otherwise. In order, the method:

        1. creates the tables of ``Base.metadata`` via ``try_until_allowed``;
        2. loads the category group, organization and organization
           description tables from the tar (``nrows=200`` in test mode);
        3. inserts the category groups, then any extra groups that
           ``process_orgs`` reports as missing;
        4. inserts the processed organizations in batches of
           ``self.insert_batch_size``;
        5. filters duplicates out of the organization/category link rows
           and adds the remainder through a regular session;
        6. touches the task output to mark the task as done.
        """

        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        try_until_allowed(Base.metadata.create_all, self.engine)

        # collect files
        nrows = 200 if self.test else None
        cat_groups, orgs, org_descriptions = get_files_from_tar(
            ['category_groups', 'organizations', 'organization_descriptions'],
            nrows=nrows)
        # process category_groups
        cat_groups = rename_uuid_columns(cat_groups)
        insert_data(self.db_config_env,
                    'mysqldb',
                    database,
                    Base,
                    CategoryGroup,
                    cat_groups.to_dict(orient='records'),
                    low_memory=True)

        # process organizations and categories
        with db_session(self.engine) as session:
            existing_orgs = session.query(Organization.id).all()
        # each query row is a 1-tuple; keep just the id for fast membership
        existing_orgs = {org[0] for org in existing_orgs}

        logging.info("Summary of organisation data:")
        logging.info(f"Total number of organisations:\t {len(orgs)}")
        logging.info(
            f"Number of organisations already in the database:\t {len(existing_orgs)}"
        )
        logging.info(f"Number of category groups and text descriptions:\t"
                     f"{len(cat_groups)}, {len(org_descriptions)}")

        processed_orgs, org_cats, missing_cat_groups = process_orgs(
            orgs, existing_orgs, cat_groups, org_descriptions)
        # Insert CatGroups
        insert_data(self.db_config_env, 'mysqldb', database, Base,
                    CategoryGroup, missing_cat_groups)
        # Insert orgs in batches.
        # Ceiling division: round() under-reports the batch count whenever
        # the remainder is at most half a batch (e.g. 250 rows / 100 -> 2).
        # NOTE(review): assumes split_batches yields a final partial batch
        # — confirm against its implementation.
        n_batches = -(-len(processed_orgs) // self.insert_batch_size)
        logging.info(
            f"Inserting {n_batches} batches of size {self.insert_batch_size}")
        for i, batch in enumerate(
                split_batches(processed_orgs, self.insert_batch_size)):
            # only log every 100th batch to keep the log readable
            if i % 100 == 0:
                logging.info(f"Inserting batch {i} of {n_batches}")
            insert_data(self.db_config_env,
                        'mysqldb',
                        database,
                        Base,
                        Organization,
                        batch,
                        low_memory=True)

        # link table needs to be inserted via non-bulk method to enforce relationship
        logging.info("Filtering duplicates...")
        org_cats, existing_org_cats, failed_org_cats = filter_out_duplicates(
            self.db_config_env,
            'mysqldb',
            database,
            Base,
            OrganizationCategory,
            org_cats,
            low_memory=True)
        logging.info(
            f"Inserting {len(org_cats)} org categories "
            f"({len(existing_org_cats)} already existed and {len(failed_org_cats)} failed)"
        )
        # NOTE(review): org_cats is handed straight to session.add_all, so
        # filter_out_duplicates presumably returns ORM instances — confirm.
        with db_session(self.engine) as session:
            session.add_all(org_cats)

        # mark as done
        self.output().touch()