def test_the_limited(self):
    """An alias row in company_name redirects ratings to the canonical company."""
    # "The Limited" is a very old name for L Brands
    insert_rows(self.scratch_db, 'company_name', [
        {'company_name': 'The Limited',
         'scraper_id': 'corrections/company_name'},
        {'company': 'L Brands', 'company_name': 'The Limited',
         'is_alias': 1, 'scraper_id': 'corrections/company_name'},
    ])
    insert_rows(self.scratch_db, 'rating', [
        {'campaign_id': 'hsus_fur_free', 'company': 'The Limited',
         'judgment': 1, 'scraper_id': 'campaign/hsus_fur_free'},
    ])
    build_company_name_and_scraper_company_map_tables(
        self.output_db, self.scratch_db)
    # index the map by (scraper_id, scraper_company) -> canonical company
    mapping = {}
    for row in select_all(self.output_db, 'scraper_company_map'):
        mapping[(row['scraper_id'], row['scraper_company'])] = row['company']
    self.assertEqual(
        mapping.get(('campaign/hsus_fur_free', 'The Limited')), 'L Brands')
def test_no_single_letter_company_names(self):
    """'L Brands' must survive intact, not be reduced to a one-letter name."""
    insert_rows(self.scratch_db, 'company', [
        {'company': 'L Brands', 'scraper_id': 'sr.campaign.hrc'},
    ])
    build_company_name_and_scraper_company_map_tables(
        self.output_db, self.scratch_db)
    mapped = select_all(self.output_db, 'scraper_company_map')
    self.assertEqual({r['company'] for r in mapped}, {'L Brands'})
def test_dont_merge_l_international_and_l_brands(self):
    """Companies sharing only a leading initial must stay separate."""
    # this tests #33
    insert_rows(self.scratch_db, 'company', [
        {'company': 'L. International', 'scraper_id': 'sr.campaign.b_corp'},
        {'company': 'L Brands', 'scraper_id': 'sr.campaign.hrc'},
    ])
    build_company_name_and_scraper_company_map_tables(
        self.output_db, self.scratch_db)
    mapped = select_all(self.output_db, 'scraper_company_map')
    # two distinct entries, not one merged company
    self.assertEqual(len(mapped), 2)
    self.assertIn('L.', {r['company'] for r in mapped})
def test_news_corporation(self):
    """A bare company_name correction pins the name as-is."""
    # mark company name as invariant
    insert_rows(self.scratch_db, 'company_name', [
        {'company_name': 'News Corporation',
         'scraper_id': 'corrections.company_name'},
    ])
    insert_rows(self.scratch_db, 'company', [
        {'company': 'News Corporation',
         'scraper_id': 'campaign.climate_counts'},
    ])
    build_company_name_and_scraper_company_map_tables(
        self.output_db, self.scratch_db)
    mapped = select_all(self.output_db, 'scraper_company_map')
    self.assertEqual({r['company'] for r in mapped}, {'News Corporation'})
def test_asus(self):
    """The company_name table supplies the canonical full name for a company."""
    # make sure company_name from company_name table is used
    insert_rows(self.scratch_db, 'company_name', [
        {'company': 'ASUS', 'company_name': 'ASUSTek Computer Inc.',
         'is_full': 1, 'scraper_id': 'corrections/company_name'},
    ])
    build_company_name_and_scraper_company_map_tables(
        self.output_db, self.scratch_db)
    mapped = select_all(self.output_db, 'scraper_company_map')
    self.assertEqual({r['company'] for r in mapped}, {'ASUS'})
    # the full-name flag must carry through to the output company_name table
    fulls = {r['company_name']
             for r in select_all(self.output_db, 'company_name')
             if r['is_full']}
    self.assertEqual(fulls, {'ASUSTek Computer Inc.'})
def test_merge_pvh_and_pvh_corp(self):
    """Regression test: PVH and PVH Corp must merge into one company.

    Also checks that the merged full name gets its trailing period
    ('PVH Corp.') in the output company_name table.
    """
    # regression test for separate PVH, PVH Corp.
    insert_rows(self.scratch_db, 'company', [
        dict(company='PVH', scraper_id='campaign.hrc'),
        dict(company='PVH Corp', scraper_id='campaign.btb_fashion'),
    ])
    build_company_name_and_scraper_company_map_tables(
        self.output_db, self.scratch_db)
    # verify that companies merged: both scraper rows map to the same company
    map_rows = select_all(self.output_db, 'scraper_company_map')
    self.assertEqual(len(map_rows), 2)
    self.assertEqual(set(row['company'] for row in map_rows), {'PVH'})
    # BUG FIX: originally read 'scraper_company_map' again, so the
    # is_full/company_name checks below could never match and the test
    # passed vacuously; read the company_name table instead (as test_asus
    # does) and assert positively that the full name was produced.
    name_rows = select_all(self.output_db, 'company_name')
    company_fulls = {row['company_name'] for row in name_rows
                     if row['is_full']}
    # add trailing period
    self.assertEqual(company_fulls, {'PVH Corp.'})
def build_tables(self):
    """Populate self.output_db from self.scratch_db."""
    out, scratch = self.output_db, self.scratch_db
    # NOTE(review): original call order preserved — the subsidiary table
    # presumably depends on the company map being built first; confirm.
    build_company_name_and_scraper_company_map_tables(out, scratch)
    build_subsidiary_table(out, scratch)