def scrape_company():
    yield 'company', dict(company=COMPANY, url=COMPANY_URL)

    for brand in MORE_BRANDS:
        yield 'brand', dict(company=COMPANY, brand=brand)

    # get logo for brands
    brands_soup = scrape_soup(BRANDS_URL)

    sb_to_logo_url = {}  # map smunch(brand) to logo_url

    for img in brands_soup.select('#scroller img'):
        sb = smunch(img['alt'])
        sb = SMUNCHED_BRAND_CORRECTIONS.get(sb, sb)
        logo_url = img['src']
        sb_to_logo_url[sb] = logo_url

    cat_soup = scrape_soup(CATEGORY_URL)

    for a in cat_soup.select('li.active ul li a'):
        cat = a.text
        url = a['href']

        # TODO: match brands with logos
        # treat "French's" as single brand
        # correct "Cillet Bang" -> "Cillit Bang"

        soup = scrape_soup(url)

        for h2 in soup.select('h2'):
            brand = h2.text.strip()
            if brand:
                # special case for French's: split "<known brand> <variant>"
                # into a brand plus a subcategory
                for kb in KNOWN_BRANDS:
                    if brand.startswith(kb + ' '):
                        sub_cat = brand[len(kb) + 1:]
                        yield 'subcategory', dict(
                            category=cat, subcategory=sub_cat)
                        brand = kb
                        brand_cat = sub_cat  # don't redefine cat
                        break
                else:
                    brand_cat = cat

                yield 'brand', dict(
                    company=COMPANY,
                    brand=brand,
                    category=brand_cat,
                    logo_url=sb_to_logo_url.get(smunch(brand)))
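

# smunch() is imported from elsewhere in the project and isn't defined in this
# excerpt. The logo/brand matching above only relies on it collapsing names to
# a canonical form so near-identical spellings compare equal. A minimal sketch
# of that assumed behavior, purely for illustration (the helper below is
# hypothetical and not part of the project):
def _smunch_sketch(name):
    """Reduce a name to lowercased alphanumerics, e.g. "Cillit Bang" ->
    'cillitbang', so brand names, img alt text, and corrections can match."""
    return ''.join(ch for ch in name.lower() if ch.isalnum())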
def scrape_company(url, known_brands):
    soup = scrape_soup(url)

    company = strip_company(
        soup.select('#company_score_company_title h1')[0].text)

    c = dict(company=company)

    # rating
    score = int(soup.select('#company_score_score')[0].text.strip())
    r = dict(company=company, score=score, max_score=MAX_SCORE)

    status_path = soup.select('#company_score_status img')[0]['src']
    r['description'], r['judgment'] = scrape_description_and_judgment(
        status_path)

    r['url'] = url

    # icon
    icon_as = soup.select('#company_score_company_icon a')
    if icon_as:
        icon_a = icon_as[0]
        c['url'] = icon_a['href']
        c['logo_url'] = urljoin(url, icon_a.img['src'])

    # sector
    for a in soup.select('#breadcrumbs a'):
        if 'sectors' in a['href']:
            c['category'] = a.text
            break  # ignore "Industry Innovators" category

    # match up brands to logos
    brands = sorted(known_brands[company])
    sb2b = dict((smunch(b), b) for b in brands)

    for img in soup.select('span.brand_icon img'):
        logo_url = urljoin(url, img['src'])
        sb = smunch(img['alt'])
        sb = SMUNCHED_BRAND_CORRECTIONS.get(sb, sb)
        if sb in sb2b:
            brand = sb2b[sb]
            yield 'brand', dict(
                company=company, brand=brand, logo_url=logo_url)
        else:
            log.warn(u'No matching brand for {} ({}: {})'.format(
                repr(img['alt']), company, u', '.join(brands)))

    # match twitter handles to company/brand
    sc = smunch(company)
    sbs = sorted(sb2b)
    sb2th = {}

    twitter_handles = [
        a.text.strip()
        for a in soup.select('#company_score_action_right a')]

    def match_twitter_handle(th):
        if th in IGNORE_TWITTER_HANDLES:
            return

        sth = smunch(th[1:])  # strip leading '@'

        for i in range(len(sth), 1, -1):
            # prefer matching the handle to the company on a shared prefix
            if (not th.endswith('Brand') and sth[:i] == sc[:i] and
                    'twitter_handle' not in c):
                c['twitter_handle'] = th
                return
            # otherwise, match it to an unclaimed brand on a shared prefix
            for sb in sbs:
                if sth[:i] == sb[:i] and sb not in sb2th:
                    sb2th[sb] = th
                    return
        else:
            if 'twitter_handle' not in c:
                c['twitter_handle'] = th
            else:
                log.warn(u'No matching brand/company for {} ({}: {})'.format(
                    repr(th), company, u', '.join(brands)))

    for th in twitter_handles:
        match_twitter_handle(th)

    for sb, th in sb2th.iteritems():
        brand = sb2b[sb]
        yield 'brand', dict(company=company, brand=brand, twitter_handle=th)

    yield 'company', c
    yield 'rating', r

    # parse claims
    for b in soup.find(id='company_score').parent.select('b'):
        m = SUBSCORE_RE.match(b.text)
        if m and isinstance(b.next_sibling, unicode):
            # used this for debugging
            #area = m.group(1)
            area_score = int(m.group(2))
            area_max_score = int(m.group(3))

            raw_claim = b.next_sibling
            if NO_SPLIT_CLAIM_RE.match(raw_claim):
                claims = [raw_claim]
            else:
                claims = list(split_into_sentences(raw_claim))

            for claim in claims:
                # strip company name off claim
                claim = ltrim_sentence(claim, [company, 'the company'])

                judgment = claim_to_judgment(claim)
                # if score is low, maybe it's not so positive after all
                if judgment == 1 and area_score / float(area_max_score) < 0.5:
                    judgment = 0

                yield 'claim', dict(
                    company=company, claim=claim, judgment=judgment)
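

# The shrinking-prefix loop in match_twitter_handle() above attaches a handle
# to the company or brand whose smunched name shares the longest prefix (at
# least two characters) with the smunched handle; assuming smunch() normalizes
# as sketched earlier, '@British_Airways' -> 'britishairways' would match a
# company that also smunches to 'britishairways'. A standalone sketch of just
# that prefix test (hypothetical helper, not part of the project):
def _shares_prefix(smunched_handle, smunched_name):
    """Mirror the sth[:i] == sc[:i] test: True if the two smunched strings
    share a common prefix longer than one character."""
    for i in range(len(smunched_handle), 1, -1):
        if smunched_handle[:i] == smunched_name[:i]:
            return True
    return False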
import logging
import re

SUBSCORE_RE = re.compile(r'^\s*([^:]+):\s+(\d+)/(\d+) points')

NO_SPLIT_CLAIM_RE = re.compile(r'.*\b(also|however)\b.*', re.I)

BRAND_CORRECTIONS = {
    # hilarious
    'Climfast': 'Slimfast',
    'Gatoraide': 'Gatorade',
    'Litpon': 'Lipton',
    'Mountain Des': 'Mountain Dew',
    'Wgeaties': 'Wheaties',
    'Talko Bell': 'Taco Bell',
    'Siemans': 'Siemens',
}

SMUNCHED_BRAND_CORRECTIONS = dict(
    (smunch(bad), smunch(good))
    for bad, good in BRAND_CORRECTIONS.iteritems())

IGNORE_TWITTER_HANDLES = {
    '@BritishAirways',  # actually @British_Airways
    '@ShakleeUpdates',  # now @ShakleeHQ
    '@UPS_News',  # ignore in favor of @UPS
    '@theUPSstore_PR',  # ignore in favor of @UPS
    '@',  # derp
}

log = logging.getLogger(__name__)


def scrape_campaign():
    yield 'campaign', CAMPAIGN
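

# A quick, self-contained illustration of how the two regexes above behave.
# The sample strings are made up for demonstration and are not taken from the
# scraped site; this helper is illustrative only and is never called:
def _regex_examples():
    """SUBSCORE_RE pulls (area, score, max) out of a subscore heading, and
    NO_SPLIT_CLAIM_RE flags claims that should not be split into sentences."""
    m = SUBSCORE_RE.match('Energy Use: 7/10 points')
    assert m.groups() == ('Energy Use', '7', '10')
    assert NO_SPLIT_CLAIM_RE.match('It recycles; however, it uses coal.')
    assert not NO_SPLIT_CLAIM_RE.match('It recycles plant waste.')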