def scrape_company():
    """Yield ``(record_type, record_dict)`` pairs for the company's brands.

    Yields, in order:

    * one 'company' record built from COMPANY and COMPANY_URL,
    * one 'brand' record per entry in MORE_BRANDS,
    * for every non-empty ``<h2>`` on each category page linked from
      CATEGORY_URL: an optional 'subcategory' record followed by a
      'brand' record carrying its category and (if known) logo URL.
    """
    yield 'company', dict(company=COMPANY, url=COMPANY_URL)

    for brand in MORE_BRANDS:
        yield 'brand', dict(company=COMPANY, brand=brand)

    # get logo for brands
    brands_soup = scrape_soup(BRANDS_URL)

    sb_to_logo_url = {}  # map smunch(brand) to logo_url

    for img in brands_soup.select('#scroller img'):
        sb = smunch(img['alt'])
        sb = SMUNCHED_BRAND_CORRECTIONS.get(sb, sb)
        sb_to_logo_url[sb] = img['src']

    cat_soup = scrape_soup(CATEGORY_URL)

    for a in cat_soup.select('li.active ul li a'):
        cat = a.text
        url = a['href']

        # TODO: match brands with logos
        # treat "French's" as single brand
        # correct "Cillet Bang" -> "Cillit Bang"

        soup = scrape_soup(url)
        for h2 in soup.select('h2'):
            brand = h2.text.strip()

            if not brand:
                continue

            # Default to the page's category; only a heading of the form
            # "<known brand> <rest>" (e.g. French's) overrides it below.
            brand_cat = cat

            for kb in KNOWN_BRANDS:
                if brand.startswith(kb + ' '):
                    # the text after the known brand is a sub-category
                    sub_cat = brand[len(kb) + 1:]
                    yield 'subcategory', dict(category=cat,
                                              subcategory=sub_cat)
                    brand = kb
                    brand_cat = sub_cat  # don't redefine cat
                    # Stop at the first match so later known brands
                    # cannot reset brand_cat back to cat.
                    break

            yield 'brand', dict(
                company=COMPANY,
                brand=brand,
                category=brand_cat,
                logo_url=sb_to_logo_url.get(smunch(brand)))
Пример #2
0
    def match_twitter_handle(th):
        """Assign twitter handle ``th`` to the company or to one brand.

        Relies on names from the enclosing scope (not visible here):
        ``c`` (company record), ``sc`` (compared against the smunched
        handle), ``sbs`` (brand keys), ``sb2th`` (brand key -> handle),
        plus ``company`` and ``brands`` for the log message.
        """
        if th in IGNORE_TWITTER_HANDLES:
            return

        # drop the first character (presumably the leading '@')
        handle_key = smunch(th[1:])

        # Neither condition can change while the loop below is running,
        # because every assignment to ``c`` is followed by a return.
        company_is_candidate = (
            'twitter_handle' not in c and not th.endswith('Brand'))

        # try ever-shorter prefixes, from the full key down to 2 characters
        prefix_len = len(handle_key)
        while prefix_len > 1:
            prefix = handle_key[:prefix_len]

            if company_is_candidate and prefix == sc[:prefix_len]:
                c['twitter_handle'] = th
                return

            for brand_key in sbs:
                if brand_key not in sb2th and prefix == brand_key[:prefix_len]:
                    sb2th[brand_key] = th
                    return

            prefix_len -= 1

        # no prefix matched: fall back to the company if it has no handle yet
        if 'twitter_handle' not in c:
            c['twitter_handle'] = th
            return

        log.warn(u'No matching brand/company for {} ({}: {})'.format(
            repr(th), company, u', '.join(brands)))
Пример #3
0
def scrape_company(url, known_brands):
    """Scrape one company page and yield ``(record_type, dict)`` pairs.

    Yields, in order: a 'brand' record for each brand icon whose alt text
    matches a known brand, a 'brand' record for each brand that was paired
    with a twitter handle, one 'company' record, one 'rating' record, and
    one 'claim' record per claim sentence found under the sub-scores.

    :param url: address of the company page; also stored as the rating's
        url and used as the base for relative image URLs.
    :param known_brands: mapping indexed by the stripped company name,
        giving an iterable of that company's brand names.
    """
    soup = scrape_soup(url)

    company = strip_company(
        soup.select('#company_score_company_title h1')[0].text)

    c = dict(company=company)

    # rating
    score = int(soup.select('#company_score_score')[0].text.strip())
    r = dict(company=company, score=score, max_score=MAX_SCORE)
    status_path = soup.select('#company_score_status img')[0]['src']
    r['description'], r['judgment'] = scrape_description_and_judgment(
        status_path)
    r['url'] = url

    # icon
    icon_as = soup.select('#company_score_company_icon a')
    if icon_as:
        icon_a = icon_as[0]
        c['url'] = icon_a['href']
        c['logo_url'] = urljoin(url, icon_a.img['src'])

    # sector
    for a in soup.select('#breadcrumbs a'):
        if 'sectors' in a['href']:
            c['category'] = a.text
            break  # ignore "Industry Innovators" category

    # match up brands to logos
    brands = sorted(known_brands[company])
    sb2b = dict((smunch(b), b) for b in brands)

    for img in soup.select('span.brand_icon img'):
        logo_url = urljoin(url, img['src'])
        sb = smunch(img['alt'])
        sb = SMUNCHED_BRAND_CORRECTIONS.get(sb, sb)
        if sb in sb2b:
            brand = sb2b[sb]
            yield 'brand', dict(
                company=company, brand=brand, logo_url=logo_url)
        else:
            log.warning(u'No matching brand for {} ({}: {})'.format(
                repr(img['alt']), company, u', '.join(brands)))

    # match twitter handles to company/brand
    sc = smunch(company)
    sbs = sorted(sb2b)
    sb2th = {}  # map smunch(brand) to twitter handle

    twitter_handles = [
        a.text.strip() for a in
        soup.select('#company_score_action_right a')]

    def match_twitter_handle(th):
        """Store ``th`` on the company record or in ``sb2th``."""
        if th in IGNORE_TWITTER_HANDLES:
            return

        # drop the first character (presumably the leading '@')
        sth = smunch(th[1:])

        # try ever-shorter prefixes, from the full handle down to 2 chars
        for i in range(len(sth), 1, -1):
            if (not th.endswith('Brand') and sth[:i] == sc[:i] and
                    'twitter_handle' not in c):
                c['twitter_handle'] = th
                return

            for sb in sbs:
                if sth[:i] == sb[:i] and sb not in sb2th:
                    sb2th[sb] = th
                    return

        # no prefix matched: give the handle to the company if it is free
        if 'twitter_handle' not in c:
            c['twitter_handle'] = th
        else:
            log.warning(u'No matching brand/company for {} ({}: {})'.format(
                repr(th), company, u', '.join(brands)))

    for th in twitter_handles:
        match_twitter_handle(th)

    for sb, th in sb2th.iteritems():
        brand = sb2b[sb]
        yield 'brand', dict(company=company, brand=brand, twitter_handle=th)

    yield 'company', c
    yield 'rating', r

    # parse claims
    for b in soup.find(id='company_score').parent.select('b'):
        m = SUBSCORE_RE.match(b.text)
        if m and isinstance(b.next_sibling, unicode):
            # group(1) is the area name; only needed for debugging
            area_score = int(m.group(2))
            area_max_score = int(m.group(3))

            raw_claim = b.next_sibling

            if NO_SPLIT_CLAIM_RE.match(raw_claim):
                claims = [raw_claim]
            else:
                claims = list(split_into_sentences(raw_claim))

            for claim in claims:
                # strip company name off claim
                claim = ltrim_sentence(claim, [company, 'the company'])

                judgment = claim_to_judgment(claim)

                # if score is low, maybe it's not so positive after all.
                # Compare via multiplication: int / int floors to 0 under
                # Python 2 division, and a zero maximum would raise.
                if judgment == 1 and area_score * 2 < area_max_score:
                    judgment = 0

                yield 'claim', dict(company=company,
                                    claim=claim,
                                    judgment=judgment)
Пример #4
0
# Text shaped like "<label>: <n>/<m> points" (leading whitespace allowed);
# the groups are the label, n and m.
SUBSCORE_RE = re.compile(
    r'^\s*([^:]+):\s+(\d+)/(\d+) points')

# Text containing the whole word "also" or "however", in any letter case.
NO_SPLIT_CLAIM_RE = re.compile(
    r'.*\b(also|however)\b.*', flags=re.IGNORECASE)

# Misspelled brand name -> corrected spelling.
BRAND_CORRECTIONS = {
    'Climfast': 'Slimfast',
    'Gatoraide': 'Gatorade',
    'Litpon': 'Lipton',
    'Mountain Des': 'Mountain Dew',
    'Wgeaties': 'Wheaties',
    'Talko Bell': 'Taco Bell',
    'Siemans': 'Siemens',
}

# The same table with smunch() applied to both the key and the value.
SMUNCHED_BRAND_CORRECTIONS = {
    smunch(typo): smunch(fixed)
    for typo, fixed in BRAND_CORRECTIONS.iteritems()
}

# Handles that are never matched to a company or brand.
IGNORE_TWITTER_HANDLES = {
    # a bare "@" with no name after it
    '@',
    # the real handle is @British_Airways
    '@BritishAirways',
    # superseded by @ShakleeHQ
    '@ShakleeUpdates',
    # @UPS is preferred over both of these
    '@UPS_News',
    '@theUPSstore_PR',
}

# logger named after this module
log = logging.getLogger(__name__)


def scrape_campaign():
    yield 'campaign', CAMPAIGN