Exemplo n.º 1
0
def extract_claims(raw_claim):
    """Yield the cleaned-up sentences of *raw_claim*, one claim at a time.

    The text is split with split_into_sentences(). Sentences matching
    POINTLESS_CLAIM_RE are dropped; every other sentence is passed through
    ltrim_sentence() with the lead-in words below before being yielded.
    """
    # Lead-in words handed to ltrim_sentence(); presumably it strips a
    # leading occurrence and re-capitalizes what follows -- confirm against
    # that helper's implementation.
    lead_ins = ('brand', 'it', 'however')

    for sentence in split_into_sentences(raw_claim):
        if not POINTLESS_CLAIM_RE.match(sentence):
            yield ltrim_sentence(sentence, lead_ins)
Exemplo n.º 2
0
def scrape_company(url, known_brands):
    """Scrape one company page and yield ``(record_type, record)`` pairs.

    Records are yielded in this order:

    * ``('brand', {...})`` with ``logo_url`` for each brand icon whose alt
      text matches a known brand,
    * ``('brand', {...})`` with ``twitter_handle`` for each brand that a
      twitter handle on the page was matched to,
    * one ``('company', {...})`` record,
    * one ``('rating', {...})`` record,
    * ``('claim', {...})`` for each claim found next to a sub-score.

    url -- address of the page to scrape; also used as the base for
        resolving relative image URLs and stored in the rating record.
    known_brands -- mapping indexed by the company name (as returned by
        strip_company()) giving that company's brand names.

    The title, score and status-image lookups index ``[0]`` into the
    selector results, so a page missing any of those elements raises
    instead of yielding partial data.
    """
    soup = scrape_soup(url)

    company = strip_company(
        soup.select('#company_score_company_title h1')[0].text)

    c = dict(company=company)

    # rating
    score = int(soup.select('#company_score_score')[0].text.strip())
    r = dict(company=company, score=score, max_score=MAX_SCORE)
    status_path = soup.select('#company_score_status img')[0]['src']
    r['description'], r['judgment'] = scrape_description_and_judgment(
        status_path)
    r['url'] = url

    # icon (optional: only the first link is used, if any)
    icon_as = soup.select('#company_score_company_icon a')
    if icon_as:
        icon_a = icon_as[0]
        c['url'] = icon_a['href']
        c['logo_url'] = urljoin(url, icon_a.img['src'])

    # sector
    for a in soup.select('#breadcrumbs a'):
        if 'sectors' in a['href']:
            c['category'] = a.text
            break  # ignore "Industry Innovators" category

    # match up brands to logos, comparing names in their smunch()ed form
    brands = sorted(known_brands[company])
    sb2b = dict((smunch(b), b) for b in brands)

    for img in soup.select('span.brand_icon img'):
        logo_url = urljoin(url, img['src'])
        sb = smunch(img['alt'])
        sb = SMUNCHED_BRAND_CORRECTIONS.get(sb, sb)
        if sb in sb2b:
            brand = sb2b[sb]
            yield 'brand', dict(
                company=company, brand=brand, logo_url=logo_url)
        else:
            log.warn(u'No matching brand for {} ({}: {})'.format(
                repr(img['alt']), company, u', '.join(brands)))

    # match twitter handles to company/brand
    sc = smunch(company)
    sbs = sorted(sb2b)
    sb2th = {}  # smunched brand -> twitter handle

    twitter_handles = [
        a.text.strip() for a in
        soup.select('#company_score_action_right a')]

    def match_twitter_handle(th):
        """Record *th* on the company dict or in sb2th, or log a warning."""
        if th in IGNORE_TWITTER_HANDLES:
            return

        # drop the first character (presumably the leading "@") before
        # comparing
        sth = smunch(th[1:])

        # try ever-shorter prefixes, from the whole handle down to two
        # characters; at each length the company is tried before the brands
        for i in range(len(sth), 1, -1):
            if (not th.endswith('Brand') and sth[:i] == sc[:i] and
                    'twitter_handle' not in c):
                c['twitter_handle'] = th
                return

            for sb in sbs:
                if sth[:i] == sb[:i] and sb not in sb2th:
                    sb2th[sb] = th
                    return

        # no prefix matched: fall back to the company if it still has no
        # handle, otherwise report the handle as unmatched
        if 'twitter_handle' not in c:
            c['twitter_handle'] = th
        else:
            log.warn(u'No matching brand/company for {} ({}: {})'.format(
                repr(th), company, u', '.join(brands)))

    for th in twitter_handles:
        match_twitter_handle(th)

    for sb, th in sb2th.iteritems():
        brand = sb2b[sb]
        yield 'brand', dict(company=company, brand=brand, twitter_handle=th)

    yield 'company', c
    yield 'rating', r

    # parse claims: each <b> matching SUBSCORE_RE that is directly followed
    # by a text node contributes that text as the raw claim
    for b in soup.find(id='company_score').parent.select('b'):
        m = SUBSCORE_RE.match(b.text)
        if m and isinstance(b.next_sibling, unicode):
            # groups 2 and 3 are the area's score and maximum score
            # (group 1 is not needed here)
            area_score = int(m.group(2))
            area_max_score = int(m.group(3))

            raw_claim = b.next_sibling

            if NO_SPLIT_CLAIM_RE.match(raw_claim):
                claims = [raw_claim]
            else:
                claims = list(split_into_sentences(raw_claim))

            for claim in claims:
                # strip company name off claim
                claim = ltrim_sentence(claim, [company, 'the company'])

                judgment = claim_to_judgment(claim)

                # if score is low, maybe it's not so positive after all.
                # "score / max < 0.5" is written cross-multiplied so that it
                # is exact under Python 2 integer division and cannot divide
                # by zero.
                if judgment == 1 and area_score * 2 < area_max_score:
                    judgment = 0

                yield 'claim', dict(
                    company=company, claim=claim, judgment=judgment)