Example #1
def judge_claim(claim):
    """Process claim text, and handle some special cases."""
    judgment = claim_to_judgment(claim)

    if judgment != 0 and CLAIM_NOT_GOOD_RE.match(claim):
        judgment = -1

    if judgment == 1 and '%' in claim and '100%' not in claim:
        judgment = 0

    return judgment


def scrape_company(url, known_brands):
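    """Scrape a single company page.

    Yields ('brand', ...), ('company', ...), ('rating', ...) and
    ('claim', ...) records as dicts.
    """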
    soup = scrape_soup(url)

    company = strip_company(
            soup.select('#company_score_company_title h1')[0].text)

    c = dict(company=company)

    # rating
    score = int(soup.select('#company_score_score')[0].text.strip())
    r = dict(company=company, score=score, max_score=MAX_SCORE)
    status_path = soup.select('#company_score_status img')[0]['src']
    r['description'], r['judgment'] = scrape_description_and_judgment(
        status_path)
    r['url'] = url

    # icon
    icon_as = soup.select('#company_score_company_icon a')
    if icon_as:
        icon_a = icon_as[0]
        c['url'] = icon_a['href']
        c['logo_url'] = urljoin(url, icon_a.img['src'])

    # sector
    for a in soup.select('#breadcrumbs a'):
        if 'sectors' in a['href']:
            c['category'] = a.text
            break  # ignore "Industry Innovators" category

    # match up brands to logos
    brands = sorted(known_brands[company])
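    # map each smunch()-normalized brand name back to its original spelling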
    sb2b = dict((smunch(b), b) for b in brands)

    for img in soup.select('span.brand_icon img'):
        logo_url = urljoin(url, img['src'])
        sb = smunch(img['alt'])
        sb = SMUNCHED_BRAND_CORRECTIONS.get(sb, sb)
        if sb in sb2b:
            brand = sb2b[sb]
            yield 'brand', dict(
                company=company, brand=brand, logo_url=logo_url)
        else:
            log.warn(u'No matching brand for {} ({}: {})'.format(
                repr(img['alt']), company, u', '.join(brands)))

    # match twitter handles to company/brand
    sc = smunch(company)
    sbs = sorted(sb2b)
    sb2th = {}

    twitter_handles = [
        a.text.strip() for a in
        soup.select('#company_score_action_right a')]

    def match_twitter_handle(th):
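        # Match the handle to the company or one of its brands by
        # comparing progressively shorter prefixes of the smunched
        # (normalized) handle against the company and brand names.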
        if th in IGNORE_TWITTER_HANDLES:
            return

        sth = smunch(th[1:])

        for i in range(len(sth), 1, -1):
            if (not th.endswith('Brand') and sth[:i] == sc[:i] and
                'twitter_handle' not in c):
                c['twitter_handle'] = th
                return

            for sb in sbs:
                if sth[:i] == sb[:i] and sb not in sb2th:
                    sb2th[sb] = th
                    return

        else:
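            # no prefix matched: treat it as the company's handle if the
            # company doesn't have one yet, otherwise log a warning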
            if 'twitter_handle' not in c:
                c['twitter_handle'] = th
            else:
                log.warn(u'No matching brand/company for {} ({}: {})'.format(
                    repr(th), company, u', '.join(brands)))

    for th in twitter_handles:
        match_twitter_handle(th)

    for sb, th in sb2th.iteritems():
        brand = sb2b[sb]
        yield 'brand', dict(company=company, brand=brand, twitter_handle=th)

    yield 'company', c
    yield 'rating', r

    # parse claims
    for b in soup.find(id='company_score').parent.select('b'):
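        # each matching <b> tag carries an area sub-score; the text node
        # right after it carries that area's claims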
        m = SUBSCORE_RE.match(b.text)
        if m and isinstance(b.next_sibling, unicode):
            # used this for debugging
            #area = m.group(1)
            area_score = int(m.group(2))
            area_max_score = int(m.group(3))

            raw_claim = b.next_sibling

            if NO_SPLIT_CLAIM_RE.match(raw_claim):
                claims = [raw_claim]
            else:
                claims = list(split_into_sentences(raw_claim))

            for claim in claims:
                # strip company name off claim
                claim = ltrim_sentence(claim, [company, 'the company'])

                judgment = claim_to_judgment(claim)

                # if the sub-score is low, maybe it's not so positive after all
                if judgment == 1 and area_score / float(area_max_score) < 0.5:
                    judgment = 0

                yield 'claim', dict(
                    company=company, claim=claim, judgment=judgment)
Example #3
def scrape_campaign():
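    """Scrape the campaign's main page and per-company details.

    Yields ('campaign', ...), ('rating', ...) and ('claim', ...) records
    as dicts.
    """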
    log.info("Main page")
    soup = scrape_soup(URL)

    # campaign record
    cn = {"url": URL, "goal": GOAL}
    cn["campaign"], cn["author"] = soup.title.text.split("|")
    # remove double spaces

    cn["copyright"] = scrape_copyright(soup)
    cn["facebook_url"] = scrape_facebook_url(soup)
    cn["twitter_handle"] = scrape_twitter_handle(soup)

    # get year
    cn["date"] = INT_RE.search(soup.select("div.content h2")[0].text).group()

    for a in soup.findAll("a"):
        if a.text.strip() == "Donate":
            cn["donate_url"] = urljoin(URL, a["href"])
            break

    if "donate_url" not in cn:
        raise ValueError("Donate URL not found")

    yield "campaign", cn

    rating_divs = soup.select("div#corank div.row")
    if not rating_divs:
        raise ValueError("ratings not found")

    for div in rating_divs:
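        # r["company"] holds the full company record (a dict), not just the name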
        c = {}
        r = {"company": c}

        company_a = div.select("a.coname")[0]
        company = company_a.text

        c["company"] = company

        teaser = div.select("span.teaser")[0].text
        r["categories"] = CATEGORIES_SEP.split(CATEGORIES_RE.match(teaser).group(1))

        for rank_class, judgment in RANK_CLASS_TO_JUDGMENT.items():
            if div.select("span.rank." + rank_class):
                r["judgment"] = judgment
                break
        else:
            raise ValueError("rating for {} not found".format(r["company"]))

        r["score"] = int(INT_RE.search(div.select("div.col_score")[0].text).group())

        r["categories"] = [CATEGORY]

        # fetch details
        company_id = company_a["href"].split("#")[-1]
        query = dict(action="getcompany", companyid=company_id)

        # use POST to get details JSON
        log.info("Details for {}".format(company))
        details = scrape_json(DETAILS_URL, data=urlencode(query))
        details = details[0][0]  # wrapped in lists. why?

        c["url"] = details["ext_url"]

        # TODO: details['message'] might be useful too. It's a message
        # that participants are supposed to send to the company:
        # "Thank you for the leadership you have shown in working to..."

        yield "rating", r

        detail_soup = BeautifulSoup(details["detail"], "html.parser")
        claim_lis = detail_soup.select("li")

        # First two bullet points are categories and a description
        # of the company's ranking (reversed for Nokia)
        # Last bullet point is what the company can do to improve its score.
        claim_lis = claim_lis[2:-1]

        for i, claim_li in enumerate(claim_lis):
            claim = claim_li.text

            judgment = claim_to_judgment(claim)

            claim = clarify_claim(claim, CLAIM_CLARIFICATIONS)

            yield "claim", dict(company=company, claim=claim, judgment=judgment)