Example #1
0
def extract_claims(remark, company, brand, question=None):
    """Extract and clarify claims from a remark in the sustainability report.

    Yields each non-empty, cleaned-up claim string.

    Args:
        remark: raw remark text from the report page.
        company: company name, stripped from the front of the claim.
        brand: brand name, stripped from the front of the claim.
        question: currently unused; kept for interface compatibility.
    """
    # references aren't meaningful outside the page
    remark = SEE_RE.sub('', remark).strip()

    for claim in [remark]:  # TODO: split into sentences if appropriate
        # (removed a redundant `claim = remark` reassignment: the loop
        # iterates over [remark], so `claim` is already `remark`)

        # skip boilerplate / junk claims
        if BAD_CLAIM_RE.match(claim):
            continue

        claim = clarify_claim(claim, CLAIM_CLARIFICATIONS)

        # drop a leading company/brand mention
        claim = ltrim_sentence(claim, [company, brand])

        claim = claim.strip()

        if claim:
            yield claim
Example #2
0
def scrape_campaign():
    """Scrape the campaign site and yield (record_type, record) pairs.

    Yields, in order:
      * one ("campaign", dict) record for the campaign itself
      * a ("rating", dict) record per rated company
      * ("claim", dict) records for each detail bullet point per company

    Raises:
        ValueError: if the Donate URL, the ratings section, or a
            company's rank marker cannot be found in the page.
    """
    log.info("Main page")
    soup = scrape_soup(URL)

    # campaign record
    cn = {"url": URL, "goal": GOAL}
    # page title looks like "<campaign> | <author>"
    cn["campaign"], cn["author"] = soup.title.text.split("|")
    # TODO(review): an earlier "remove double spaces" note suggested the
    # title parts need whitespace normalization, but no such code exists.
    # Confirm against a live page before adding .strip()/re.sub -- it
    # would change the stored values.

    cn["copyright"] = scrape_copyright(soup)
    cn["facebook_url"] = scrape_facebook_url(soup)
    cn["twitter_handle"] = scrape_twitter_handle(soup)

    # get year from the first content heading
    cn["date"] = INT_RE.search(soup.select("div.content h2")[0].text).group()

    # find the Donate link among all anchors
    for a in soup.findAll("a"):
        if a.text.strip() == "Donate":
            cn["donate_url"] = urljoin(URL, a["href"])
            break

    if "donate_url" not in cn:
        raise ValueError("Donate URL not found")

    yield "campaign", cn

    rating_divs = soup.select("div#corank div.row")
    if not rating_divs:
        raise ValueError("ratings not found")

    for div in rating_divs:
        c = {}
        r = {"company": c}

        company_a = div.select("a.coname")[0]
        company = company_a.text

        c["company"] = company

        # BUG FIX: categories used to be parsed from the teaser here and
        # then unconditionally overwritten with [CATEGORY] a few lines
        # later, so the parse was dead code (and could crash on a teaser
        # that CATEGORIES_RE didn't match). Only the surviving constant
        # assignment is kept; restore the CATEGORIES_RE/CATEGORIES_SEP
        # parse if per-company categories are actually wanted.
        r["categories"] = [CATEGORY]

        for rank_class, judgment in RANK_CLASS_TO_JUDGMENT.items():
            if div.select("span.rank." + rank_class):
                r["judgment"] = judgment
                break
        else:
            raise ValueError("rating for {} not found".format(r["company"]))

        r["score"] = int(INT_RE.search(div.select("div.col_score")[0].text).group())

        # fetch details
        company_id = company_a["href"].split("#")[-1]
        query = dict(action="getcompany", companyid=company_id)

        # use POST to get details JSON
        log.info("Details for {}".format(company))
        details = scrape_json(DETAILS_URL, data=urlencode(query))
        details = details[0][0]  # payload is wrapped in two nested lists

        c["url"] = details["ext_url"]

        # TODO: details['message'] might be useful too. It's a message
        # that participants are supposed to send to the company:
        # "Thank you for the leadership you have shown in working to..."

        yield "rating", r

        # NOTE(review): no explicit parser is passed to BeautifulSoup, so
        # the markup parser depends on what is installed; consider
        # BeautifulSoup(details["detail"], "html.parser") for determinism.
        detail_soup = BeautifulSoup(details["detail"])
        claim_lis = detail_soup.select("li")

        # First two bullet points are categories and a description
        # of the company's ranking (reversed for Nokia)
        # Last bullet point is what the company can do to improve its score.
        claim_lis = claim_lis[2:-1]

        # (index from enumerate() was unused; iterate directly)
        for claim_li in claim_lis:
            claim = claim_li.text

            judgment = claim_to_judgment(claim)

            claim = clarify_claim(claim, CLAIM_CLARIFICATIONS)

            yield "claim", dict(company=company, claim=claim, judgment=judgment)