Example No. 1
def scrape_brand(url, sectors, soup=None):
    if soup is None:
        soup = scrape_soup(url)

    b = {}  # brand dict

    # brand
    brand_a = soup.select('dl.brands a')[0]
    b['brand'] = brand_a.dt.text
    log.info(u'Brand: {}'.format(b['brand']))

    # company
    sidebar_final_p = soup.select('#main div')[0].select('p')[-1]
    info_strs = list(sidebar_final_p.stripped_strings)
    i = info_strs.index('Brand owner:')
    b['company'] = info_strs[i + 1]

    # logo URL
    logo_imgs = soup.select('div.logobox img')
    if logo_imgs:
        logo_img = logo_imgs[0]
        b['logo_url'] = repair_url(urljoin(url, logo_img['src']))

    # categories: the deepest sector is the brand's category; consecutive
    # sectors are also emitted as category/subcategory pairs
    sectors = correct_sectors(sectors)
    b['category'] = sectors[-1]
    for i in range(len(sectors) - 1):
        yield 'subcategory', dict(category=sectors[i],
                                  subcategory=sectors[i + 1])

    # twitter handle
    for a in soup.select('ol#do-something a'):
        if a.text.strip().startswith('Nudge '):
            nudge_url = urljoin(url, a['href'])
            b['twitter_handle'] = (
                scrape_twitter_handle_from_nudge_url(nudge_url))

    # done with brand
    yield 'brand', b

    # rated? if not, bail out (see #10)
    rating_span = brand_a.span
    if any(c.startswith('not-ranked') for c in rating_span['class']):
        return

    # rating dict
    r = {'brand': b['brand'], 'company': b['company'], 'url': url}

    r['grade'] = rating_span['alt']
    r['judgment'] = grade_to_judgment(r['grade'])
    r['description'] = rating_span['title']

    # score
    score_a = soup.find('a', href='#detailed-report')
    score_parts = score_a.text.strip().split()
    r['score'] = int(score_parts[0])
    r['max_score'] = int(score_parts[-1])

    # last edited date
    brand_change_label = soup.find('span', class_='brand_change_label')
    m = DATE_RE.search(brand_change_label.text)
    if m:
        edit_date = parse_date(m.group(0))
        r['date'] = to_iso_date(edit_date)

    # rating scraped!
    yield 'rating', r

    # include claims from sustainability report
    for claim in scrape_claims(url, b['company'], b['brand'], soup):
        yield 'claim', claim
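
A minimal sketch of how the generator above might be consumed; handle_record is a hypothetical dispatcher, and url/sectors are whatever the caller already has (the helpers scrape_brand relies on, such as scrape_soup, correct_sectors and scrape_claims, are defined elsewhere in the scraper module):

for record_type, record in scrape_brand(url, sectors):
    # route 'brand', 'subcategory', 'rating' and 'claim' records to storage
    handle_record(record_type, record)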
Example No. 2
def scrape_rating_page(rating_id):
    url = RATINGS_URL + str(rating_id)
    soup = BeautifulSoup(scrape(url, headers={}), from_encoding='utf-8')

    d = {}
    d['url'] = url

    # handle header field (brand)
    brand = soup.select('.rating-name')[0].text.strip()
    log.info('Rating {}: {}'.format(rating_id, brand))

    # get logo image
    logo_url = None
    brand_logo_img = soup.find('img', alt='brand logo')
    if brand_logo_img and 'src' in brand_logo_img.attrs:
        logo_url = brand_logo_img['src']

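    # strip a recognized company-name suffix from the brand and merge in
    # the extra fields SUFFIXES associates with it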
    for suffix in SUFFIXES:
        if brand.endswith(suffix):
            brand = brand[:-len(suffix)]
            d.update(SUFFIXES[suffix])
            break
    d['brand'] = brand

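    # index each section-heading span (selected via 'td h3 span') by its
    # lowercased text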
    h3_spans = {
        span.text.strip().lower(): span
        for span in soup.select('td h3 span')
    }

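    # the table containing the "Scope" heading holds the scope, company,
    # industries, and date cells parsed below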
    scope_span = h3_spans['scope']
    scope_table = scope_span.find_parent('table')

    scope_tds = scope_table.select('tr td[colspan=3]')

    # handle "Rating applies to these products/ lines" field
    scope = scope_tds[0].text.strip()
    # fix dangling comma on "Woolworths manufactured apparel,"
    scope = scope.rstrip(',')

    if scope in SCOPE_CORRECTIONS:
        d.update(SCOPE_CORRECTIONS[scope])
    elif scope:
        d['scope'] = scope

    # handle "Rating based on assessment of" field
    company = scope_tds[1].text.strip()
    # fix e.g. "Clean Clothes, Inc.: Maggie's Organics"
    if company.endswith(': ' + brand):
        company = company[:-(2 + len(brand))]

    for prefix in COMPANY_PREFIXES:
        if company.startswith(prefix):
            company = company[len(prefix):].rstrip(')')
            d.update(COMPANY_PREFIXES[prefix])
            break
    for suffix in SUFFIXES:
        if company.endswith(suffix):
            company = company[:-len(suffix)]
            d.update(SUFFIXES[suffix])
            break

    # handle empty company field (e.g. Frontier)
    if not company:
        company = brand

    if company in COMPANY_CORRECTIONS:
        d.update(COMPANY_CORRECTIONS[company])
    else:
        d['company'] = company

    # handle "Industries" field
    #
    # in cases where a company is rated, this seems to be attached to
    # the company, not the specific brands, so it's okay to just
    # add this to the rating (whether it's a company or brand rating)
    categories = scope_tds[2].text.strip()
    if categories:
        d['categories'] = [c.strip() for c in categories.split(',')]

    # handle "Date Published" field
    date = to_iso_date(scope_tds[3].text.strip())
    # if no date, guess based on relevant report
    if not date and d.get('categories'):
        for category, year in REPORT_YEARS:
            if category in d['categories']:
                date = str(year)
                break

    if date is not None:
        d['date'] = date

    # handle grades
    gb_span = h3_spans['grade breakdown']
    gb_tr = gb_span.find_parent('tr').find_next_sibling('tr')

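    # map each graded area (e.g. "overall") to its letter grade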
    area_to_grade = {}
    for grade_span in gb_tr.select('span.grade_circle'):
        area = grade_span.next_sibling
        if not isinstance(area, unicode):
            area = area.text  # "Overall" is bolded, others are not
        area = area.lower().strip()
        grade = grade_span.text
        area_to_grade[area] = grade

    d['grade'] = area_to_grade['overall']

    # convert to judgment
    d['judgment'] = grade_to_judgment(d['grade'])

    # attach logo_url to brand or company as appropriate
    if logo_url:
        if 'brand' in d and 'rating_brands' not in d:
            yield 'brand', dict(
                company=d['company'], brand=d['brand'], logo_url=logo_url)
        else:
            yield 'company', dict(
                company=d['company'], logo_url=logo_url)

    # work out claims
    claims = []

    about_span = h3_spans.get('about this rating')
    if about_span:  # not all companies have this
        about_text = about_span.find_parent(
            'tbody').find_next_sibling('tbody').text

        # about_text looks like POLICIES: stuff. TRANSPARENCY: more stuff ...
        # need to convert this to area -> claim

        areas = []
        starts = []
        ends = []

        for m in CLAIM_AREA_RE.finditer(about_text):
            areas.append(m.group(1).lower())
            starts.append(m.start())
            ends.append(m.end())

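        # each area's claim text runs from the end of its own label match
        # to the start of the next label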
        for area, start, end in zip(areas, ends, starts[1:] + [-1]):
            area_claim = about_text[start:end]

            for claim in extract_claims(area_claim):
                judgment = judge_claim(claim)

                claims.append(
                    dict(company=company, claim=claim, judgment=judgment))

    # rate company or brands as appropriate
    if 'rating_brands' in d:
        rating_brands = d.pop('rating_brands')
        for rating_brand in rating_brands:
            rating = d.copy()
            rating['brand'] = rating_brand
            yield 'rating', rating

            for claim in claims:
                claim = claim.copy()
                claim['brand'] = rating_brand
                yield 'claim', claim
    else:
        rating = d.copy()
        if 'brand' in rating:
            rating['brands'] = [rating.pop('brand')]
        yield 'rating', rating
        for claim in claims:
            yield 'claim', claim
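
As with the first example, scrape_rating_page yields (record_type, record) pairs; a rough driver sketch, where MAX_RATING_ID and handle_record are assumptions for illustration rather than names from the original module:

for rating_id in range(1, MAX_RATING_ID + 1):
    for record_type, record in scrape_rating_page(rating_id):
        handle_record(record_type, record)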