def scrape_campaign():
    """Yield the campaign record, then every record scraped from each
    industry listed in the directory's industry <select>."""
    soup = scrape_soup(DIRECTORY_URL)

    # static campaign metadata, plus fields scraped off the directory page
    campaign = dict(
        campaign=CAMPAIGN,
        url=CAMPAIGN_URL,
        goal=GOAL,
        author=AUTHOR,
    )
    campaign['copyright'] = scrape_copyright(soup)
    campaign['facebook_url'] = scrape_facebook_url(soup)
    campaign['twitter_handle'] = scrape_twitter_handle(soup)
    yield 'campaign', campaign

    # each non-empty option value becomes a filtered directory URL
    industry_select = soup.find('select', id='edit-field-industry')
    for option in industry_select.select('option'):
        industry = option.get('value')
        if not industry:
            continue
        industry_url = '{}?{}={}'.format(
            DIRECTORY_URL, industry_select['name'], quote_plus(industry))
        for record in scrape_industry(industry_url, industry):
            yield record
def scrape_campaign(url=URL):
    """Scrape the landing page at *url*; yield the campaign record
    followed by every record from each sector page."""
    log.info('Landing Page')
    soup = scrape_soup(url)

    campaign = {}  # campaign dict
    # title looks like "... | goal | campaign"; keep the last two parts
    campaign['goal'], campaign['campaign'] = soup.title.text.split('|')[-2:]
    campaign['goal'] = campaign['goal'].capitalize()  # for consistency
    campaign['url'] = url
    # there isn't a copyright notice on the page!
    campaign['donate_url'] = urljoin(
        url, soup.find('a', text='Support us')['href'])
    campaign['facebook_url'] = scrape_facebook_url(soup)

    # apply known corrections to the scraped twitter handle
    handle = scrape_twitter_handle(soup)
    campaign['twitter_handle'] = TWITTER_CORRECTIONS.get(
        handle.lower(), handle)

    yield 'campaign', campaign

    for sector_a in soup.select('ul.sectors a'):
        sector_name = sector_a.text
        sector_url = urljoin(url, sector_a['href'])
        for record in scrape_sector(sector_url, sector_name):
            yield record
def main():
    """Scrape twitter handles / facebook URLs for a set of URLs and upsert
    the results into the `url` table.

    URLs come from (in priority order): the command line, the MORPH_URLS
    environment variable, or the `url` columns of the source databases.

    Raises:
        Exception: if more than MAX_PROPORTION_FAILURES of the URLs fail.
    """
    opts = parse_args()
    log_to_stderr(verbose=opts.verbose, quiet=opts.quiet)

    if opts.urls:
        all_urls = opts.urls
    elif environ.get('MORPH_URLS'):
        # materialize with list(): filter() is lazy on Python 3, which
        # would break len(all_urls) below
        all_urls = list(filter(None, environ['MORPH_URLS'].split()))
    else:
        all_urls = set()
        for db_name in SOURCE_DBS:
            download_db(db_name)
            db = open_db(db_name)
            for table in show_tables(db):
                if table in SKIP_TABLES:
                    continue
                urls = select_urls(db, table)
                if urls:
                    log.info('read {} urls from {}.{}'.format(
                        len(urls), db_name, table))
                    all_urls.update(urls)

    create_table_if_not_exists('url', with_scraper_id=False)
    dt = open_dt()

    failures = []  # tuples of (url, exception)

    for i, url in enumerate(sorted(all_urls)):
        log.info('scraping {} ({} of {})'.format(
            url, i + 1, len(all_urls)))
        try:
            html = scrape(url)
            soup = BeautifulSoup(html)
            row = dict(url=url, last_scraped=iso_now())
            row['twitter_handle'] = scrape_twitter_handle(
                soup, required=False)
            row['facebook_url'] = scrape_facebook_url(
                soup, required=False)
            log.debug('`url`: {}'.format(repr(row)))
            dt.upsert(row, 'url')
        except Exception as e:
            # record the failure but keep scraping the remaining URLs
            failures.append((url, e))
            print_exc()

    # show a summary of failures
    if failures:
        # bug fix: was `> 2`, which printed e.g. "2 of 10 URL:" without
        # the plural 's'; pluralize for any count other than 1
        log.warn('Failed to scrape {} of {} URL{}:'.format(
            len(failures), len(all_urls),
            's' if len(failures) != 1 else ''))

        for url, e in failures:
            log.warn(u' {}: {}'.format(url, repr(e)))

        if len(failures) > len(all_urls) * MAX_PROPORTION_FAILURES:
            raise Exception('too many failures')
def scrape_landing_page():
    """Scrape the landing page.

    Returns:
        dict with 'signatories_url' (href of the Signatories link) and
        'campaign' (a copy of CAMPAIGN augmented with scraped fields).
    """
    d = {}
    soup = scrape_soup(URL)

    d['signatories_url'] = soup.find('a', text='Signatories')['href']

    # bug fix: copy CAMPAIGN before augmenting it; the original assigned
    # the module-level dict directly, so the item assignments below
    # mutated the shared CAMPAIGN constant on every call
    d['campaign'] = dict(CAMPAIGN)
    d['campaign']['copyright'] = scrape_copyright(soup)
    d['campaign']['twitter_handle'] = scrape_twitter_handle(soup)
    # doesn't accept donations; the whole point is that the garment
    # companies pay

    return d
def scrape_campaign():
    """Yield the campaign record, then one rating record per table row
    of the report card."""
    soup = scrape_soup(URL)

    # campaign record
    campaign = {'url': URL, 'goal': GOAL}
    campaign['campaign'], campaign['author'] = soup.title.text.split('|')
    # remove double spaces
    footer_privacy = soup.select('#footer ul.privacy')[0]
    campaign['copyright'] = ' '.join(footer_privacy.li.stripped_strings)
    campaign['twitter_handle'] = scrape_twitter_handle(soup)
    # TODO: make a method for scraping facebook URLs
    campaign['facebook_url'] = soup.select('a.facebook')[0]['href']
    campaign['donate_url'] = urljoin(
        URL, soup.select('a.donate')[0]['href'])
    yield 'campaign', campaign

    # rating records
    rows = soup.table.findAll('tr')
    num_ranked = len(rows)
    for row in rows:
        # header reads like "COMPANY NAME score/max_score"
        caps_name, score_str, max_score_str = HEADER_RE.match(
            row.h2.text.strip()).groups()
        score = Decimal(score_str)
        max_score = int(max_score_str)
        rank = int(IMG_RE.match(row.img['alt'].strip()).group(1))

        # get company name not in ALL CAPS
        company = REPORT_CARD_RE.match(row.a.text.strip()).group(1)
        if company.upper() != caps_name.upper():
            raise ValueError(
                u"Non-matching company name: {}".format(company))

        yield 'rating', {
            'company': company,
            'score': score,
            'max_score': max_score,
            'rank': rank,
            'num_ranked': num_ranked,
            'judgment': score_to_judgment(score),
            'categories': [CATEGORY],
        }
def do_corp(url, industry):
    """Scrape a single B Corp business page.

    Yields 'claim' records built from the Company Highlights section,
    then one 'rating' record (judgment 1 -- listing in the directory is
    itself the endorsement).

    url -- the business page URL (last path segment is the business id)
    industry -- industry name from the directory, used as a category
    """
    biz_id = url.split('/')[-1]

    # whitelist of businesses (comma-separated ids in the environment)
    if 'MORPH_B_CORP_BIZ_IDS' in environ:
        if biz_id not in environ['MORPH_B_CORP_BIZ_IDS'].split(','):
            return

    log.info('Business page: {}'.format(biz_id))
    try:
        html = scrape(url)
    except HTTPError as e:
        # two known failure modes on this site are skipped, anything
        # else propagates
        if 'infinite loop' in e.msg:
            log.warn('infinite loop when fetching {}'.format(url))
            return
        elif e.code == 403 and e.geturl() != url:
            log.warn('redirect to bad URL: {}'.format(url))
            return
        else:
            raise

    soup = BeautifulSoup(html)

    c = {}  # company record (embedded in the rating)
    # just being in the directory gets you a good judgment
    r = {'judgment': 1, 'company': c, 'url': url}

    # scrape score anyway
    # some pages don't have score (e.g.
    # http://www.bcorporation.net/community/farm-capital-services-llc-0)
    score_div = soup.find('div', class_='field-name-field-overall-b-score')
    if score_div:
        r['score'] = int(score_div.text)
        r['max_score'] = MAX_SCORE

    c['company'] = soup.select('h1#page-title')[0].text

    # use both industry and category on page (industry is more consistent)
    c['categories'] = [industry]
    # *almost* all bizs have their own category description, but not all
    category_h3s = soup.select('.company-desc-inner h3')
    if category_h3s:
        cat = category_h3s[0].text.strip()
        if cat:
            c['categories'].append(cat)

    # social media
    left_col = soup.select('.two-col.last')[0]
    c['twitter_handle'] = scrape_twitter_handle(left_col, required=False)
    c['facebook_url'] = scrape_facebook_url(left_col, required=False)

    homepage_as = soup.select('.company-desc-inner a')
    if homepage_as:
        c['url'] = homepage_as[0]['href']

    # logo not always available; e.g. on
    # http://www.bcorporation.net/community/atayne-llc
    logo_img = soup.find('img', class_='image-style-company-logo-full')
    if logo_img:
        c['logo_url'] = urljoin(url, logo_img['src'])

    # TODO: add store_url. This is in the lower-right box,
    # but not consistently formatted. Examples:
    # http://www.bcorporation.net/community/one-village-coffee-llc
    # http://www.bcorporation.net/community/feelgoodz-llc

    # turn Company Highlights into claims
    ch_section = soup.find(
        'section', class_='field-name-field-company-highlights')
    if ch_section:
        claims = []
        for strong in ch_section.select('strong'):
            # NOTE: `unicode` check means this module is Python 2 only
            if isinstance(strong.nextSibling, unicode):
                # the colon for the heading isn't inside <strong>
                claims.extend(strong.nextSibling.lstrip(':').split(';'))
            elif strong.nextSibling is None:
                claims.extend(strong.stripped_strings)

        for claim in claims:
            claim = claim.strip()
            if claim:
                yield 'claim', dict(
                    company=c['company'], claim=claim, judgment=1)

    yield 'rating', r
def scrape_campaign():
    """Scrape the corank-based campaign page.

    Yields the campaign record, then a 'rating' record and per-bullet
    'claim' records for each company row.
    """
    log.info("Main page")
    soup = scrape_soup(URL)

    # campaign record
    cn = {"url": URL, "goal": GOAL}
    cn["campaign"], cn["author"] = soup.title.text.split("|")
    # remove double spaces
    cn["copyright"] = scrape_copyright(soup)
    cn["facebook_url"] = scrape_facebook_url(soup)
    cn["twitter_handle"] = scrape_twitter_handle(soup)
    # get year
    cn["date"] = INT_RE.search(soup.select("div.content h2")[0].text).group()

    # find the donate link by its visible text
    for a in soup.findAll("a"):
        if a.text.strip() == "Donate":
            cn["donate_url"] = urljoin(URL, a["href"])
            break
    if "donate_url" not in cn:
        raise ValueError("Donate URL not found")

    yield "campaign", cn

    rating_divs = soup.select("div#corank div.row")
    if not rating_divs:
        raise ValueError("ratings not found")

    for div in rating_divs:
        c = {}  # company record (embedded in the rating)
        r = {"company": c}

        company_a = div.select("a.coname")[0]
        company = company_a.text
        c["company"] = company

        teaser = div.select("span.teaser")[0].text
        # NOTE(review): this value is unconditionally overwritten with
        # [CATEGORY] below, so only the regex's match-or-AttributeError
        # side effect survives -- looks like leftover code; confirm
        # before removing
        r["categories"] = CATEGORIES_SEP.split(
            CATEGORIES_RE.match(teaser).group(1))

        # judgment is encoded as a CSS class on the rank <span>
        for rank_class, judgment in RANK_CLASS_TO_JUDGMENT.items():
            if div.select("span.rank." + rank_class):
                r["judgment"] = judgment
                break
        else:
            # NOTE(review): r["company"] here is the dict c, so the
            # message interpolates a dict repr rather than the name
            raise ValueError("rating for {} not found".format(r["company"]))

        r["score"] = int(
            INT_RE.search(div.select("div.col_score")[0].text).group())
        r["categories"] = [CATEGORY]

        # fetch details
        company_id = company_a["href"].split("#")[-1]
        query = dict(action="getcompany", companyid=company_id)

        # use POST to get details JSON
        log.info("Details for {}".format(company))
        details = scrape_json(DETAILS_URL, data=urlencode(query))
        details = details[0][0]  # wrapped in lists. why?

        c["url"] = details["ext_url"]
        # TODO: details['message'] might be useful too. It's a message
        # that participants are supposed to send to the company:
        # "Thank you for the leadership you have shown in working to..."

        yield "rating", r

        # per-company claims live in an HTML fragment inside the JSON
        detail_soup = BeautifulSoup(details["detail"])
        claim_lis = detail_soup.select("li")
        # First two bullet points are categories and a description
        # of the company's ranking (reversed for Nokia)
        # Last bullet point is what the company can do to improve its score.
        claim_lis = claim_lis[2:-1]
        for i, claim_li in enumerate(claim_lis):
            claim = claim_li.text
            judgment = claim_to_judgment(claim)
            claim = clarify_claim(claim, CLAIM_CLARIFICATIONS)
            yield "claim", dict(company=company, claim=claim,
                                judgment=judgment)