def scrape_campaign():
    """Yield scraper records for this campaign.

    Yields a single ('campaign', dict) record built from the directory
    page, then every record produced by scrape_industry() for each
    industry listed in the page's industry <select> filter.
    """
    soup = scrape_soup(DIRECTORY_URL)

    record = {
        'campaign': CAMPAIGN,
        'url': CAMPAIGN_URL,
        'goal': GOAL,
        'author': AUTHOR,
        'copyright': scrape_copyright(soup),
        'facebook_url': scrape_facebook_url(soup),
        'twitter_handle': scrape_twitter_handle(soup),
    }
    yield 'campaign', record

    # Each non-empty <option> in the industry filter maps to a filtered
    # directory page; delegate the per-industry scraping.
    select = soup.find('select', id='edit-field-industry')
    for option in select.select('option'):
        industry = option.get('value')
        if not industry:
            continue
        industry_url = '{}?{}={}'.format(
            DIRECTORY_URL, select['name'], quote_plus(industry))
        for industry_record in scrape_industry(industry_url, industry):
            yield industry_record
def scrape_landing_page():
    """Scrape the campaign landing page.

    Returns:
        dict with 'signatories_url' (href of the "Signatories" link) and
        'campaign' (a campaign record with scraped copyright and twitter
        handle added).
    """
    d = {}
    soup = scrape_soup(URL)

    d['signatories_url'] = soup.find('a', text='Signatories')['href']

    # Shallow-copy CAMPAIGN instead of aliasing it: the original assigned
    # the module-level constant directly and then mutated it in place, so
    # every call polluted the shared constant. (The item assignments below
    # show CAMPAIGN is expected to be a mapping.)
    d['campaign'] = dict(CAMPAIGN)
    d['campaign']['copyright'] = scrape_copyright(soup)
    d['campaign']['twitter_handle'] = scrape_twitter_handle(soup)

    # doesn't accept donations; the whole point is that the garment
    # companies pay
    return d
def scrape_campaign():
    """Yield campaign, rating, and claim records from the main page.

    Yields:
        ('campaign', dict) once, then for each company row in the corank
        listing a ('rating', dict) record followed by ('claim', dict)
        records parsed from the company's detail JSON.

    Raises:
        ValueError: if the Donate link or the rating rows can't be found,
            or a row has no recognized rank class.
    """
    log.info("Main page")
    soup = scrape_soup(URL)

    # campaign record
    cn = {"url": URL, "goal": GOAL}

    # Title is "<campaign> | <author>". Collapse whitespace runs in each
    # half ("remove double spaces"); the original comment promised this
    # but never did it, leaving stray spaces around the pipe.
    campaign_part, author_part = soup.title.text.split("|")
    cn["campaign"] = " ".join(campaign_part.split())
    cn["author"] = " ".join(author_part.split())

    cn["copyright"] = scrape_copyright(soup)
    cn["facebook_url"] = scrape_facebook_url(soup)
    cn["twitter_handle"] = scrape_twitter_handle(soup)

    # get year: first integer in the first content <h2>
    cn["date"] = INT_RE.search(soup.select("div.content h2")[0].text).group()

    # find the Donate link and resolve it against the page URL
    for a in soup.findAll("a"):
        if a.text.strip() == "Donate":
            cn["donate_url"] = urljoin(URL, a["href"])
            break
    if "donate_url" not in cn:
        raise ValueError("Donate URL not found")

    yield "campaign", cn

    rating_divs = soup.select("div#corank div.row")
    if not rating_divs:
        raise ValueError("ratings not found")

    for div in rating_divs:
        c = {}
        r = {"company": c}

        company_a = div.select("a.coname")[0]
        company = company_a.text
        c["company"] = company

        teaser = div.select("span.teaser")[0].text
        # NOTE(review): this teaser-derived list is dead — it is
        # unconditionally overwritten with [CATEGORY] below. Kept so a
        # teaser that doesn't match CATEGORIES_RE still raises, as before.
        # TODO: decide whether these belong on the company record instead.
        r["categories"] = CATEGORIES_SEP.split(
            CATEGORIES_RE.match(teaser).group(1))

        # judgment: first rank class present on the row wins
        for rank_class, judgment in RANK_CLASS_TO_JUDGMENT.items():
            if div.select("span.rank." + rank_class):
                r["judgment"] = judgment
                break
        else:
            raise ValueError("rating for {} not found".format(r["company"]))

        r["score"] = int(
            INT_RE.search(div.select("div.col_score")[0].text).group())
        r["categories"] = [CATEGORY]

        # fetch details
        company_id = company_a["href"].split("#")[-1]
        query = dict(action="getcompany", companyid=company_id)

        # use POST to get details JSON
        log.info("Details for {}".format(company))
        details = scrape_json(DETAILS_URL, data=urlencode(query))
        details = details[0][0]  # wrapped in lists. why?

        c["url"] = details["ext_url"]
        # TODO: details['message'] might be useful too. It's a message
        # that participants are supposed to send to the company:
        # "Thank you for the leadership you have shown in working to..."

        yield "rating", r

        # NOTE(review): no explicit parser passed to BeautifulSoup, so it
        # picks whichever is installed; pinning one could change results.
        detail_soup = BeautifulSoup(details["detail"])
        claim_lis = detail_soup.select("li")
        # First two bullet points are categories and a description
        # of the company's ranking (reversed for Nokia)
        # Last bullet point is what the company can do to improve its score.
        claim_lis = claim_lis[2:-1]
        for claim_li in claim_lis:
            claim = claim_li.text
            judgment = claim_to_judgment(claim)
            claim = clarify_claim(claim, CLAIM_CLARIFICATIONS)
            yield "claim", dict(
                company=company, claim=claim, judgment=judgment)