class IssuesScraper(Scraper): def __init__(self): Scraper.__init__(self) self.url = "https://berniesanders.com/issues/feed/" self.html = HTMLParser() self.issue_provider = IssueProvider() def collect_urls(self): records = [] items = self.get(self.url).findAll("item") for item in items: record = { "title": self.html.unescape(item.title.text), "timestamp_publish": parser.parse(item.pubdate.text), "site": "berniesanders.com", "lang": "en", "description_html": item.description.text, "description": self.html.unescape(BeautifulSoup(item.description.text).p.text), "url": item.link.nextSibling, } records.append(record) return records def retrieve(self, record): soup = self.get(record["url"]) # retrieve image from <meta property="og:image" content="https://berniesanders.com/wp-content/uploads/2015/07/072615_Bernie_NewOrleans-4382.jpg"/> meta_image = soup.findAll(attrs={"property": "og:image"}) record["image_url"] = meta_image[0]["content"].encode("utf8") # reset soup to content soup = self.sanitize_soup(soup.find("section", {"id": "content"})) while soup.article.style is not None: soup.article.style.extract() record["body_html"] = str(soup.article) text = [] for elem in soup.article.recursiveChildGenerator(): if isinstance(elem, types.StringTypes): text.append(self.html.unescape(elem.strip())) elif elem.name == "br": text.append("") record["body"] = "\n".join(text) return record def go(self): urls = self.collect_urls() if not urls: logging.critical("Could not retrieve issues.") sys.exit(1) for url in urls: record = self.retrieve(url) if self.issue_provider.exists_by_url(record["url"]): print "found" else: msg = "Inserting record for '{0}'." logging.info(msg.format(record["title"].encode("utf8"))) record["timestamp_creation"] = datetime.now() self.issue_provider.create(record)
def __init__(self):
    # Initialize the base scraper first so shared scraper state exists
    # before this subclass adds its own collaborators.
    Scraper.__init__(self)
    # Fixed source feed for issue posts.
    self.url = "https://berniesanders.com/issues/feed/"
    # Used to unescape HTML entities found in feed text.
    self.html = HTMLParser()
    # Persistence for scraped issues; presumably backed by a datastore —
    # TODO confirm against the provider implementation.
    self.issue_provider = IssueProvider()
    # Push-notification access for newly scraped content.
    self.push_provider = PushProvider()
def __init__(self, url):
    # Initialize the base scraper first so shared scraper state exists
    # before this subclass adds its own collaborators.
    Scraper.__init__(self)
    # Feed URL is caller-supplied here (unlike earlier revisions that
    # hard-coded it), allowing per-language feeds.
    self.url = url
    # Used to unescape HTML entities found in feed text.
    self.html = HTMLParser()
    # Persistence for scraped issues.
    self.issue_provider = IssueProvider()
    # Push-notification access for newly scraped content.
    self.push_provider = PushProvider()
class IssuesScraper(Scraper): def __init__(self, url): Scraper.__init__(self) self.url = url self.html = HTMLParser() self.issue_provider = IssueProvider() self.push_provider = PushProvider() def collect_urls(self): records = [] r = self.get(self.url) try: lang = {"en-us": "en", "es-es": "es"}[r.language.string.lower()] except KeyError: lang = "en" items = r.findAll("item") for item in items: record = { "title": self.html.unescape(item.title.text), "timestamp_publish": parser.parse(item.pubdate.text), "site": "berniesanders.com", "lang": lang, "description": self.html.unescape( BeautifulSoup(item.description.text).p.text), "url": item.link.nextSibling } records.append(record) return records def retrieve(self, record): soup = self.get(record["url"]) # retrieve image from <meta property="og:image" content="https://berniesanders.com/wp-content/uploads/2015/07/072615_Bernie_NewOrleans-4382.jpg"/> meta_image = soup.findAll(attrs={"property": "og:image"}) record["image_url"] = meta_image[0]["content"].encode('utf8') # reset soup to content soup = self.sanitize_soup(soup.find("section", {"id": "content"})) while soup.article.style is not None: soup.article.style.extract() text = [] for elem in soup.article.recursiveChildGenerator(): if isinstance(elem, types.StringTypes): text.append(self.html.unescape(elem.strip())) elif elem.name == 'br': text.append("") record["body"] = "\n".join(text) record['body_markdown'] = convert_markdown(str(soup.article)) return record def go(self): urls = self.collect_urls() if not urls: logging.critical("Could not retrieve issues.") sys.exit(1) for url in urls: record = self.retrieve(url) if self.issue_provider.exists_by_url(record["url"]): print "found" else: msg = "Inserting record for '{0}'." logging.info(msg.format(record["title"].encode("utf8"))) record["timestamp_creation"] = datetime.now() result = self.issue_provider.create(record)
@app.route('/issue/<uuid:issue_uuid>', methods=['GET', 'POST'])
@auth.login_required
def issue_detail(issue_uuid):
    """Render a single issue's edit page; on POST, apply the submitted
    update and report whether it succeeded."""
    issue = issue_provider.read(issue_uuid)
    updated = False
    if request.method == 'POST' and issue_provider.update(issue, request):
        updated = True
    return render_template('issue.html', issue=issue, updated=updated)


if __name__ == '__main__':
    configfile = '/opt/bernie/config.yml'
    try:
        with open(configfile, 'r') as f:
            # NOTE(review): yaml.load without an explicit Loader can run
            # arbitrary constructors on untrusted YAML; prefer
            # yaml.safe_load if the config file is plain data.
            conf = yaml.load(f)['flask']
    except IOError:
        # BUG FIX: this handler previously formatted the message with
        # self.configfile — there is no `self` at module level, so a
        # missing config file raised NameError instead of logging the
        # intended message before re-raising.
        msg = "Could not open config file: {0}"
        logging.info(msg.format(configfile))
        raise
    else:
        # Providers are created at module scope so the route handlers
        # above can reference them.
        event_provider = EventProvider()
        issue_provider = IssueProvider()
        video_provider = VideoProvider()
        article_provider = ArticleProvider()
        news_provider = NewsProvider()
        push_provider = PushProvider()
        users = {conf['httpauth_username']: conf['httpauth_password']}
        app.run(host=conf['host'], debug=conf['debug'])
        # NOTE(review): app.run() blocks until the server stops, so the
        # Parse registration below only runs at shutdown — confirm this
        # ordering is intended.
        register(conf['parse_application_id'], conf['parse_rest_api_key'],
                 conf['parse_master_key'])
        #Push.message("Good morning", channels=["Mike Testing"])
def __init__(self):
    # Initialize the base scraper first so shared scraper state exists
    # before this subclass adds its own collaborators.
    Scraper.__init__(self)
    # Fixed source feed for issue posts.
    self.url = "https://berniesanders.com/issues/feed/"
    # Used to unescape HTML entities found in feed text.
    self.html = HTMLParser()
    # Persistence for scraped issues.
    self.issue_provider = IssueProvider()