Example #1
0
class IssuesScraper(Scraper):
    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/issues/feed/"
        self.html = HTMLParser()
        self.issue_provider = IssueProvider()

    def collect_urls(self):
        records = []
        items = self.get(self.url).findAll("item")
        for item in items:
            record = {
                "title": self.html.unescape(item.title.text),
                "timestamp_publish": parser.parse(item.pubdate.text),
                "site": "berniesanders.com",
                "lang": "en",
                "description_html": item.description.text,
                "description": self.html.unescape(BeautifulSoup(item.description.text).p.text),
                "url": item.link.nextSibling,
            }
            records.append(record)
        return records

    def retrieve(self, record):

        soup = self.get(record["url"])

        # retrieve image from <meta property="og:image" content="https://berniesanders.com/wp-content/uploads/2015/07/072615_Bernie_NewOrleans-4382.jpg"/>
        meta_image = soup.findAll(attrs={"property": "og:image"})
        record["image_url"] = meta_image[0]["content"].encode("utf8")

        # reset soup to content
        soup = self.sanitize_soup(soup.find("section", {"id": "content"}))
        while soup.article.style is not None:
            soup.article.style.extract()
        record["body_html"] = str(soup.article)
        text = []
        for elem in soup.article.recursiveChildGenerator():
            if isinstance(elem, types.StringTypes):
                text.append(self.html.unescape(elem.strip()))
            elif elem.name == "br":
                text.append("")
        record["body"] = "\n".join(text)

        return record

    def go(self):
        urls = self.collect_urls()
        if not urls:
            logging.critical("Could not retrieve issues.")
            sys.exit(1)
        for url in urls:
            record = self.retrieve(url)
            if self.issue_provider.exists_by_url(record["url"]):
                print "found"
            else:
                msg = "Inserting record for '{0}'."
                logging.info(msg.format(record["title"].encode("utf8")))
                record["timestamp_creation"] = datetime.now()
                self.issue_provider.create(record)
Example #2
0
 def __init__(self):
     """Set up the scraper for the hard-coded issues feed URL.

     Initializes the base Scraper, then constructs the HTML unescaper
     and the issue/push data providers this scraper relies on.
     """
     Scraper.__init__(self)
     self.url = "https://berniesanders.com/issues/feed/"
     self.html = HTMLParser()
     self.issue_provider = IssueProvider()
     self.push_provider = PushProvider()
Example #3
0
 def __init__(self, url):
     """Set up the scraper for one feed.

     :param url: feed URL this scraper instance will fetch.

     Initializes the base Scraper, then constructs the HTML unescaper
     and the issue/push data providers this scraper relies on.
     """
     Scraper.__init__(self)
     self.url = url
     self.html = HTMLParser()
     self.issue_provider = IssueProvider()
     self.push_provider = PushProvider()
Example #4
0
 def __init__(self, url):
     """Initialize scraper state for a single feed URL.

     :param url: feed URL this scraper instance will fetch.

     Calls the base Scraper initializer first, then builds the HTML
     unescaper and the issue/push providers used for persistence.
     """
     Scraper.__init__(self)
     self.url = url
     self.html = HTMLParser()
     self.issue_provider = IssueProvider()
     self.push_provider = PushProvider()
Example #5
0
class IssuesScraper(Scraper):
    def __init__(self, url):
        Scraper.__init__(self)
        self.url = url
        self.html = HTMLParser()
        self.issue_provider = IssueProvider()
        self.push_provider = PushProvider()

    def collect_urls(self):
        records = []
        r = self.get(self.url)
        try:
            lang = {"en-us": "en", "es-es": "es"}[r.language.string.lower()]
        except KeyError:
            lang = "en"
        items = r.findAll("item")
        for item in items:
            record = {
                "title":
                self.html.unescape(item.title.text),
                "timestamp_publish":
                parser.parse(item.pubdate.text),
                "site":
                "berniesanders.com",
                "lang":
                lang,
                "description":
                self.html.unescape(
                    BeautifulSoup(item.description.text).p.text),
                "url":
                item.link.nextSibling
            }
            records.append(record)
        return records

    def retrieve(self, record):

        soup = self.get(record["url"])

        # retrieve image from <meta property="og:image" content="https://berniesanders.com/wp-content/uploads/2015/07/072615_Bernie_NewOrleans-4382.jpg"/>
        meta_image = soup.findAll(attrs={"property": "og:image"})
        record["image_url"] = meta_image[0]["content"].encode('utf8')

        # reset soup to content
        soup = self.sanitize_soup(soup.find("section", {"id": "content"}))
        while soup.article.style is not None:
            soup.article.style.extract()
        text = []
        for elem in soup.article.recursiveChildGenerator():
            if isinstance(elem, types.StringTypes):
                text.append(self.html.unescape(elem.strip()))
            elif elem.name == 'br':
                text.append("")
        record["body"] = "\n".join(text)
        record['body_markdown'] = convert_markdown(str(soup.article))

        return record

    def go(self):
        urls = self.collect_urls()
        if not urls:
            logging.critical("Could not retrieve issues.")
            sys.exit(1)
        for url in urls:
            record = self.retrieve(url)
            if self.issue_provider.exists_by_url(record["url"]):
                print "found"
            else:
                msg = "Inserting record for '{0}'."
                logging.info(msg.format(record["title"].encode("utf8")))
                record["timestamp_creation"] = datetime.now()
                result = self.issue_provider.create(record)
Example #6
0
@app.route('/issue/<uuid:issue_uuid>', methods=['GET', 'POST'])
@auth.login_required
def issue_detail(issue_uuid):
    """Render one issue; on POST, apply the submitted update first."""
    issue = issue_provider.read(issue_uuid)
    # bool() keeps the template flag a strict True/False, exactly as the
    # original two-step assignment did; short-circuit means update() is
    # only attempted on POST.
    updated = bool(request.method == 'POST'
                   and issue_provider.update(issue, request))
    return render_template('issue.html', issue=issue, updated=updated)


if __name__ == '__main__':
    config_path = '/opt/bernie/config.yml'
    try:
        with open(config_path, 'r') as f:
            # NOTE(review): yaml.load without an explicit Loader can execute
            # arbitrary constructors; fine for this local trusted config,
            # but prefer yaml.safe_load.
            conf = yaml.load(f)['flask']
    except IOError:
        msg = "Could not open config file: {0}"
        # Bug fix: this is module-level code, so the old
        # `self.configfile` raised NameError here and masked the
        # original IOError. Log the actual path instead.
        logging.info(msg.format(config_path))
        raise
    else:
        event_provider = EventProvider()
        issue_provider = IssueProvider()
        video_provider = VideoProvider()
        article_provider = ArticleProvider()
        news_provider = NewsProvider()
        push_provider = PushProvider()
        users = {conf['httpauth_username']: conf['httpauth_password']}
        # Bug fix: register with Parse BEFORE starting the server —
        # app.run() blocks, so the old placement meant register() only
        # ran after the server shut down.
        register(conf['parse_application_id'], conf['parse_rest_api_key'],
                 conf['parse_master_key'])
        #Push.message("Good morning", channels=["Mike Testing"])
        app.run(host=conf['host'], debug=conf['debug'])
Example #7
0
 def __init__(self):
     """Set up the scraper for the hard-coded issues feed URL.

     Initializes the base Scraper, then constructs the HTML unescaper
     and the issue provider used for persistence. Unlike the other
     variants in this file, no push provider is created here.
     """
     Scraper.__init__(self)
     self.url = "https://berniesanders.com/issues/feed/"
     self.html = HTMLParser()
     self.issue_provider = IssueProvider()