Пример #1
0
class FreakonomicsParser(object):
    URL = u"http://feedproxy.google.com/freakonomicsblog"

    def __init__(self):
        self.agent = Agent()

    def fetch_url(self):
        return self.agent.fetch(self.URL)

    def parse(self):
        tree = etree.fromstring(self.fetch_url(), base_url=self.URL)

        self.f = feedgenerator.Atom1Feed(
            title=s(tree.xpath('/rss/channel/title/text()')),
            link=s(tree.xpath('/rss/channel/link/text()')),
            description=s(tree.xpath('/rss/channel/description/text()')),
            language=s(tree.xpath('/rss/channel/language/text()')))

        for item in tree.xpath('/rss/channel/item'):
            self.parse_item(item)

        return self.f

    def parse_item(self, item):
        n = {'feedburner':'http://rssnamespace.org/feedburner/ext/1.0',
             'dc': 'http://purl.org/dc/elements/1.1/'}

        title = s(item.xpath('title/text()'))
        link = s(item.xpath('feedburner:origLink/text()', namespaces=n))
        pubdate = convert_date(s(item.xpath('pubDate/text()')))
        author_name = s(item.xpath('dc:creator/text()', namespaces=n))
        categories = item.xpath('category/text()')

        if cache.has(link+"#description", dynamic=False):
            description = cache.get(link+"#description")
        else:
            soup = BeautifulSoup(self.agent.fetch(link))
            div = soup.find('div', {'class': 'entry-content'})
            description = div.prettify()
            cache.set(link+"#description", description)

        self.f.add_item(title, link, description, author_name=author_name,
            pubdate=pubdate, categories=categories)
Пример #2
0
 def __init__(self):
     self.agent = Agent()