def get_feed(blogurl, bloglist):
    """ Get a blog and write its contents out to a json file """
    try:
        feed_url = le.extract_feed_link_from_url(blogurl)
        fp = feedparser.parse(feed_url)
    except:
        print >> sys.stderr, "Unable to retrieve or parse %s" % blogurl
        return

    if not fp.entries:
        print >> sys.stderr, "Retrieved no entries from '%s'" % feed_url
        return None

    print >> sys.stderr, "Fetched %s entries from '%s'" % \
        (len(fp.entries), fp.feed.title.encode('ascii', 'ignore'))

    # The first element carries blog-level metadata; the individual posts follow it.
    blog_data = {'blogurl': blogurl,
                 'title': fp.feed.title,
                 'blogroll': le.extract_links_from_url(blogurl, bloglist)}
    blog_posts = [blog_data]

    for e in fp.entries:
        try:
            blog_posts.append({'blogtitle': fp.feed.title,
                               'content': cleanHtml(e.content[0].value),
                               'link': e.links[0].href,
                               'links': le.extract_links(e.content[0].value),
                               'bloglinks': le.extract_links_from_list(e.content[0].value, bloglist)})
        except AttributeError:
            # Some feeds expose only a summary rather than full content.
            blog_posts.append({'blogtitle': fp.feed.title,
                               'content': cleanHtml(e.summary),
                               'link': e.links[0].href,
                               'links': le.extract_links(e.summary),
                               'bloglinks': le.extract_links_from_list(e.summary, bloglist)})

    if not os.path.isdir('out'):
        os.mkdir('out')

    out_file = '%s.json' % fm.munge(fp.feed.title)
    f = codecs.open(os.path.join(os.getcwd(), 'out', out_file), 'w', encoding='iso-8859-1')
    f.write(json.dumps(blog_posts))
    f.close()

    print >> sys.stderr, 'Wrote output file to %s' % (f.name, )
    return f.name
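# A minimal usage sketch (the URLs below are placeholders, not from the original
# code): fetch each blog in a list, passing the whole list so the le helpers can
# pick out links that point at other blogs in the same collection.
if __name__ == '__main__':
    bloglist = ['http://example.com/blog/', 'http://blog.example.org/']
    for url in bloglist:
        get_feed(url, bloglist)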
def test_munge(self):
    fn = "t\\h!s is.a?bad file'n|ame"
    self.assertEqual(fm.munge(fn), "t_h_s_is_a_bad_file_n_ame")
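# fm.munge is not shown in this section; a minimal sketch consistent with the
# expected output in test_munge above (every character outside [A-Za-z0-9_]
# replaced by an underscore) could look like this -- an assumption, not the
# actual fm module:
import re

def munge(filename):
    # Replace any character that is unsafe in a filename with an underscore.
    return re.sub(r'\W', '_', filename)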