# This example reads a given RSS or Atom newsfeed channel and prints its items.

# os and sys are needed by the sys.path line below; importing them here is
# harmless if they were already imported earlier in the file.
import os
import sys

# Put "../../.." (relative to the current working directory) first on the
# module search path -- presumably so the local pattern package is importable.
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.web import Newsfeed, plaintext, URL
from pattern.table import date

# Some sample newsfeeds to try out:
NATURE = "http://www.nature.com/nature/current_issue/rss/index.html"
SCIENCE = "http://www.sciencemag.org/rss/podcast.xml"
HERALD = "http://www.iht.com/rss/frontpage.xml"
TIME = "http://feeds.feedburner.com/time/topstories"
CNN = "http://rss.cnn.com/rss/edition.rss"

engine = Newsfeed()

for result in engine.search(CNN, cached=True):
    # Single-argument print(...) produces the same output on Python 2 and 3,
    # whereas the print statement is a SyntaxError on Python 3.
    print(result.title.upper())
    print(plaintext(result.description))  # Remove HTML formatting.
    print(result.url)
    print(result.date)
    # Blank separator line; print("") is used because print() would output
    # "()" on Python 2.
    print("")

# Newsfeed item URLs lead to the page with the full article.
# Since this page can have any kind of formatting, there is no default way to read it,
# but we can simply download the source HTML and convert it to plain text:

#html = URL(result.url).download()
#print(plaintext(html))

# The resulting text can contain a lot of garbage.
# A better way to do this is to use a DOM parser and select the HTML elements we want.
# This is demonstrated in the next example.
# This example reads a given RSS or Atom newsfeed channel and prints its items.

import os
import sys

# Put "../../.." (relative to the current working directory) first on the
# module search path -- presumably so the local pattern package is importable.
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.web import Newsfeed, plaintext, URL
from pattern.table import date

# Some sample newsfeeds to try out:
NATURE = "http://www.nature.com/nature/current_issue/rss/index.html"
SCIENCE = "http://www.sciencemag.org/rss/podcast.xml"
HERALD = "http://www.iht.com/rss/frontpage.xml"
TIME = "http://feeds.feedburner.com/time/topstories"
CNN = "http://rss.cnn.com/rss/edition.rss"

engine = Newsfeed()

for result in engine.search(CNN, cached=True):
    # Single-argument print(...) produces the same output on Python 2 and 3,
    # whereas the print statement is a SyntaxError on Python 3.
    print(result.title.upper())
    print(plaintext(result.description))  # Remove HTML formatting.
    print(result.url)
    print(result.date)
    # Blank separator line; print("") is used because print() would output
    # "()" on Python 2.
    print("")

# Newsfeed item URLs lead to the page with the full article.
# Since this page can have any kind of formatting, there is no default way to read it,
# but we can simply download the source HTML and convert it to plain text:

#html = URL(result.url).download()
#print(plaintext(html))

# The resulting text can contain a lot of garbage.
# A better way to do this is to use a DOM parser and select the HTML elements we want.
# This is demonstrated in the next example.
alchemyapi = AlchemyAPI() RSS_LIST = [ (u"Lifehacker", "http://feeds.gawker.com/lifehacker/vip"), (u"The Verge", "http://www.theverge.com/rss/index.xml"), (u"Naukas", "http://naukas.com/feed/"), (u"Zen Habits", "http://feeds.feedburner.com/zenhabits?format=xml"), (u"Yuri", "http://www.lapizarradeyuri.com/feed/"), (u"Menéame", "http://www.meneame.net/rss") ] items = [] for feed in RSS_LIST: feedlist = [] for result in reader.search(feed[1])[:10]: clean_text = plaintext(result.text) response = alchemyapi.entities("text", result.text) entities = [] for entity in response["entities"]: if entity.has_key("disambiguated"): dbpedia_uri = entity["disambiguated"]["dbpedia"] else: dbpedia_uri = None entities.append((entity["text"], dbpedia_uri)) feedlist.append(dict(title=result.title, url=result.url, text=clean_text, entities=entities)) items.append(dict(site=feed[0], feedlist=feedlist)) @app.route('/')
'https://ejbron.wordpress.com/feed/' } PATH = pd('news.csv') try: csv = Datasheet.load(PATH) seen = set(csv.columns[-2]) # use url as id except: csv = Datasheet() seen = set() for (label, name), url in sources.items(): try: f = Newsfeed() f = f.search(url, cached=False) except: continue for r in f: # 1) Download source & parse the HTML tree: try: src = URL(r.url).download(cached=True) dom = DOM(src) except Exception as e: continue # 2) Find article text w/ CSS selectors: for selector in ( "article[class*='node-article']", # The Hill
# Reads the Wall Street Journal newsfeed below and prints each item.

import os
import sys

# Append "../../.." (relative to the current working directory) to the module
# search path -- presumably so the local pattern package is importable.
sys.path.append(os.path.join("..", "..", ".."))

from pattern.web import Newsfeed, plaintext, URL
from pattern.table import date

wsj = "http://online.wsj.com/xml/rss/3_7014.xml"

engine = Newsfeed()

for result in engine.search(wsj, cached=True):
    # Single-argument print(...) produces the same output on Python 2 and 3,
    # whereas the print statement is a SyntaxError on Python 3.
    print(result.title.upper())
    print(plaintext(result.description))  # Remove HTML formatting.
    print(result.url)
    print(result.date)
    # Blank separator line; print("") is used because print() would output
    # "()" on Python 2.
    print("")