Example #1
from pattern.web import Newsfeed, plaintext, HTTP404NotFound
from pattern.db import date
from pattern.vector import Model, Document, LEMMA


def feeds_to_trends(feeds):
    for feed in feeds:
        url = feed['feed_url']
        news = {}
        try:
            for story in Newsfeed().search(url, cached=False):
                # Normalize the date and strip HTML from the description.
                d = str(date(story.date, format='%Y-%m-%d'))
                s = plaintext(story.description)

                # Each key in the news dictionary is a date: news is grouped per day.
                # Each value is a dictionary of id => story items.
                # We use hash(story.description) as a unique id to avoid duplicate
                # content.
                news.setdefault(d, {})[hash(s)] = s

            m = Model()
            for day, stories in news.items():
                s = stories.values()
                s = ' '.join(s).lower()
                # Each day of news is a single document.
                # By adding all documents to a model we can calculate tf-idf.
                m.append(Document(s, stemmer=LEMMA, exclude=['news', 'day'], name=day))

            for document in m:
                print document.name
                print document.keywords(top=10)
        except HTTP404NotFound:
            print url
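
# A minimal invocation sketch; each item in `feeds` is assumed to be a
# dictionary with a 'feed_url' key (the URL below is just an illustration):
feeds_to_trends([{'feed_url': 'http://news.google.com/news?output=rss'}])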
Example #2
from pattern.web import Newsfeed


def article_titles(feeds):
    # Map each key in `feeds` to a list of current article titles in its feed.
    titles = {}
    for key in feeds:
        titles[key] = []
        newsfeed = Newsfeed().search(feeds[key])
        for result in newsfeed:
            titles[key].append(result.title)
    return titles
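
# A hypothetical call, using the CNN feed from the next example:
print article_titles({'cnn': 'http://rss.cnn.com/rss/edition.rss'})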
Example #3
import os, sys
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.web import Newsfeed, plaintext, URL
from pattern.table import date

# This example reads a given RSS or Atom newsfeed channel.
# Some sample newsfeeds to try out:
NATURE = "http://www.nature.com/nature/current_issue/rss/index.html"
SCIENCE = "http://www.sciencemag.org/rss/podcast.xml"
HERALD = "http://www.iht.com/rss/frontpage.xml"
TIME = "http://feeds.feedburner.com/time/topstories"
CNN = "http://rss.cnn.com/rss/edition.rss"

engine = Newsfeed()

for result in engine.search(CNN, cached=True):
    print result.title.upper()
    print plaintext(result.description)  # Remove HTML formatting.
    print result.url
    print result.date
    print

# Newsfeed item URLs lead to the page with the full article.
# Since this page can have any kind of formatting, there is no default way to read it,
# but we can simply download the source HTML and convert it to plain text:
#html = URL(result.url).download()
#print plaintext(html)

# The resulting text can contain a lot of garbage.
# A better way is to use a DOM parser and select the HTML elements we want.
# This is demonstrated in the next example.
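
# A hedged sketch of that DOM approach (assumes the loop above ran, so
# `result` holds the last feed item; pattern.web's DOM supports CSS selectors):
from pattern.web import DOM

html = URL(result.url).download(cached=True)
dom = DOM(html)
for p in dom('p'):  # A real page needs a selector tuned to its markup.
    print plaintext(p.content)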
Example #4
    'geeky': 'http://feeds.feedburner.com/daily-star-Tech',
    'dubious': 'http://feeds.feedburner.com/daily-star-Weird-News',
    'vulgar': 'http://feeds.feedburner.com/daily-star-Love-Sex',
}

PATH = pd('..', 'data', 'news2.csv')  # pd = parent directory of this script

try:
    csv = Datasheet.load(PATH)
    seen = set(csv.columns[0])
except Exception:
    csv = Datasheet()
    seen = set()

for genre, url in feeds.items():
    for r in Newsfeed().search(url, cached=False):
        if r.url not in seen:
            print r.title
            print
            try:
                src = URL(r.url).download(cached=True)
                dom = DOM(src)
                txt = []

                # Daily Star has untidy HTML markup.
                # Collect the article <p> by <p>.
                for p in dom('.story-content p'):
                    if p.parent.tag == 'blockquote':
                        continue
                    s = plaintext(p)
                    s = s.strip()
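                    if s:
                        txt.append(s)

                # The excerpt ends above; a hedged sketch of the usual wrap-up.
                # The row layout is an assumption (url first, to match the
                # `seen` set loaded from the first column earlier):
                csv.append([r.url, genre, r.title, ' '.join(txt)])
                seen.add(r.url)
                csv.save(PATH)
            except Exception:
                continue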
Example #5
File: crawl.py Project: LeonieVS/thesis
    (-1, 'ejbron.wordpress.com'):
    'https://ejbron.wordpress.com/feed/'
}

PATH = pd('news.csv')

try:
    csv = Datasheet.load(PATH)
    seen = set(csv.columns[-2])  # use url as id
except Exception:
    csv = Datasheet()
    seen = set()

for (label, name), url in sources.items():
    try:
        f = Newsfeed()
        f = f.search(url, cached=False)
    except Exception:
        continue

    for r in f:

        # 1) Download source & parse the HTML tree:
        try:
            src = URL(r.url).download(cached=True)
            dom = DOM(src)
        except Exception:
            continue

        # 2) Find article text w/ CSS selectors:
        for selector in (
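                # The excerpt cuts off inside this tuple; the lines below are a
                # hedged guess at the fallback chain. The selector strings are
                # assumptions, not the project's actual values.
                'article p',
                '.post-content p',
                'p'):
            s = ' '.join(plaintext(p.content) for p in dom(selector))
            if s:
                break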
Example #6
File: myrss.py Project: vitojph/myrss
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from flask import Flask
from flask import render_template
from pattern.web import Newsfeed, plaintext
from alchemyapi import AlchemyAPI

app = Flask(__name__)
reader = Newsfeed()
alchemyapi = AlchemyAPI()

RSS_LIST = [
  (u"Lifehacker", "http://feeds.gawker.com/lifehacker/vip"),
  (u"The Verge", "http://www.theverge.com/rss/index.xml"),
  (u"Naukas", "http://naukas.com/feed/"),
  (u"Zen Habits", "http://feeds.feedburner.com/zenhabits?format=xml"),
  (u"Yuri", "http://www.lapizarradeyuri.com/feed/"),
  (u"Menéame", "http://www.meneame.net/rss")
]

items = []

for feed in RSS_LIST:
  feedlist = []
  for result in reader.search(feed[1])[:10]:
    clean_text = plaintext(result.text)
    response = alchemyapi.entities("text", clean_text)

    entities = []
    for entity in response["entities"]:
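      # The excerpt ends here; a hedged guess at the loop body. The "text"
      # field is assumed to hold the entity string, per the AlchemyAPI SDK's
      # usual response format:
      entities.append(entity["text"])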
Example #7
# techniques like Artificial Intelligence (AI), Machine Learning (ML), mathematical functions, and
# statistical algorithms.
# Pattern is a web mining module for the Python programming language.
# It has tools for data mining (Google, Twitter and Wikipedia API, a web crawler, an HTML DOM parser), natural
# language processing (part-of-speech taggers, n-gram search, sentiment analysis, WordNet), machine learning
# (vector space model, clustering, SVM), network analysis and <canvas> visualization.
# A simple web mining technique.

from pattern.web import Newsfeed, plaintext
from pattern.db import date
from pattern.vector import Model, Document, LEMMA

news, url = {}, 'http://news.google.com/news?output=rss'

for story in Newsfeed().search(url, cached=False):

    d = str(date(story.date, format='%Y-%m-%d'))
    s = plaintext(story.description)

    # Each key in the news dictionary is a date: news is grouped per day.
    # Each value is a dictionary of id => story items.
    # We use hash(story.description) as a unique id to avoid duplicate content.

    news.setdefault(d, {})[hash(s)] = s

# Your code will probably have some preprocessing steps to save and load the mined news updates.
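# For example, a minimal save/load sketch with the standard pickle module
# (the filename is an assumption):
#
#   import pickle
#   with open('news.pkl', 'wb') as f:  # save the mined updates between runs
#       pickle.dump(news, f)
#   with open('news.pkl', 'rb') as f:  # reload them on the next run
#       news = pickle.load(f)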

m = Model()

for day, stories in news.items():
    s = stories.values()
    s = ' '.join(s).lower()
    # Each day of news is a single document.
    # By adding all documents to a model we can calculate tf-idf.
    m.append(Document(s, stemmer=LEMMA, exclude=['news', 'day'], name=day))

for document in m:
    print document.name
    print document.keywords(top=10)

# A separate snippet: reading a single newsfeed (The Wall Street Journal).
import os, sys; sys.path.append(os.path.join("..", "..", ".."))

from pattern.web   import Newsfeed, plaintext, URL
from pattern.table import date

wsj    = "http://online.wsj.com/xml/rss/3_7014.xml"

engine = Newsfeed()

for result in engine.search(wsj, cached=True):
    print result.title.upper()
    print plaintext(result.description) # Remove HTML formatting.
    print result.url
    print result.date
    print