Example #1
def feeds_to_trends(feeds):
    for url in feeds:
        url = url['feed_url']
        news = {}
        try:
            for story in Newsfeed().search(url, cached=False):
                d, s = datetext(story.date, story.description)

                # Each key in the news dictionary is a date: news is grouped per day.
                # Each value is a dictionary of id => story items.
                # We use hash(story.description) as a unique id to avoid duplicate
                # content.
                news.setdefault(d, {})[hash(s)] = s

            m = Model()
            for date, stories in news.items():
                s = stories.values()
                s = ' '.join(s).lower()
                # Each day of news is a single document.
                # By adding all documents to a model we can calculate tf-idf.
                m.append(Document(s, stemmer=LEMMA, exclude=['news', 'day'], name=date))

            for document in m:
                print document.name
                print document.keywords(top=10)
        except HTTP404NotFound:
            print url
            pass
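The datetext() helper used above is not shown in the snippet. A minimal sketch of what it presumably does, modeled on the date()/plaintext() calls in Example #10 below (the helper's name and exact behavior are assumptions):

from pattern.db import date
from pattern.web import plaintext

def datetext(story_date, story_description):
    # Group stories by calendar day and strip HTML markup from the description.
    d = str(date(story_date, format='%Y-%m-%d'))
    s = plaintext(story_description)
    return d, s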
Example #2
def articles_to_trends(articles):
    news = {}
    for story in articles:
        if story['added_at']:
            article_text = get_article_text(story['url'])
            d, s = timestamptext(story['added_at'], article_text)

            # Each key in the news dictionary is a date: news is grouped per day.
            # Each value is a dictionary of id => story items.
            # We use hash(story['summary']) as a unique id to avoid duplicate
            # content.
            news.setdefault(d, {})[hash(s)] = s

    m = Model()
    for date, stories in news.items():
        s = stories.values()
        s = ' '.join(s).lower()
        # Each day of news is a single document.
        # By adding all documents to a model we can calculate tf-idf.
        m.append(Document(s, stemmer=LEMMA, exclude=['news', 'day'], name=date))

    for document in m:
        print document.name
        print document.keywords(top=10)
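Here, timestamptext() and get_article_text() are likewise project helpers that are not shown. A rough sketch of the former, assuming added_at holds a Unix timestamp (the name and behavior are assumptions):

from pattern.db import date
from pattern.web import plaintext

def timestamptext(added_at, text):
    # added_at is assumed to be a Unix timestamp; group by calendar day.
    d = str(date(added_at, format='%Y-%m-%d'))
    s = plaintext(text)
    return d, s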
Example #3
File: ftes.py  Project: kqdtran/FTES
def bag_of_words_tfidf(lst):
    '''
    Constructs a bag-of-words model where each document is a Facebook post/comment.
    Also applies TF-IDF weighting and lemmatization, and filters out stopwords.
    '''
    model = Model(documents=[], weight=TFIDF)
    for msg, link in lst:
        doc = Document(msg, stemmer=LEMMA, stopwords=True, name=msg, description=link)
        model.append(doc)
    return model
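One way to use the returned model is to look up the posts most similar to a query; a small usage sketch with made-up data (the posts list and query text are hypothetical):

from pattern.vector import Document, LEMMA

posts = [('I love the new campus cafe', 'http://example.com/1'),
         ('Midterm season is exhausting', 'http://example.com/2')]  # hypothetical data

model = bag_of_words_tfidf(posts)

# Model.neighbors() yields (similarity, Document) pairs, most similar first.
query = Document('campus food', stemmer=LEMMA, stopwords=True)
for similarity, doc in model.neighbors(query, top=3):
    print similarity, doc.name, doc.description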
Example #4
def runTFIDF():
  """Given a list of classes, construct a Vector-Space Model.
  We only need to do it once and save it to a pickle file for
  fast loading later on"""

  model = Model(documents=[], weight=TFIDF)
  for r, d, files in os.walk("project/data/"):
    for f in files:
      if f.endswith(".txt"):
        text = readFile(f)
        doc = Document(text, stemmer=LEMMA, stopwords=True, name=f.replace(".txt", ""))
        model.append(doc)
  model.save("project/pickle/course.pic")
Example #5
def runTFIDFOnSchedule(term=util.currentTerm, year=util.currentYear):
    """
    Given a list of classes, construct a Vector-Space model
    and apply the TFIDF algorithm to measure similarity between courses
    """

    model = Model(documents=[], weight=TFIDF)
    print "Loading from pickle file..."
    allCourses = loadAllCoursesInTerm()
    print "Begin constructing the Vector Space model"

    for course in allCourses:
        text = course.title + " " + course.description
        doc = Document(text, stemmer=LEMMA, stopwords=True, name=course.title,\
                                     description=course)
        model.append(doc)
    print "Finish processing!!!"
    with open("pickle/simCourses" + term + year + ".pickle", "w") as f:
        dump(model, f, 0)
    return model
Example #6
def runTFIDFOnCatalog(term=util.currentTerm, year=util.currentYear):
    """
    Given a dictionary of courses, construct a Vector-Space model
    and apply the TFIDF algorithm to measure similarity between courses.

    We only need to do it once and save it to a pickle file for
    fast loading later on
    """

    model = Model(documents=[], weight=TFIDF)
    print "Loading from pickle file..."
    allCoursesDict = loadCourseCatalog()

    for dept in allCoursesDict:
        print "Processing department", dept
        for course in allCoursesDict[dept]:
            text = course.title + " " + course.description
            doc = Document(text, stemmer=LEMMA, stopwords=True, name=course.title,\
                                         description=course)
            model.append(doc)
        print "Finish processing", dept, "\n"
    with open("pickle/simCatalog" + term + year + ".pickle", "w") as f:
        dump(model, f, 0)
    return model
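A sketch of how the pickled catalog model might be queried for related courses afterwards (the file name is illustrative):

from pickle import load

with open("pickle/simCatalogFall2014.pickle") as f:  # illustrative file name
    model = load(f)

# Courses most similar to the first course in the model, by TF-IDF cosine similarity.
doc = model.documents[0]
for similarity, other in model.neighbors(doc, top=5):
    print round(similarity, 3), other.name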
Example #7
File: 04-KNN.py  Project: ADA110/Cibus
from pattern.web import Twitter
from pattern.en import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Model, KNN

m = Model()
t = Twitter()

# First, we mine a model of roughly 1,000 tweets.
# We'll use the hashtags as the document type (label).
for page in range(1, 10):
    for tweet in t.search('#win OR #fail', start=page, count=100, cached=True):
        # If the tweet contains the #win hashtag, we'll set its type to 'WIN':
        s = tweet.text.lower()  # tweet in lowercase
        p = '#win' in s and 'WIN' or 'FAIL'  # document labels
        s = Sentence(parse(s))  # parse tree with part-of-speech tags
        s = search('JJ', s)  # adjectives in the tweet
        s = [match[0].string for match in s]  # adjectives as a list of strings
        s = " ".join(s)  # adjectives as string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))

# Train k-Nearest Neighbor on the model.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know whether your classifier is working correctly
# is to test it with held-out testing data; see the documentation for Classifier.test().
classifier = KNN(baseline=None)  # By default, baseline=MAJORITY
for document in m:  # (classify unknown documents with the most frequent type).
    classifier.train(document)

# These are the adjectives the classifier has learned:
print sorted(classifier.features)
print
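Once trained, the classifier can label unseen text by its adjectives; a small usage sketch continuing from the snippet above (the example phrases are made up):

# Classify a couple of made-up phrases by the adjectives they contain.
for text in ('what a glorious sunny day', 'horrible slow delayed train'):
    s = Sentence(parse(text.lower()))
    s = ' '.join(match[0].string for match in search('JJ', s))
    print text, '->', classifier.classify(Document(s, stemmer=None))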

Example #9
from pattern.web import Twitter
from pattern.en import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Model, KNN

m = Model()
t = Twitter()

# First, we mine a model of roughly 1,000 tweets.
# We'll use the hashtags as the document type (label).
for page in range(1, 10):
    for tweet in t.search('#win OR #fail', start=page, count=100, cached=True):
        # If the tweet contains the #win hashtag, we'll set its type to 'WIN':
        s = tweet.text.lower()               # tweet in lowercase
        p = '#win' in s and 'WIN' or 'FAIL'  # document labels
        # parse tree with part-of-speech tags
        s = Sentence(parse(s))
        s = search('JJ', s)                  # adjectives in the tweet
        s = [match[0].string for match in s]  # adjectives as a list of strings
        s = " ".join(s)                      # adjectives as string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))

# Train k-Nearest Neighbor on the model.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know whether your classifier is working correctly
# is to test it with held-out testing data; see the documentation for Classifier.test().
classifier = KNN(baseline=None)  # By default, baseline=MAJORITY
# (classify unknown documents with the most frequent type).
for document in m:
    classifier.train(document)

# These are the adjectives the classifier has learned:
print(sorted(classifier.features))
print()
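The trained classifier can also be pickled for reuse; a short sketch, assuming pattern's Classifier.save()/load() helpers and an illustrative path:

# Persist the trained KNN so the tweets do not have to be mined again.
classifier.save('knn-win-fail.pic')  # illustrative path
classifier = KNN.load('knn-win-fail.pic')
print(classifier.classify(Document('awesome', stemmer=None)))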
Example #10
from pattern.web import Newsfeed, plaintext
from pattern.db import date
from pattern.vector import Document, Model, LEMMA

news, url = {}, 'http://news.google.com/news?output=rss'

for story in Newsfeed().search(url, cached=False):

    d = str(date(story.date, format='%Y-%m-%d'))
    s = plaintext(story.description)

    # Each key in the news dictionary is a date: news is grouped per day.
    # Each value is a dictionary of id => story items.
    # We use hash(story.description) as a unique id to avoid duplicate content.

    news.setdefault(d, {})[hash(s)] = s

# Your code will probably have some preprocessing steps to save and load the
# mined news updates (see the sketch after this example).

m = Model()

for date, stories in news.items():
    s = stories.values()
    s = ' '.join(s).lower()

    # Each day of news is a single document.
    # By adding all documents to a model we can calculate tf-idf.

    m.append(Document(s, stemmer=LEMMA, exclude=['news', 'day'], name=date))

for document in m:

    print document.name
    print document.keywords(top=10)
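For the save/load preprocessing step mentioned in the comment above, plain JSON is one option; a minimal sketch (the file name is illustrative):

import json

# Persist the mined stories so repeated runs can skip the RSS download.
# Note: JSON turns the integer hash keys into strings, which is fine here.
with open('news.json', 'w') as f:  # illustrative file name
    json.dump(news, f)

with open('news.json') as f:
    news = json.load(f)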
Example #11
import pymongo
import cPickle as pickle

from pattern.vector import Model, Document, TFIDF, L2

con = pymongo.MongoClient()
sentiment_res = con.tweets.sentiment_analysis
sentiment_res_p = con.tweets.patterns_sentiment_analysis
tweets = con.tweets.tweets_toronto

docs = []
# with open('D:\\data\\documents.spkl', 'wb') as fp:
#     for tweet in tweets.find():
#         doc = Document(tweet['text'],name=tweet['id'])
#         pickle.dump(doc, fp)
#     fp.close()
#

m = Model(documents=[], weight=TFIDF)

print 'Loading documents from pickle...'
with open('D:\\data\\documents.spkl', 'rb') as fp:
    # One pickled Document was written per tweet, so read tweets.count() of them.
    for j in xrange(tweets.count()):
        m.append(pickle.load(fp))
print len(m.documents), 'documents loaded'
m.reduce(dimensions=L2)
m.save('D:\\data\\model.pic')  # Model.save() needs a destination path; this one is illustrative.