from pattern.web import Newsfeed, HTTP404NotFound
from pattern.vector import Model, Document, LEMMA


def feeds_to_trends(feeds):
    for url in feeds:
        url = url['feed_url']
        news = {}
        try:
            for story in Newsfeed().search(url, cached=False):
                d, s = datetext(story.date, story.description)
                # Each key in the news dictionary is a date: news is grouped per day.
                # Each value is a dictionary of id => story items.
                # We use hash(s), a hash of the story text, as a unique id
                # to avoid duplicate content.
                news.setdefault(d, {})[hash(s)] = s
            m = Model()
            for date, stories in news.items():
                # Each day of news is a single document.
                # By adding all documents to a model we can calculate tf-idf.
                s = ' '.join(stories.values()).lower()
                m.append(Document(s, stemmer=LEMMA, exclude=['news', 'day'], name=date))
            for document in m:
                print document.name
                print document.keywords(top=10)
        except HTTP404NotFound:
            print url
def articles_to_trends(articles):
    news = {}
    for story in articles:
        if story['added_at']:
            article_text = get_article_text(story['url'])
            d, s = timestamptext(story['added_at'], article_text)
            # Each key in the news dictionary is a date: news is grouped per day.
            # Each value is a dictionary of id => story items.
            # We use hash(s), a hash of the article text, as a unique id
            # to avoid duplicate content.
            news.setdefault(d, {})[hash(s)] = s
    m = Model()
    for date, stories in news.items():
        # Each day of news is a single document.
        # By adding all documents to a model we can calculate tf-idf.
        s = ' '.join(stories.values()).lower()
        m.append(Document(s, stemmer=LEMMA, exclude=['news', 'day'], name=date))
    for document in m:
        print document.name
        print document.keywords(top=10)
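# The helpers datetext(), timestamptext() and get_article_text() used above are
# not defined in these snippets. A minimal sketch of what they might look like,
# assuming pattern's date/plaintext/URL utilities (the originals may differ):
from pattern.db import date
from pattern.web import URL, plaintext

def datetext(d, description):
    # Normalize a feed story to a (YYYY-MM-DD, plain text) pair.
    return str(date(d, format='%Y-%m-%d')), plaintext(description)

def timestamptext(timestamp, text):
    # Same normalization for stories stored with a raw timestamp.
    return str(date(timestamp, format='%Y-%m-%d')), plaintext(text)

def get_article_text(url):
    # Download the article body as plain text; a real implementation
    # would also strip navigation and other boilerplate.
    return plaintext(URL(url).download(cached=True))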
def bag_of_words_tfidf(lst):
    '''Constructs a bag-of-words model where each document is a Facebook
    post/comment. Also applies tf-idf weighting, lemmatization, and stopword
    filtering.
    '''
    model = Model(documents=[], weight=TFIDF)
    for msg, link in lst:
        # Note: pattern excludes stop words by default (stopwords=False);
        # passing stopwords=True would *keep* them.
        doc = Document(msg, stemmer=LEMMA, stopwords=False, name=msg,
                       description=link)
        model.append(doc)
    return model
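# A quick illustration of how bag_of_words_tfidf() might be used; the
# (message, link) pairs here are made-up placeholders:
posts = [
    ('I love the new timeline design', 'http://example.com/1'),
    ('The new timeline design is awful', 'http://example.com/2'),
]
m = bag_of_words_tfidf(posts)
d1, d2 = m.documents
print m.similarity(d1, d2)  # cosine similarity under tf-idf weighting
print d1.keywords(top=5)    # highest-weighted words in the first post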
import os


def runTFIDF():
    """Walk the data directory and construct a Vector-Space Model from its
    .txt files. We only need to do this once and save it to a pickle file
    for fast loading later on."""
    model = Model(documents=[], weight=TFIDF)
    for r, d, files in os.walk("project/data/"):
        for f in files:
            if f.endswith(".txt"):
                text = readFile(f)
                doc = Document(text, stemmer=LEMMA, stopwords=False,
                               name=f.replace(".txt", ""))
                model.append(doc)
    model.save("project/pickle/course.pic")
from cPickle import dump


def runTFIDFOnSchedule(term=util.currentTerm, year=util.currentYear):
    """Given the courses scheduled in a term, construct a Vector-Space model
    with tf-idf weighting so that similarity between courses can be measured."""
    model = Model(documents=[], weight=TFIDF)
    print "Loading from pickle file..."
    allCourses = loadAllCoursesInTerm()
    print "Begin constructing the Vector Space model"
    for course in allCourses:
        text = course.title + " " + course.description
        doc = Document(text, stemmer=LEMMA, stopwords=False,
                       name=course.title, description=course)
        model.append(doc)
    print "Finished processing!"
    with open("pickle/simCourses" + term + year + ".pickle", "w") as f:
        dump(model, f, 0)
    return model
def runTFIDFOnCatalog(term=util.currentTerm, year=util.currentYear):
    """Given a dictionary of courses keyed by department, construct a
    Vector-Space model with tf-idf weighting so that similarity between
    courses can be measured. We only need to do this once and save it to
    a pickle file for fast loading later on."""
    model = Model(documents=[], weight=TFIDF)
    print "Loading from pickle file..."
    allCoursesDict = loadCourseCatalog()
    for dept in allCoursesDict:
        print "Processing department", dept
        for course in allCoursesDict[dept]:
            text = course.title + " " + course.description
            doc = Document(text, stemmer=LEMMA, stopwords=False,
                           name=course.title, description=course)
            model.append(doc)
        print "Finished processing", dept, "\n"
    with open("pickle/simCatalog" + term + year + ".pickle", "w") as f:
        dump(model, f, 0)
    return model
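# Once a catalog model has been pickled as above, it can be loaded back and
# queried for the most similar courses. A sketch, assuming the same term/year
# naming convention; similar_courses() is a hypothetical helper:
from cPickle import load

def similar_courses(title, term=util.currentTerm, year=util.currentYear, top=5):
    with open("pickle/simCatalog" + term + year + ".pickle") as f:
        model = load(f)
    # Look up the document for the given course title...
    doc = [d for d in model.documents if d.name == title][0]
    # ...and rank the other courses by cosine similarity.
    return model.neighbors(doc, top=top)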
from pattern.web import Twitter
from pattern.en import parse, Sentence
from pattern.search import search
from pattern.vector import Model, Document, KNN

m = Model()
t = Twitter()
# First, we mine a model of roughly 1,000 tweets (9 pages of 100).
# We'll use hashtags as type.
for page in range(1, 10):
    for tweet in t.search('#win OR #fail', start=page, count=100, cached=True):
        # If the tweet contains the #win hashtag, we'll set its type to 'WIN':
        s = tweet.text.lower()                # tweet in lowercase
        p = 'WIN' if '#win' in s else 'FAIL'  # document label
        s = Sentence(parse(s))                # parse tree with part-of-speech tags
        s = search('JJ', s)                   # adjectives in the tweet
        s = [match[0].string for match in s]  # adjectives as a list of strings
        s = " ".join(s)                       # adjectives as a string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))

# Train k-Nearest Neighbor on the model.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with testing data; see the documentation for Classifier.test().
classifier = KNN(baseline=None)  # By default, baseline=MAJORITY (classify
for document in m:               # unknown documents with the most frequent type).
    classifier.train(document)

# These are the adjectives the classifier has learned:
print sorted(classifier.features)
print
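# With the classifier trained, an unseen tweet can be labeled by featurizing
# it the same way: extract its adjectives, then classify. A minimal sketch
# (the example text is made up):
s = 'what a wonderful sunny day'
s = Sentence(parse(s.lower()))
s = ' '.join(match[0].string for match in search('JJ', s))
print classifier.classify(Document(s, stemmer=None))  # 'WIN' or 'FAIL'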
from pattern.web import Newsfeed, plaintext
from pattern.db import date
from pattern.vector import Model, Document, LEMMA

news, url = {}, 'http://news.google.com/news?output=rss'
for story in Newsfeed().search(url, cached=False):
    d = str(date(story.date, format='%Y-%m-%d'))
    s = plaintext(story.description)
    # Each key in the news dictionary is a date: news is grouped per day.
    # Each value is a dictionary of id => story items.
    # We use hash(story.description) as a unique id to avoid duplicate content.
    news.setdefault(d, {})[hash(s)] = s

# Your code will probably have some preprocessing steps to save and load
# the mined news updates (see the sketch below).
m = Model()
for day, stories in news.items():
    # Each day of news is a single document.
    # By adding all documents to a model we can calculate tf-idf.
    s = ' '.join(stories.values()).lower()
    m.append(Document(s, stemmer=LEMMA, exclude=['news', 'day'], name=day))

for document in m:
    print document.name
    print document.keywords(top=10)
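# A sketch of the save/load preprocessing mentioned above, so that repeated
# runs accumulate news across days; the cache path is a made-up example:
import os
from cPickle import dump, load

NEWS_FILE = 'news.pickle'

def load_news():
    if os.path.exists(NEWS_FILE):
        with open(NEWS_FILE, 'rb') as f:
            return load(f)
    return {}

def save_news(news):
    with open(NEWS_FILE, 'wb') as f:
        dump(news, f)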
import cPickle as pickle
import pymongo
from pattern.vector import Model, Document, TFIDF, L2

con = pymongo.MongoClient()
sentiment_res = con.tweets.sentiment_analysis
sentiment_res_p = con.tweets.patterns_sentiment_analysis
tweets = con.tweets.tweets_toronto

# One-time preprocessing: serialize each tweet as a pattern Document.
# with open('D:\\data\\documents.spkl', 'wb') as fp:
#     for tweet in tweets.find():
#         doc = Document(tweet['text'], name=tweet['id'])
#         pickle.dump(doc, fp)

m = Model(documents=[], weight=TFIDF)
print 'Loading model'
with open('D:\\data\\documents.spkl', 'rb') as fp:
    for j in xrange(tweets.count()):  # one pickled Document per tweet
        m.append(pickle.load(fp))
print len(m.documents)

m.reduce(dimensions=L2)         # latent semantic analysis
m.save('D:\\data\\model.spkl')  # persist the tf-idf/LSA model
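# Once saved, the model can be restored without re-reading the per-tweet
# pickles; a sketch, assuming the save path used above:
from pattern.vector import Model

m = Model.load('D:\\data\\model.spkl')
print len(m.documents)
# Documents can now be compared in the reduced (LSA) space:
d1, d2 = m.documents[0], m.documents[1]
print m.similarity(d1, d2)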