# Interactive tool for pruning the stored bigram list of one subreddit.
# Invoke with the subreddit name as the only command-line argument.
import sys


def remove_bigram(bigrams, word1, word2):
    """Return a new list of *bigrams* without the entries matching the pair.

    Each entry is read as ``entry[0][0]`` (first word) and ``entry[0][1]``
    (second word); whatever else the entry holds is carried along untouched.

    An empty ``word1`` or ``word2`` matches any word in that position, so a
    single word can still be used to drop every bigram that starts (or ends)
    with it.  When both are empty nothing is removed.
    """
    if not word1 and not word2:
        return list(bigrams)

    def matches(entry):
        first, second = entry[0][0], entry[0][1]
        # Both positions must match; the previous test removed an entry as
        # soon as *either* word matched, deleting unrelated bigrams.
        return (not word1 or first == word1) and (not word2 or second == word2)

    return [entry for entry in bigrams if not matches(entry)]


def edit_bigrams(subreddit):
    """Run the remove / write / quit prompt loop for *subreddit*'s bigrams.

    Loads the ``"bigrams"`` field of the first features document whose
    ``"subreddit"`` equals *subreddit*, lets the user strip entries, and
    upserts the edited list back whenever the user answers ``w``.
    Exits with a message when no such document is found.
    """
    # Imported here rather than at module level so that remove_bigram can be
    # imported and exercised without the rdt package being available.
    from rdt.data.mongo.features import Features

    fts = Features(host="localhost", port=27017,
                   database="reddit_stream", collection="features")
    docs = list(fts.find({"subreddit": subreddit}))
    if not docs:
        sys.exit("no features document found for subreddit {!r}".format(subreddit))
    bgrams = docs[0]["bigrams"]

    while True:
        print(bgrams)
        print("what do you want remove?")
        word1 = input("enter the first word: ")
        word2 = input("enter the second word: ")
        bgrams = remove_bigram(bgrams, word1, word2)

        action = input("(w)rite, (q)uit, (c)ontinue")
        if action == "w":
            # Writing does not end the session; the loop keeps going.
            fts.upsert({"subreddit": subreddit}, {"bigrams": bgrams})
        elif action == "q":
            break
        # Any other answer (including "c") simply continues.


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: {} SUBREDDIT".format(sys.argv[0]))
    edit_bigrams(sys.argv[1])
import sys

import nltk
from nltk.corpus import stopwords

import rdt.job as job
from rdt.data.mongo.features import Features

# Tokens filtered out in addition to NLTK's English stop-word list.
EXTRA_STOP_TOKENS = ['-', 'https', '%', '[', ']', "''", "``", '--', "'s",
                     ",", ".", "(", ")", ":", "n't", "?", "!"]


def store_all_bigrams():
    """Collect bigram counts from the combined corpus and save them.

    Reads up to 6000 items from the ``big_combined`` collection through
    ``job.AnnotatedSource``, builds a bigram collocation finder over the
    resulting words, filters it, and upserts the finder's ``ngram_fd`` items
    into the ``features`` collection under ``{"subreddit": "all"}``.
    """
    # A set keeps the per-word membership test in the filter below cheap.
    stop_tokens = set(stopwords.words('english')) | set(EXTRA_STOP_TOKENS)

    ft_db = Features(host="localhost", port=27017,
                     database="reddit_stream", collection="features")
    # Named `source` so the imported `job` module is not rebound.
    source = job.AnnotatedSource(host="localhost", port=27017,
                                 database="reddit_stream",
                                 collection="big_combined")

    words = source.to_words({}, remove_stopwords=True, limit=6000)
    finder = source.bigram_collocation_finder(words)
    # NOTE(review): assumes the finder follows NLTK's collocation-finder API,
    # where this drops bigrams seen fewer than 4 times - confirm.
    finder.apply_freq_filter(4)
    finder.apply_word_filter(lambda w: w in stop_tokens)

    # Order by descending second element (presumably the count), breaking
    # ties on the first element (the bigram itself).
    ranked = sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))
    ft_db.upsert({"subreddit": "all"}, {"bigrams": ranked})


if __name__ == "__main__":
    store_all_bigrams()