class FeaturesDBTestCase(unittest.TestCase):

    def setUp(self):
        self.t = self.assertTrue
        self.inst = self.assertIsInstance
        self.fts_db = Features(host="localhost", port=27017,
                               database="reddit_stream", collection="features")

    def tearDown(self):
        pass

    def test_convert_list_to_tuples(self):
        for doc in self.fts_db.find({"subreddit": "UkrainianConflict"},
                                    to_tuples=True, field="bigrams"):
            print(doc)
class AutoClassifierTestCase(unittest.TestCase):

    def setUp(self):
        self.t = self.assertTrue
        self.inst = self.assertIsInstance
        self.ft_db = Features(host="localhost", port=27017,
                              database="reddit_stream", collection="features")
        self.source = AnnotatedSource(host="localhost", port=27017,
                                      database="reddit_stream", collection="big_combined")

    def tearDown(self):
        pass

    def test_classifier(self):
        bgram_doc = list(self.ft_db.find({"subreddit": "UkrainianConflict"},
                                         to_tuples=True, field="bigrams"))[0]
        allbgram_doc = list(self.ft_db.find({"subreddit": "all"},
                                            to_tuples=True, field="bigrams"))[0]
        pos_fts = {d[0]: True for d in bgram_doc["bigrams"]}
        neu_fts = {d[0]: True for d in allbgram_doc["bigrams"]}

        ukr = []
        neu = []
        # Keep only the features that also appear in the subreddit's bigram set.
        for doc, fts in self.source.find_ft({"subreddit": "UkrainianConflict"}):
            nomore = []
            for key in fts.keys():
                if key not in pos_fts:
                    nomore.append(key)
            for n in nomore:
                del fts[n]
            if len(fts.keys()) > 0:
                ukr.append(fts)

        # Unlabelled documents act as the unlabeled set for the positive-only classifier.
        for doc, fts in self.source.find_ft(limit=6000):
            neu.append(fts)

        nvb = PositiveNaiveBayesClassifier.train(ukr, neu)
        for doc, fts in self.source.find_ft(skip=6000, limit=10):
            print(nvb.classify(fts))
        nvb.show_most_informative_features()
        # ukr = []
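
# The following helper is not part of the original tests; it is a minimal, hedged sketch of
# the NLTK API used in AutoClassifierTestCase. PositiveNaiveBayesClassifier.train() takes a
# list of positive featuresets and a list of unlabeled featuresets (boolean feature dicts);
# the toy bigram features below are invented purely for illustration.
def _positive_nb_sketch():
    from nltk.classify import PositiveNaiveBayesClassifier

    positive = [{("ukraine", "conflict"): True}, {("ceasefire", "talks"): True}]
    unlabeled = [{("cat", "videos"): True}, {("ukraine", "conflict"): True},
                 {("stock", "market"): True}]

    clf = PositiveNaiveBayesClassifier.train(positive, unlabeled)
    # classify() returns True for featuresets judged to belong to the positive class.
    return clf.classify({("ukraine", "conflict"): True})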
class SubredditClassifierTestCase(unittest.TestCase):

    def setUp(self):
        self.t = self.assertTrue
        self.inst = self.assertIsInstance
        self.feature = Features(host="localhost", port=27017,
                                database="reddit_stream", collection="features")
        self.source = AnnotatedSource(host="localhost", port=27017,
                                      database="reddit_stream", collection="big_combined")

    def tearDown(self):
        pass

    def test_bigram(self):
        bg = list(self.feature.find({"subreddit": "UkrainianConflict"},
                                    to_tuples=True, field="bigrams"))[0]
        bg = [d[0] for d in bg["bigrams"]]
        # Individual words occurring in the subreddit's top bigrams.
        words = [d[0] for d in bg]
        words.extend([d[1] for d in bg])
        words = list(set(words))
        # print(words)
        # print(bg)

        yay = []
        for doc, ft in self.source.find_ft({"subreddit": "UkrainianConflict"}, batch_size=1000):
            tups = ft.keys()
            # Words appearing in this document's bigram feature keys.
            the_words = list(set([d[0] for d in tups] + [d[1] for d in tups]))
            for word in words:
                ft["contains(" + word + ")"] = word in the_words
            # Drop every feature key that is not one of the subreddit's top bigrams.
            to_dump = []
            for key in ft.keys():
                if key not in bg:
                    to_dump.append(key)
            for dump in to_dump:
                del ft[dump]
            if len(ft.keys()) > 0:
                yay.append((ft, "UkrainianConflict"))

        for doc, ft in self.source.find_ft({}, limit=6000, batch_size=1000):
            yay.append((ft, "Not UkrainianConflict"))

        random.shuffle(yay)
        test_set, train_set = yay[int(len(yay) / 2):], yay[:int(len(yay) / 2)]
        classifier = nltk.NaiveBayesClassifier.train(train_set)
        classifier.show_most_informative_features()

        for doc, ft in self.source.find_ft({"subreddit": "news"}, skip=6000, batch_size=1000):
            if classifier.classify(ft) == "UkrainianConflict":
                print("YAY", doc)
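
# Not from the original code: a minimal sketch of the plain NaiveBayesClassifier workflow that
# SubredditClassifierTestCase follows, i.e. train on (featureset, label) pairs and evaluate on
# a held-out split with nltk.classify.accuracy. The toy featuresets are invented for
# illustration only.
def _subreddit_nb_sketch():
    import random
    import nltk

    labeled = [({"contains(ukraine)": True, "contains(russia)": True}, "UkrainianConflict"),
               ({"contains(ukraine)": True}, "UkrainianConflict"),
               ({"contains(cat)": True}, "Not UkrainianConflict"),
               ({"contains(stocks)": True}, "Not UkrainianConflict")] * 10

    random.shuffle(labeled)
    half = len(labeled) // 2
    train_set, test_set = labeled[:half], labeled[half:]

    classifier = nltk.NaiveBayesClassifier.train(train_set)
    classifier.show_most_informative_features()
    # accuracy() compares the classifier's predictions against the held-out labels.
    return nltk.classify.accuracy(classifier, test_set)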
from rdt.data.mongo.features import Features
import sys

if __name__ == "__main__":
    subreddit = sys.argv[1]
    fts = Features(host="localhost", port=27017,
                   database="reddit_stream", collection="features")
    bgrams = list(fts.find({"subreddit": subreddit}))[0]["bigrams"]

    # Interactively prune unwanted bigrams, then optionally write the result back.
    while 1:
        print(bgrams)
        print("what do you want to remove?")
        word1 = input("enter the first word: ")
        word2 = input("enter the second word: ")
        # Remove the bigram whose first and second words match the input.
        bgrams = list(filter(lambda x: not (x[0][0] == word1 and x[0][1] == word2), bgrams))
        action = input("(w)rite, (q)uit, (c)ontinue: ")
        if action == "w":
            fts.upsert({"subreddit": subreddit}, {"bigrams": bgrams})
        if action == "q":
            break