def test_cosine_similarity(self):
    """Document cosine similarity, before and after LSA reduction."""
    sim = self.model.similarity
    s01 = sim(self.model[0], self.model[1])
    s02 = sim(self.model[0], self.model[2])
    s0x = sim(self.model[0], vector.Document("cats cats"))
    self.assertAlmostEqual(s01, 0.20, places=2)
    self.assertAlmostEqual(s02, 0.00, places=2)
    self.assertAlmostEqual(s0x, 0.45, places=2)
    # After reducing to two concepts, similarity() must use the LSA vectors.
    self.model.reduce(2)
    s01 = sim(self.model[0], self.model[1])
    s02 = sim(self.model[0], self.model[2])
    self.assertAlmostEqual(s01, 1.00, places=2)
    self.assertAlmostEqual(s02, 0.00, places=2)
    self.model.lsa = None
    print("pattern.vector.Model.similarity()")
def test_information_gain(self):
    """Assert information-gain weights for a single binary feature.

    Example from
    http://www.comp.lancs.ac.uk/~kc/Lecturing/csc355/DecisionTrees_given.pdf
    """
    corpus = vector.Corpus([
        vector.Document({"wind": 1}, type=False),
        vector.Document({"wind": 0}, type=True),
        vector.Document({"wind": 0}, type=True),
        vector.Document({"wind": 0}, type=True),
        vector.Document({"wind": 1}, type=True),
        vector.Document({"wind": 1}, type=False),
        vector.Document({"wind": 1}, type=False)
    ])
    self.assertAlmostEqual(corpus.information_gain("wind"), 0.52, places=2)
    # print() call (not statement) for Python 2/3 compatibility;
    # also fixes the "patten" typo in the status message.
    print("pattern.vector.Corpus.information_gain()")
def get_all_artists_all_words_to_file():
    """Write the top keyword weights across all artist directories to a file.

    Each artist's name parts (directory name split on "-") are appended to
    ``rap_exclude_words`` so an artist's own name does not count as a keyword
    of their lyrics.  Output format per line: "<weight>\t<word>".
    """
    # `with` guarantees the file is closed even if keyword extraction raises.
    with open(common_words_file, "w") as f:
        docs = ""
        for artist_dir in os.listdir(basedir):  # avoid shadowing builtin `dir`
            print("artist" + artist_dir)
            if artist_dir != '.git':
                # Push the artist's name parts onto the exclude-words list.
                for name_part in re.split('-', artist_dir):
                    rap_exclude_words.append(name_part)
                docs += get_artist_docs(artist_dir)
        corpus = vec.Document(docs, exclude=rap_exclude_words, stop_words=False)
        for ln in corpus.keywords(top=20):
            # Newline added so each keyword lands on its own line in the file
            # (the original concatenated all entries into one line).
            f.write("%0.08f\t%s\n" % ln)
            print("%0.08f\t%s" % ln)
def test_cosine_similarity(self):
    """Assert document cosine similarity, and LSA-awareness of similarity().

    The LSA-reduction assertions are skipped when NumPy is unavailable.
    """
    v1 = self.model.similarity(self.model[0], self.model[1])
    v2 = self.model.similarity(self.model[0], self.model[2])
    v3 = self.model.similarity(self.model[0], vector.Document("cats cats"))
    self.assertAlmostEqual(v1, 0.20, places=2)
    self.assertAlmostEqual(v2, 0.00, places=2)
    self.assertAlmostEqual(v3, 0.45, places=2)
    # Assert that Model.similarity() is aware of LSA reduction.
    try:
        import numpy  # reduce() needs NumPy; probe before reducing.
        self.model.reduce(2)
        v1 = self.model.similarity(self.model[0], self.model[1])
        v2 = self.model.similarity(self.model[0], self.model[2])
        self.assertAlmostEqual(v1, 1.00, places=2)
        self.assertAlmostEqual(v2, 0.00, places=2)
        self.model.lsa = None
    except ImportError:
        # Python 3-compatible form; the bound exception was unused anyway.
        pass
def test_document_vector(self):
    """Assert Vector properties (copy, features, weights, norm, update)."""
    # Test copy.
    v = vector.Document("the cat sat on the mat").vector
    v = v.copy()
    # Test properties.
    self.assertTrue(isinstance(v, dict))
    self.assertTrue(isinstance(v, vector.Vector))
    self.assertTrue(isinstance(v.id, int))
    self.assertEqual(sorted(v.features), ["cat", "mat", "sat"])
    self.assertEqual(v.weight, vector.TF)
    self.assertAlmostEqual(v.norm, 0.58, places=2)
    self.assertAlmostEqual(v["cat"], 0.33, places=2)
    self.assertAlmostEqual(v["sat"], 0.33, places=2)
    self.assertAlmostEqual(v["mat"], 0.33, places=2)
    # Test copy + update (calling the vector returns an updated copy).
    v = v({"cat": 1, "sat": 1, "mat": 1})
    self.assertEqual(sorted(v.features), ["cat", "mat", "sat"])
    self.assertAlmostEqual(v["cat"], 1.00, places=2)
    self.assertAlmostEqual(v["sat"], 1.00, places=2)
    self.assertAlmostEqual(v["mat"], 1.00, places=2)
    # print() call (not statement) for Python 2/3 compatibility.
    print("pattern.vector.Document.vector")
def test_classifier_vector(self):
    """Assert Classifier._vector(), which normalizes train()/classify()
    input (Document, dict, list or string) to a (type, vector) pair."""
    v = vector.Classifier()._vector
    expected = ("cat", {"cat": 0.5, "purs": 0.5})
    self.assertEqual(expected, v(vector.Document("the cat purs", type="cat")))
    self.assertEqual(expected, v({"cat": 0.5, "purs": 0.5}, type="cat"))
    self.assertEqual(expected, v(["cat", "purs"], type="cat"))
    self.assertEqual(expected, v("cat purs", type="cat"))
    # print() call (not statement) for Python 2/3 compatibility.
    print("pattern.Classifier._vector()")
def test_information_gain(self):
    """Assert information gain and gain ratio weights.

    Binary-feature example from
    http://www.comp.lancs.ac.uk/~kc/Lecturing/csc355/DecisionTrees_given.pdf
    Continuous-feature example from
    http://rutcor.rutgers.edu/~amai/aimath02/PAPERS/14.pdf
    """
    m = vector.Model([
        vector.Document({"wind": 1}, type=False),
        vector.Document({"wind": 0}, type=True),
        vector.Document({"wind": 0}, type=True),
        vector.Document({"wind": 0}, type=True),
        vector.Document({"wind": 1}, type=True),
        vector.Document({"wind": 1}, type=False),
        vector.Document({"wind": 1}, type=False)],
        weight=None
    )
    self.assertAlmostEqual(m.information_gain("wind"), 0.52, places=2)
    m = vector.Model([
        vector.Document({"3": 1}, type=True),
        vector.Document({"3": 5}, type=True),
        vector.Document({"3": 1}, type=False),
        vector.Document({"3": 7}, type=True),
        vector.Document({"3": 2}, type=False),
        vector.Document({"3": 2}, type=True),
        vector.Document({"3": 6}, type=False),
        vector.Document({"3": 4}, type=True),
        vector.Document({"3": 0}, type=False),
        vector.Document({"3": 9}, type=True)],
        weight=None
    )
    self.assertAlmostEqual(m.ig("3"), 0.571, places=3)
    self.assertAlmostEqual(m.gr("3"), 0.195, places=3)
    # Fixed "patten" typo in both status messages.
    print("pattern.vector.Model.information_gain()")
    print("pattern.vector.Model.gain_ratio()")
def lsa_apply(df):
    """Build a TF-IDF model over df['abstract'] and return its
    2-concept LSA reduction."""
    print("Building model")
    documents = [pv.Document(abstract) for abstract in df['abstract']]
    model = pv.Model(documents, weight=pv.TFIDF)
    print("Returning reduction")
    return model.reduce(2)
def get_lsa(texts):
    """Return the 2-concept LSA reduction of a TF-IDF model over *texts*."""
    model = pv.Model([pv.Document(text) for text in texts], weight=pv.TFIDF)
    return model.reduce(2)
def lsa_apply(df):
    """Reduce a TF-IDF model of df['abstract'] to 2 LSA concepts."""
    documents = [pv.Document(text) for text in df['abstract']]
    model = pv.Model(documents, weight=pv.TFIDF)
    return model.reduce(2)
def create_models(group):
    """Return a TF-IDF model built from the items in *group*."""
    documents = []
    for item in group:
        documents.append(pv.Document(item, threshold=1))
    return pv.Model(documents, weight=pv.TFIDF)
def get():
    """Aggregate top keywords per boat from indexed posts and write them
    back to the "boats" Elasticsearch index.

    For every post, the top-10 keywords of its text are tallied per boat.
    Keywords that are filler words, single characters, part of the boat's
    own name, seen only once, or numeric are dropped before indexing.
    """
    res = es.search(index="posts", body={
        "query": {"match_all": {}},
        'size': 10000
    })
    # boat name -> Counter of keyword occurrences across that boat's posts.
    things = {}
    for item in res['hits']['hits']:
        raw_doc = item['_source']['source']
        original_boat_name = item['_source']['boat']
        doc = vector.Document(raw_doc, threshold=1, stopwords=False)
        things.setdefault(original_boat_name, collections.Counter())
        # keywords(top=10) yields (weight, word) pairs; tally the words.
        things[original_boat_name].update([i[1] for i in doc.keywords(top=10)])
    final = {}
    filterwords = ['boat', 'boats', 'sail', 'sailing', 'template']
    for boat_name, counts in things.items():
        print(boat_name)
        final.setdefault(boat_name, [])
        for word, count in counts.most_common():
            # Drop filler words, single chars, the boat's own name,
            # one-off words and plain numbers (parse_int).
            if (word not in filterwords
                    and len(word) > 1
                    and boat_name.lower().find(word) == -1
                    and count > 1
                    and not parse_int(word)):
                final[boat_name].append({
                    'word': word,
                    'count': count,
                })
    for name, common in final.items():
        post_data = {
            'body': {
                'keywords': common,
                'boat': name
            },
            "index": "boats",
            'doc_type': 'boat'
        }
        # Return value intentionally discarded (was rebinding `res`).
        es.index(**post_data)