def setup():
    global pages
    global urlalias
    global revurlalias
    global knn
    pages = dict()
    urlalias = dict()
    revurlalias = dict()
    knn = KNN()
    # Read the Drupal URL aliases so taxonomy/node paths can be mapped to friendly URLs.
    db = MySQLdb.connect(host="192.168.200.26", user="******", passwd="xxxsecretxxx", db="pla")
    cur = db.cursor()
    cur.execute("select source, alias from url_alias")
    for row in cur.fetchall():
        urlalias[row[1]] = row[0]
        revurlalias[row[0]] = row[1]
    # Train the classifier on taxonomy term names and descriptions.
    cur.execute("select tid, name, description, vid from taxonomy_term_data;")
    for row in cur.fetchall():
        url = 'taxonomy/term/' + str(row[0])
        pages[url] = row[1]
        if url in revurlalias:
            pages[revurlalias[url]] = row[1]
            url = revurlalias[url]
        if row[3] == 3:
            soup = bs4.BeautifulSoup(row[2])
            the_text = re.sub(r'[\n\r]+', r' ', soup.get_text(' ')).lower()
            knn.train(Document(the_text, stemmer=PORTER), url)
            knn.train(Document(row[1].lower()), url)
    # Train on the body text and title of nodes tagged with a practice area.
    cur.execute(
        "select a.tid, c.body_value, d.title from taxonomy_term_data as a "
        "inner join field_data_field_practice_areas as b "
        "on (a.tid=b.field_practice_areas_tid and b.entity_type='node' "
        "and b.bundle != 'professionals' and b.deleted=0) "
        "inner join field_data_body as c "
        "on (b.entity_id=c.entity_id and b.entity_type=c.entity_type) "
        "inner join node as d on (c.entity_id=d.nid);"
    )
    for row in cur.fetchall():
        url = 'taxonomy/term/' + str(row[0])
        if url in revurlalias:
            url = revurlalias[url]
        soup = bs4.BeautifulSoup(row[1])
        the_text = re.sub(r'[\n\r]+', r' ', soup.get_text(' ')).lower()
        knn.train(Document(the_text, stemmer=PORTER), url)
        knn.train(Document(row[2].lower()), url)
    # Record the titles of all published nodes.
    cur.execute("select nid, title from node where status=1;")
    for row in cur.fetchall():
        url = 'node/' + str(row[0])
        pages[url] = row[1]
        if url in revurlalias:
            pages[revurlalias[url]] = row[1]
    db.close()
    # Also train on previously logged queries and their targets
    # (conn is a separate database connection assumed to be opened elsewhere).
    pgcur = conn.cursor()
    pgcur.execute(
        "select query, target from website_queries where target is not null "
        "group by query, target"
    )
    for row in pgcur.fetchall():
        words = re.split(r'[\n\r,;]+ *', row[1])
        for word in words:
            print("training on " + row[0].lower() + " for " + word)
            knn.train(Document(row[0].lower()), word)
    conn.commit()
    pgcur.close()
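# A minimal usage sketch (not part of the original code): once setup() has run,
# the trained global knn can map a free-text query onto one of the trained URLs.
# The function name lookup() is hypothetical; knn.classify() and the pages dict
# come from the code above.
def lookup(query):
    url = knn.classify(Document(query.lower(), stemmer=PORTER))
    if url is None:
        return None  # query did not resemble any trained page
    return url, pages.get(url)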
def train(cls, train_file, model_file):
    sents_dic = (json.loads(jsonl) for jsonl in SoftSkills.load(train_file))
    model = KNN()
    for sent in sents_dic:
        text = sent['text']
        v = count([word for word, pos in tag(text)])  # {'sweet': 1}
        if v:
            model.train(v, type=sent['soft skill'])
    model.save(model_file)
    return model
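# A rough sketch of how the saved model might be used later (assumed helper, not
# from the original source); it mirrors the feature extraction used in train()
# and relies on pattern's Classifier.save()/load() round-trip used above.
from pattern.en import tag
from pattern.vector import KNN, count

def predict(model_file, text):
    model = KNN.load(model_file)                   # reload the pickled classifier
    v = count([word for word, pos in tag(text)])   # same bag-of-words features as training
    return model.classify(v)                       # predicted 'soft skill' label, or None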
s = Sentence(parse(s))                # parse tree with part-of-speech tags
s = search('JJ', s)                   # adjectives in the tweet
s = [match[0].string for match in s]  # adjectives as a list of strings
s = " ".join(s)                       # adjectives as string
if len(s) > 0:
    m.append(Document(s, type=p, stemmer=None))

# Train k-Nearest Neighbor on the model.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with testing data, see the documentation for Classifier.test().
classifier = KNN(baseline=None)  # By default, baseline=MAJORITY
for document in m:               # (classify unknown documents with the most frequent type).
    classifier.train(document)

# These are the adjectives the classifier has learned:
print sorted(classifier.features)
print

# We can now ask it to classify documents containing these words.
# Note that you may get different results than the ones below,
# since you will be mining other (more recent) tweets.
# Again, a robust classifier needs lots and lots of training data.
# If None is returned, the word was not recognized,
# and the classifier returned the default value (see above).
print classifier.classify('sweet potato burger')  # yields 'WIN'
print classifier.classify('stupid autocorrect')   # yields 'FAIL'

# "What can I do with it?"
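# A small helper sketch (not in the original example): apply the same adjective
# extraction to an unseen tweet before classifying it, so that prediction uses
# the same kind of features the classifier was trained on. classify_tweet() is
# a hypothetical name.
def classify_tweet(tweet):
    t = Sentence(parse(tweet))
    adjectives = " ".join(match[0].string for match in search('JJ', t))
    return classifier.classify(Document(adjectives, stemmer=None))

print classify_tweet('that new burger place is awesome')  # e.g. 'WIN'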
s = Sentence(parse(s))  # parse analyzes & gives strings that are annotated with specified tags
s = search('JJ', s)     # searches for adjectives in tweets (JJ = adjective)
s = [match[0].string for match in s]
s = ' '.join(s)
if len(s) > 0:
    corpus.append(Document(s, type=p))
    corpus.append(Document(s, type=m))

classifier = KNN()  # k-nearest neighbor classifier = K-NN
objects = []
for document in corpus:         # documents are an unordered bag of given sentences.
    classifier.train(document)  # adjective vectors in corpus train the classifier
objects.append(classifier.classify('awesome'))  # predicts awesome as win
objects.append(classifier.classify('cool'))     # predicts cool as win
objects.append(classifier.classify('damn'))     # predicts damn as fail
objects.append(classifier.classify('sucks'))    # predicts sucks as fail
print objects

wincounter = 0
failcounter = 0
for thing in objects:
    if thing == 'WIN':
        wincounter += 1
    elif thing == 'FAIL':
        failcounter += 1
    else:
        pass
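# A small follow-up sketch (not in the original): report the share of WIN
# predictions from the tally above.
total = wincounter + failcounter
if total > 0:
    print 'WIN ratio: %.2f' % (float(wincounter) / total)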
s = search('JJ', s)                   # adjectives in the tweet
s = [match[0].string for match in s]  # adjectives as a list of strings
s = " ".join(s)                       # adjectives as string
if len(s) > 0:
    m.append(Document(s, type=p, stemmer=None))

# Train k-Nearest Neighbor on the model.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with testing data, see the documentation for Classifier.test().
classifier = KNN(baseline=None)  # By default, baseline=MAJORITY
                                 # (classify unknown documents with the most frequent type).
for document in m:
    classifier.train(document)

# These are the adjectives the classifier has learned:
print(sorted(classifier.features))
print()

# We can now ask it to classify documents containing these words.
# Note that you may get different results than the ones below,
# since you will be mining other (more recent) tweets.
# Again, a robust classifier needs lots and lots of training data.
# If None is returned, the word was not recognized,
# and the classifier returned the default value (see above).
print(classifier.classify('sweet potato burger'))  # yields 'WIN'
print(classifier.classify('stupid autocorrect'))   # yields 'FAIL'

# "What can I do with it?"
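# A minimal evaluation sketch, assuming pattern.vector's Classifier.test()
# classmethod mentioned in the comment above; it runs k-fold cross-validation
# over the labeled documents in m and returns overall scores.
accuracy, precision, recall, f1 = KNN.test(m, folds=10)
print(accuracy, precision, recall, f1)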