def main(args):
    f = open(args.filename)
    D = {}
    tag_set = set([])
    tb = Blobber(pos_tagger=PerceptronTagger())
    for i, line in enumerate(f):
        b1 = tb(line)
        for w, t in b1.tags:
            tag_set.add(t)
            if w not in D:
                D[w] = Counter()
            D[w][t] = float(D[w][t] + 1)
    sorted_pos_tags = sorted(list(tag_set))
    rows = []
    for w in D.keys():
        row = [w]
        pos_counts_word = np.array([float(D[w][t]) for t in sorted_pos_tags])
        pos_dist_word = pos_counts_word / float(np.sum(pos_counts_word))
        assert (np.isclose(np.sum(pos_dist_word), 1.0))
        row = row + list(pos_dist_word)
        rows.append(row)
    header = ['word'] + sorted_pos_tags
    print("Set of POS tags in sorted order", header)
    df = pd.DataFrame().from_records(rows, columns=header)
    print("Dumping the POS distribution.")
    df.to_csv(args.outputfile, index=None, encoding='utf-8')
def ExtendText(fileName, tagger=PerceptronTagger()):
    with io.open(fileName, 'r') as w:
        text = TextBlob(w.read(), pos_tagger=tagger)
    extended_text = []
    for sent in text.sentences:
        for word in sent.pos_tags:
            #word = "bank"
            penn_tags = ['JJ', 'NN', 'V']
            extending = False
            for tag in penn_tags:
                if tag in word[1]:
                    extending = True
                    pos = tag[0].lower()
                    try:
                        l = lesk(sent.string, word[0].lower(), pos)
                        syns = l._lemma_names
                        for syn in syns:
                            extended_text.append(syn)
                        break
                    except:
                        extended_text.append(word[0].lower())
            if not extending:
                extended_text.append(word[0].lower())
    extended_text = ' '.join([word for word in extended_text
                              if word not in cachedStopWords]).lstrip()
    return extended_text
class TestPerceptronTagger(unittest.TestCase):

    def setUp(self):
        self.text = ("Simple is better than complex. "
                     "Complex is better than complicated.")
        self.tagger = PerceptronTagger(load=False)

    def test_init(self):
        tagger = PerceptronTagger(load=False)
        assert_true(isinstance(tagger, BaseTagger))

    def test_train(self):
        sentences = _read_tagged(_wsj_train)
        nr_iter = 5
        self.tagger.train(sentences, nr_iter=nr_iter)
        nr_words = sum(len(words) for words, tags in sentences)
        # Check that the model has 'ticked over' once per instance
        assert_equal(nr_words * nr_iter, self.tagger.model.i)
        # Check that the tagger has a class for every seen tag
        tag_set = set()
        for _, tags in sentences:
            tag_set.update(tags)
        assert_equal(len(tag_set), len(self.tagger.model.classes))
        for tag in tag_set:
            assert_true(tag in self.tagger.model.classes)

    @attr("slow")
    def test_tag(self):
        trained_tagger = PerceptronTagger()
        tokens = trained_tagger.tag(self.text)
        assert_equal([w for w, t in tokens],
                     ['Simple', 'is', 'better', 'than', 'complex', '.',
                      'Complex', 'is', 'better', 'than', 'complicated', '.'])

    @attr("slow")
    def test_tag_textblob(self):
        trained_tagger = PerceptronTagger()
        blob = TextBlob(self.text, pos_tagger=trained_tagger)
        # Punctuation is excluded
        assert_equal([w for w, t in blob.tags],
                     ['Simple', 'is', 'better', 'than', 'complex',
                      'Complex', 'is', 'better', 'than', 'complicated'])

    def test_loading_missing_file_raises_missing_corpus_exception(self):
        tagger = PerceptronTagger(load=False)
        assert_raises(MissingCorpusError, tagger.load, 'missing.pickle')
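# The sketch below is a minimal, hypothetical illustration of the train/tag API
# exercised by the tests above; it is not part of the test module. The toy
# corpus follows the (words, tags) pair format implied by test_train; real
# training would use a far larger tagged corpus such as the WSJ sample.
from textblob_aptagger import PerceptronTagger

toy_sentences = [
    (['Simple', 'is', 'better'], ['NNP', 'VBZ', 'JJR']),
    (['Complex', 'is', 'better'], ['NNP', 'VBZ', 'JJR']),
]
tagger = PerceptronTagger(load=False)
tagger.train(toy_sentences, nr_iter=5)   # same call signature as in test_train
print(tagger.tag("Simple is better"))    # list of (word, tag) pairs, as in test_tag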
def pronounSub(string):
    tokens = TextBlob(string, pos_tagger=PerceptronTagger())
    for word, speech in tokens.tags:
        sub = ""
        if speech == "PRP":
            sub = "I"   # assignment, not comparison, so the substitution actually takes effect
        if speech == "PRP$":
            sub = "My"
        # pass re.I as the flags argument rather than as the positional count argument
        string = re.sub(speech, sub, string, flags=re.I)
    return string
def get_perceptron_tagger(self):
    """
    Perform preprocessing (shallow parsing) with the state-of-the-art
    PerceptronTagger (~98.8% accuracy). Note that punctuation is stripped
    to avoid ambiguity.
    http://stevenloria.com/tutorial-state-of-the-art-part-of-speech-tagging-in-textblob/
    """
    from textblob import Blobber
    from textblob_aptagger import PerceptronTagger
    tb = Blobber(pos_tagger=PerceptronTagger())
    return tb
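# A minimal usage sketch for the Blobber returned above (not part of the
# original snippet): the Blobber applies the same PerceptronTagger instance to
# every blob it creates, so the model is loaded once and reused.
from textblob import Blobber
from textblob_aptagger import PerceptronTagger

tb = Blobber(pos_tagger=PerceptronTagger())
blob = tb("Simple is better than complex.")
print(blob.tags)  # [(word, tag), ...]; exact tags depend on the trained model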
def __init__(self, winSize, lex=False):
    self.winSize = winSize
    self.lex = lex
    self.lexFilename = "Lex" if lex else ""
    self.winMod = WindowProb("C:/MissingWord/post" + self.lexFilename + "ModComp" + str(self.winSize) + ".pickle",
                             compressed=True)
    self.winOrig = WindowProb("C:/MissingWord/post" + self.lexFilename + "Comp" + str(self.winSize) + ".pickle",
                              compressed=True)
    with open("toLexicalize.pickle", "rb") as f:
        self.toLexicalize = pickle.load(f)
    self.aptagger = PerceptronTagger()
def posDist(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    POStags = [tag for word, tag in TaggingTools.tagPOS(text)]
    possibleTags = PerceptronTagger().model.classes
    vector = {}
    total = 0
    for tag in possibleTags:
        vector[tag] = 0
    for tag in POStags:
        vector[tag] += 1
        total += 1
    for tag in possibleTags:
        vector[tag] = int(100 * vector[tag] / total)
    return vector
def cal_sentiment(tweet_set):
    for tweet in tweet_set:
        tb = TextBlob(tweet['text'].lower(), pos_tagger=PerceptronTagger())
        nb = TextBlob(tweet['text'], analyzer=NaiveBayesAnalyzer())
        pol_tb = str(tb.polarity)
        sub_tb = str(tb.subjectivity)
        pol_nb = str(nb.polarity)
        sub_nb = str(nb.subjectivity)
        sent_p_nb = str(nb.sentiment[1])
        sent_n_nb = str(nb.sentiment[2])
        sent_c_nb = str(nb.sentiment[0])
        sent = [pol_tb, sub_tb, pol_nb, sub_nb, sent_p_nb, sent_n_nb, sent_c_nb]
    return sent
def relevance(file):
    hvs = GetHighVectorLogSums('high')
    percepticon = PerceptronTagger()
    cat_dict = defaultdict(int)
    files = fio.recGetTextFiles('training')
    file_sum = 0
    extended_text = ExtendText(file, percepticon)
    word_count = 0
    with open(file, 'r') as f:
        word_count = len(f.read().split())
    for term in extended_text.split():
        if term in hvs.keys():
            file_sum += hvs[term]
    file_sum = file_sum / len(extended_text.split())
    return file_sum
def GetVectors():
    essay_path = 'training'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    percepticon = PerceptronTagger()
    cat_dict = defaultdict(int)
    for f in files:
        extended_text = ExtendText(f, percepticon)
        name = ''
        cats = ['high', 'medium', 'low']
        for cat in cats:
            if cat in f:
                name = cat + str(cat_dict[cat])
                cat_dict[cat] += 1
        docs.append(Document(extended_text, name=name, top=None))
    m = Model(docs)
    #lsa = m.reduce(5)
    return m
def __init__(self, winSize=5, lex=False, compFile=False):
    self.lex = lex
    self.lexFilename = "lex" if lex else ""
    comp = "Comp" if lex else "comp"
    if not compFile:
        comp = ""
    self.winSize = winSize
    self.winOrig = WindowProb("C:/MissingWord/" + self.lexFilename + comp + str(self.winSize) + ".pickle",
                              compressed=compFile)
    with open("lexTags.pickle", "rb") as f:
        # inefficient to use a lexicalized set, but it will still work for unlexicalized models
        self.allLexTags = pickle.load(f)
    with open("toLexicalize.pickle", "rb") as f:
        self.toLexicalize = pickle.load(f)
    self.tagger = PerceptronTagger()
def main(args):
    lines = ioutils.load_word_list(args.filename)
    # f = open(args.filename)
    D = {}
    tag_set = set([])
    tb = Blobber(pos_tagger=PerceptronTagger())
    for i, line in enumerate(lines):
        b1 = tb(line)
        for w, t in b1.tags:
            tag_set.add(t)
            if w not in D:
                D[w] = Counter()
            D[w][t] = float(D[w][t] + 1)
    # print D['fawn'].most_common(1)[0]
    # print D['yellow'].most_common(1)[0]
    sorted_pos_tags = sorted(list(tag_set))
    rows = []
    most_common_rows = []
    for w in D.keys():
        row = [w]
        pos_counts_word = np.array([float(D[w][t]) for t in sorted_pos_tags])
        pos_dist_word = pos_counts_word / float(np.sum(pos_counts_word))
        assert (np.isclose(np.sum(pos_dist_word), 1.0))
        row = row + list(pos_dist_word)
        rows.append(row)
        most_common_rows.append([w, np.max(pos_counts_word),
                                 sorted_pos_tags[np.argmax(pos_counts_word)]])
    header = ['word'] + sorted_pos_tags
    print "Set of POS tags in sorted order", header
    df = pd.DataFrame().from_records(rows, columns=header)
    print "Dumping the POS distribution."
    df.to_csv(args.outputfile + ".csv", index=None, encoding='utf-8')
    print "Dumping most common pos tag"
    df2 = pd.DataFrame().from_records(most_common_rows, columns=['word', 'count', 'POS'])
    df2.to_csv(args.outputfile + "_pos.csv", index=None, encoding='utf-8')
def __init__(self):
    # create custom components
    self.naive_bayes_analyzer = NaiveBayesAnalyzer()
    self.conll_extractor = ConllExtractor()
    self.nltk_tagger = NLTKTagger()
    self.perceptron_tagger = PerceptronTagger()
    if DEV_ENV:
        return
    # train all components (default and custom)
    text = 'TextBlob blobs great!'
    default_blob = TextBlob(text)
    default_blob.sentiment
    default_blob.noun_phrases
    default_blob.pos_tags
    custom_blob = TextBlob(text,
                           analyzer=self.naive_bayes_analyzer,
                           np_extractor=self.conll_extractor,
                           pos_tagger=self.nltk_tagger)
    custom_blob.sentiment
    custom_blob.noun_phrases
    custom_blob.pos_tags
    custom2_blob = TextBlob(text, pos_tagger=self.perceptron_tagger)
    custom2_blob.pos_tags
def parse_doc(json_iter, pos_tagger=None, force_encode=False):
    """parse one document to prep for TextRank"""
    global DEBUG, POS_TAGGER
    # set up the PoS tagger, defaults to PerceptronTagger from TextBlob
    if not pos_tagger:
        if not POS_TAGGER:
            POS_TAGGER = PerceptronTagger()
        pos_tagger = POS_TAGGER
    for meta in json_iter:
        base_idx = 0
        for graf_text in filter_quotes(meta["text"], is_email=False):
            if DEBUG:
                print("graf_text:", graf_text)
            grafs, new_base_idx = parse_graf(meta["id"], graf_text, base_idx,
                                             pos_tagger, force_encode)
            base_idx = new_base_idx
            for graf in grafs:
                yield graf
def __init__(self):
    self.lex = self.load_lexicon()
    self.blobber = Blobber(pos_tagger=PerceptronTagger())
def tagPOS(text):
    blob = TextBlob(text, pos_tagger=PerceptronTagger())
    return blob.tags
def GetHighVectorLogSums(label):
    m = GetVectors()
    high_tf_sums = defaultdict(float)
    for corpuscle in m._documents:
        if label in corpuscle._name:
            for tf in corpuscle.vector:
                high_tf_sums[tf] -= math.log(corpuscle.vector[tf])
    return high_tf_sums


if __name__ == '__main__':
    hvs = GetHighVectorLogSums('high')
    percepticon = PerceptronTagger()
    cat_dict = defaultdict(int)
    files = fio.recGetTextFiles(r'C:\Users\William\Desktop\421_Final\training')
    file_sums = []
    for file in files:
        file_sum = 0
        extended_text = ExtendText(file, percepticon)
        word_count = 0
        with open(file, 'r') as f:
            word_count = len(f.read().split())
        for term in extended_text.split():
            # learn below weights through experimentation
            if term in hvs.keys():
                file_sum += hvs[term]
        file_sums.append(file_sum / len(extended_text.split()))
    print('mean' + str(mean(file_sums)))
import pandas as pd
from textblob import TextBlob
from textblob_aptagger import PerceptronTagger

NEED_POS = ['JJ', 'JJR', 'JJS', 'NN', 'NNS', 'RB', 'RBR', 'RBS',
            'VB', 'VBD', 'VBG', 'VBN']


def remove_extra_tags(tags_list):
    return_tags_list = []
    for t in tags_list:
        if t[1] in NEED_POS:
            return_tags_list.append(t)
    return return_tags_list


if __name__ == '__main__':
    # file_name = '../../testReviews.csv'
    file_name = '../../MProductReviewsLatest.csv'
    reviews = pd.read_csv(file_name)
    reviews['postagged_body'] = reviews['Body'].map(
        lambda x: TextBlob(x, pos_tagger=PerceptronTagger()).tags)
    reviews['postagged_body'] = reviews['postagged_body'].map(
        lambda x: remove_extra_tags(x))
    reviews.to_csv('../../MProductReviewsLatestPOStagged.csv', sep='\t')
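# A small, self-contained illustration of remove_extra_tags() above (added for
# clarity, not from the original script). The input mimics the (word, tag)
# pairs produced by TextBlob's .tags property; only tags listed in NEED_POS
# survive the filter.
sample_tags = [('great', 'JJ'), ('phone', 'NN'), ('the', 'DT'), ('quickly', 'RB')]
print(remove_extra_tags(sample_tags))  # [('great', 'JJ'), ('phone', 'NN'), ('quickly', 'RB')]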
poly = fiona.open('D:\data\open_toronto\NEIGHBORHOODS_WGS84.shp')
for rex in poly:
    p = geometry.shape(rex['geometry'])
    xy = zip(p.boundary.xy[1], p.boundary.xy[0])
    users = toronto_tweets.find({'geo.coordinates': {"$near": xy[4]}}).distinct('user.id')
    for user_id in users:
        for tweet in toronto_tweets.find({'user.id': user_id}):
            if not tweet.has_key('pol_tb'):
                tb = TextBlob(tweet['text'].lower(), pos_tagger=PerceptronTagger())
                nb = TextBlob(tweet['text'], analyzer=NaiveBayesAnalyzer())
                pol_tb = tb.polarity
                sub_tb = tb.subjectivity
                pol_nb = nb.polarity
                sub_nb = nb.subjectivity
                sent_p_nb = nb.sentiment[1]
                sent_n_nb = nb.sentiment[2]
                sent_c_nb = nb.sentiment[0]
                tweet['pol_tb'] = pol_tb
                tweet['sub_tb'] = sub_tb
                tweet['sent_nltk'] = sent_c_nb
                tweet['sen_pos_n'] = sent_p_nb
                tweet['sen_neg_n'] = sent_n_nb
                tweet['pol_nltk'] = pol_nb
                tweet['sub_nltk'] = sub_nb
import pickle
from collections import Counter
import generateTagWindows
from textblob import TextBlob
from textblob_aptagger import PerceptronTagger
import math
from WindowProb import WindowProb
import statistics
import numpy
import lexicalizedTagWindows

WIN_SIZE = 5
WIN_OFFSET = int((WIN_SIZE - 1) / 2)
aptagger = PerceptronTagger()


def removeWord(tokens):
    removed = []
    for i in range(1, len(tokens) - 1):
        removed.append([token for ind, token in enumerate(tokens) if ind != i])
    return removed


def modLikelihood(allLexTags, modProb4, windowProb5, tags, tagIndex):
    probs = []
    for alt in allLexTags:
        win4 = generateTagWindows.makeWindow(tags, begin=tagIndex - 2, end=tagIndex + 2)
def benchmaking(self):
    test = [
        [(u'Pierre', u'NNP'), (u'Vinken', u'NNP'), (u',', u','), (u'61', u'CD'),
         (u'years', u'NNS'), (u'old', u'JJ'), (u',', u','), (u'will', u'MD'),
         (u'join', u'VB'), (u'the', u'DT'), (u'board', u'NN'), (u'as', u'IN'),
         (u'a', u'DT'), (u'nonexecutive', u'JJ'), (u'director', u'NN'),
         (u'Nov.', u'NNP'), (u'29', u'CD'), (u'.', u'.')],
        [(u'Mr.', u'NNP'), (u'Vinken', u'NNP'), (u'is', u'VBZ'), (u'chairman', u'NN'),
         (u'of', u'IN'), (u'Elsevier', u'NNP'), (u'N.V.', u'NNP'), (u',', u','),
         (u'the', u'DT'), (u'Dutch', u'NNP'), (u'publishing', u'VBG'), (u'group', u'NN'),
         (u'.', u'.'), (u'Rudolph', u'NNP'), (u'Agnew', u'NNP'), (u',', u','),
         (u'55', u'CD'), (u'years', u'NNS'), (u'old', u'JJ'), (u'and', u'CC'),
         (u'former', u'JJ'), (u'chairman', u'NN'), (u'of', u'IN'),
         (u'Consolidated', u'NNP'), (u'Gold', u'NNP'), (u'Fields', u'NNP'),
         (u'PLC', u'NNP'), (u',', u','), (u'was', u'VBD'), (u'named', u'VBN'),
         (u'a', u'DT'), (u'nonexecutive', u'JJ'), (u'director', u'NN'), (u'of', u'IN'),
         (u'this', u'DT'), (u'British', u'JJ'), (u'industrial', u'JJ'),
         (u'conglomerate', u'NN'), (u'.', u'.')],
        [(u'A', u'DT'), (u'form', u'NN'), (u'of', u'IN'), (u'asbestos', u'NN'),
         (u'once', u'RB'), (u'used', u'VBN'), (u'to', u'TO'), (u'make', u'VB'),
         (u'Kent', u'NNP'), (u'cigarette', u'NN'), (u'filters', u'NNS'),
         (u'has', u'VBZ'), (u'caused', u'VBN'), (u'a', u'DT'), (u'high', u'JJ'),
         (u'percentage', u'NN'), (u'of', u'IN'), (u'cancer', u'NN'),
         (u'deaths', u'NNS'), (u'among', u'IN'), (u'a', u'DT'), (u'group', u'NN'),
         (u'of', u'IN'), (u'workers', u'NNS'), (u'exposed', u'VBN'), (u'to', u'TO'),
         (u'it', u'PRP'), (u'more', u'RBR'), (u'than', u'IN'), (u'30', u'CD'),
         (u'years', u'NNS'), (u'ago', u'IN'), (u',', u','), (u'researchers', u'NNS'),
         (u'reported', u'VBD'), (u'.', u'.')],
    ]
    """ [(u'A', u'DT'), (u'forge', u'NN'), (u'is', u'VBZ'), (u'a', u'DT'),
    (u'type', u'NN'), (u'of', u'IN'), (u'hearth', u'JJ'), (u'used', u'VBN'),
    (u'for', u'IN'), (u'heating', u'NN'), (u'metals', u'NNS'), (u'.', u'.'),
    (u'or', u'CC'), (u'the', u'DT'), (u'workplace', u'NN'), (u'(', u'('),
    (u'smithy', u'JJ'), (u')', u')'), (u'where', u'WRB'), (u'such', u'JJ'),
    (u'a', u'DT'), (u'hearth', u'JJ'), (u'is', u'VBZ'), (u'located', u'VBN'),
    (u'.', u'.')]"""
    from textblob_aptagger import PerceptronTagger
    import nltk
    print("perceptron tagger accuracy based on conll2000: ",
          self.pos_accuracy([nltk.corpus.conll2000.tagged_words()[:30],
                             nltk.corpus.conll2000.tagged_words()[30:60],
                             nltk.corpus.conll2000.tagged_words()[60:90],
                             nltk.corpus.conll2000.tagged_words()[90:120],
                             nltk.corpus.conll2000.tagged_words()[120:150],
                             nltk.corpus.conll2000.tagged_words()[300:330]],
                            PerceptronTagger()))
    print("NLTK pos tagger accuracy based on conll2000: ",
          self.pos_accuracy([nltk.corpus.conll2000.tagged_words()[:30],
                             nltk.corpus.conll2000.tagged_words()[30:60],
                             nltk.corpus.conll2000.tagged_words()[60:90],
                             nltk.corpus.conll2000.tagged_words()[90:120],
                             nltk.corpus.conll2000.tagged_words()[120:150],
                             nltk.corpus.conll2000.tagged_words()[300:330]],
                            self.nltk_pos_tag()))
    '''print("NLTK pos tagger accuracy based on brown corpus: ",
          self.pos_accuracy([nltk.corpus.brown.tagged_words()[:30],
                             nltk.corpus.brown.tagged_words()[30:60],
                             nltk.corpus.brown.tagged_words()[60:90],
                             nltk.corpus.brown.tagged_words()[90:120],
                             nltk.corpus.brown.tagged_words()[120:150],
                             nltk.corpus.brown.tagged_words()[300:330]],
                            self.nltk_pos_tag()))'''
    print("perceptron tagger accuracy based on test data: ",
          self.pos_accuracy(test, PerceptronTagger()))
    print("NLTK pos tagger accuracy based on test data: ",
          self.pos_accuracy(test, self.nltk_pos_tag()))
    '''print("NLTK pos tagger accuracy based on brown corpus: ",
          self.pos_accuracy([nltk.corpus.brown.tagged_words()[:30],
## buildtree.py
## Author: Yangfeng Ji
## Date: 09-10-2014
## Time-stamp: <yangfeng 09/29/2014 15:15:23>

from datastructure import *
from textblob import TextBlob
from textblob_aptagger import PerceptronTagger
from util import extractrelation
from maltparser import get_head_words

perceptron_tagger = PerceptronTagger()


def BFT(tree):
    """ Breadth-first traversal on general RST tree

    :type tree: SpanNode instance
    :param tree: a general RST tree
    """
    queue = [tree]
    bft_nodelist = []
    while queue:
        node = queue.pop(0)
        bft_nodelist.append(node)
        queue += node.nodelist
    return bft_nodelist


def BFTbin(tree):
    """ Breadth-first traversal on binary RST tree
def process_twitter(actionability_ranking: pd.DataFrame):
    """Filter actionable items.

    :param data:
    :return:
    """
    ### LOAD POS TAGGER TO HELP WITH ACTIONABILITY SCORING
    tb = Blobber(pos_tagger=PerceptronTagger())
    ### CREATE EMPTY LIST FOR SCORES FROM TEXT ANALYSIS
    language_scores = []
    ### WE DONT WANT PAST TENSE VERBS, AND ADVERBS INDICATE NEWS
    bad_verbs = ['VBZ', 'VBN', 'VBD', 'RB']
    good_verbs = ['VB', 'VBG', 'VBP', 'JJ']
    # Part of speech tag each tweet
    for tweet in actionability_ranking['tweet'].tolist():
        tagged = tb(tweet.lower())
        tag_list = [x[1] for x in tagged.tags]
        score = 0
        # Penalize tweets with structures that are known to be not what we're looking for
        # POS Tags available here: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
        if tag_list[:2] == ['NNP', 'POS']: score -= 4
        if tag_list[:3] == ['NN', 'POS', 'NN']: score -= 10
        if tag_list[:3] == ['NN', 'NN', 'TO']: score -= 10
        if tag_list[:3] == ['NN', 'TO', 'VB']: score -= 10
        if tag_list[:3] == ['NN', 'NN', 'VBZ']: score -= 10
        if tag_list[:3] == ['NN', 'NN', 'JJ']: score -= 10
        if tag_list[:3] == ['NN', 'JJ', 'NN']: score -= 10
        if tag_list[:3] == ['JJ', 'NNS', 'VBP']: score -= 4
        elif tag_list[:3] == ['JJ', 'NN', 'VBG']: score -= 4
        elif tag_list[:3] == ['JJS', 'NN', 'JJ']: score -= 4
        elif tag_list[:3] == ['NN', 'NN', 'NN']: score -= 4
        elif tag_list[:3] == ['JJ', 'NN', 'NN']: score -= 4
        elif tag_list[:3] == ['JJ', 'NN', 'NNS']: score -= 4
        elif tag_list[:3] == ['JJ', 'NN', 'VB']: score -= 10
        elif tag_list[:3] == ['IN', 'JJ', 'NN']: score -= 10
        elif tag_list[0] == 'JJ': score += 3
        if tag_list[0] == 'VB': score += 4
        # Penalize tweets with words in them that indicate they are not useful
        if 'yesterday' in tweet.lower(): score -= 10
        if 'last week' in tweet.lower(): score -= 10
        if 'video' in tweet.lower(): score -= 10
        if 'news search' in tweet.lower(): score -= 10
        if 'town hall in facebook' in tweet.lower(): score -= 20
        if 'townhall in facebook' in tweet.lower(): score -= 20
        if "facebook's" in tweet.lower() and "town hall" in tweet.lower(): score -= 20
        if 'cnn town hall' in tweet.lower(): score -= 20
        if 'online paper' in tweet.lower(): score -= 20
        if 'join us' in tweet.lower(): score -= 10
        if 'dreamhome' in tweet.lower(): score -= 20
        if 'sales' in tweet.lower(): score -= 20
        if 'relaxing' in tweet.lower(): score -= 20
        if 'ice ice baby' in tweet.lower(): score -= 20
        if 'interior design' in tweet.lower(): score -= 20
        if 'canada' in tweet.lower(): score -= 20
        if 'toyota' in tweet.lower(): score -= 20
        if 'minister' in tweet.lower(): score -= 20
        if 'uk' in tweet.lower(): score -= 20
        if 'MP' in tweet: score -= 20
        if 'eu' in tweet.lower(): score -= 20
        if 'england' in tweet.lower(): score -= 20
        if 'furniture' in tweet.lower(): score -= 20
        if 'kitchen' in tweet.lower(): score -= 20
        if 'germany' in tweet.lower(): score -= 20
        if 'south africa' in tweet.lower(): score -= 20
        if 'News' in tweet: score -= 10
        if 'daily beast' in tweet.lower(): score -= 20
        if '#design' in tweet.lower(): score -= 20
        if '#interior' in tweet.lower(): score -= 20
        if 'radicalisation' in tweet.lower(): score -= 20
        if 'militancy' in tweet.lower(): score -= 20
        if 'sharia' in tweet.lower(): score -= 20
        if 'uncontrolled' in tweet.lower(): score -= 20
        if 'enjoy a' in tweet.lower(): score -= 20
        if tweet.lower().startswith('the latest'): score -= 20
        if 'wapo' in tweet.lower(): score -= 20
        if 'nytimes' in tweet.lower(): score -= 20
        if '5 things' in tweet.lower(): score -= 20
        if 'poland' in tweet.lower(): score -= 20
        if 'hungary' in tweet.lower(): score -= 20
        if 'slovakia' in tweet.lower(): score -= 20
        if 'czech' in tweet.lower(): score -= 20
        if 'egypt' in tweet.lower(): score -= 20
        if 'austria' in tweet.lower(): score -= 20
        if 'germany' in tweet.lower(): score -= 20
        if 'hiring' in tweet.lower(): score -= 20
        if 'ice show' in tweet.lower(): score -= 20
        if 'ice cream' in tweet.lower(): score -= 20
        if 'secure border' in tweet.lower(): score -= 20
        if 'hire thousands' in tweet.lower(): score -= 20
        if 'demand congress hire' in tweet.lower(): score -= 20
        if 'illegals' in tweet.lower(): score -= 20
        if 'icecream' in tweet.lower(): score -= 20
        if 'avalanche' in tweet.lower(): score -= 20
        if 'ice cold' in tweet.lower(): score -= 20
        if 'snow' in tweet.lower(): score -= 20
        if 'alex jones' in tweet.lower(): score -= 1000
        if 'viral' in tweet.lower(): score -= 20
        if 'scotland' in tweet.lower(): score -= 20
        if 'brexit' in tweet.lower(): score -= 20
        if 'caretoclick' in tweet.lower(): score -= 10
        if 'ministry' in tweet.lower(): score -= 20
        if 'ugh' in tweet.lower(): score -= 30
        if 'london' in tweet.lower(): score -= 30
        if 'wales' in tweet.lower(): score -= 30
        if '@youtube' in tweet.lower(): score -= 20
        if 'breitbart' in tweet.lower(): score -= 20
        if 'hypocrite' in tweet.lower(): score -= 20
        if 'moron' in tweet.lower(): score -= 20
        if '!!!' in tweet.lower(): score -= 10
        if 'Take Action: Sign Petition' in tweet: score -= 50
        if 'petition' in tweet.lower(): score -= 15
        if 'signandshareorg' in tweet.lower(): score -= 10
        if re.findall(profanity_regex, tweet.lower()): score -= 50
        # Penalize tweets with tons of hashtags
        if tweet.count('#') > 2:
            score -= (tweet.count('#') - 2) * 5
        # Penalize tweets with tons of mentions
        if tweet.count('@') > 2:
            score -= (tweet.count('@') - 2) * 5
        # Reward tweets with good verbs and no bad verbs
        # PENALIZE TWEETS WITH BAD VERBS AND FEW GOOD VERBS
        verb_score = 0
        for tag in tag_list:
            if tag in bad_verbs:
                verb_score -= 2
            if tag in good_verbs:
                verb_score += 1
        score += int(verb_score * 1.0 / len(tag_list)) * 4
        # Reward polite tweets that encourage action
        if tweet.lower().startswith('please'):
            score += 10
        # Other score adjustments
        # Reward tweets that are longer
        score += int(len(tweet) / 60) * 4
        language_scores.append(score)
    ### ADD LANGUAGE SCORE TO PANDAS DF
    actionability_ranking['pos_score'] = np.asarray(language_scores)
    ### ADD ACTIONABILITY SCORE TO PANDAS DF BASED ON FIELDS WE EXTRACTED
    actionability_ranking['actionability_score'] = (
        np.where(actionability_ranking['tweet_cities'] == '', 0, 15) +
        np.where(actionability_ranking['tweet_states'] == '', 0, 15) +
        np.where(actionability_ranking['tweet_urls'] == '', 0, 5) +
        np.where(actionability_ranking['tweet_phone_numbers'] == '', 0, 20) +
        np.where(actionability_ranking['tweet_dates_ref'] == '', 0, 10) +
        np.where(actionability_ranking['tweet_legislator_names'] == '', 0, 15) +
        np.where(actionability_ranking['tweet_legislator_handles'] == '', 0, 15) +
        np.where(actionability_ranking['tweet'].str.startswith('@'), -10, 0) +
        np.where(actionability_ranking['tweet'].str.startswith('.@'), -10, 0))
    ### CALCULATE THE TOTAL SCORE
    actionability_ranking['total_score'] = (
        actionability_ranking['es_score'] +
        actionability_ranking['actionability_score'] +
        actionability_ranking['pos_score'])
    ### FILTER THE DF BY TOTAL SCORE AND ELASTIC SEARCH RELEVANCE
    filtered_data = actionability_ranking.loc[
        (actionability_ranking['total_score'] > 8.5) &
        (actionability_ranking['es_score'] > 7.0)]
    filtered_tweet_list = filtered_data['tweet'].tolist()
    filtered_score_list = filtered_data['total_score'].tolist()
    filtered_es_score_list = filtered_data['es_score'].tolist()
    ### DE-DUPLICATE USING EDIT-DISTANCE < 60 EDITS AS A FILTER
    ### THIS PART TAKES A WHILE: TWEETS^2 COMPARISONS. THAT'S WHY WE FILTER FIRST.
    distance_dict = {}
    for i, tweet in enumerate(filtered_tweet_list):
        for j, tweet2 in enumerate(filtered_tweet_list):
            tweet_ids = tuple(sorted([i, j]))
            if i == j:
                pass
            else:
                distance = editdistance.eval(tweet, tweet2)
                if distance <= 60:
                    distance_dict[tweet_ids] = distance
    ### FOR DUPLICATES, WE'LL TAKE THE MORE ACTIONABLE/RELEVANT OF THE TWO
    delete_indices = []
    for (i, j), v in distance_dict.items():
        if filtered_score_list[i] >= filtered_score_list[j]:
            delete_indices.append(j)
        else:
            delete_indices.append(i)
    delete_indices = list(set(delete_indices))
    ### WE DELETE ROWS WITH THE INDICES THAT WERE FOUND TO BE DUPLICATED AND LESS ACTIONABLE
    filtered_data.drop(filtered_data.index[delete_indices], inplace=True, errors='ignore')
    ### REMOVE SOME OF THE UN-NEEDED COLUMNS BEFORE PUSHING INTO DRUPAL
    final_data = filtered_data[[
        u'issue', u'action', u'id', u'es_score', u'total_score', u'tweet',
        u'tweet_timestamp', u'query_timestamp', u'tweet_user', u'tweet_cities',
        u'tweet_states', u'tweet_urls', u'tweet_phone_numbers',
        u'tweet_dates_ref', u'tweet_legislator_names', u'tweet_legislator_handles'
    ]]
    # final_data.to_csv('/Users/brosskatz/PycharmProjects/rzst/w210_imwithdata/imwithdata/data/static_data/final_data_example.csv')
    return final_data
def get_textblob_tags(sentence):
    blob = TextBlob(sentence, pos_tagger=PerceptronTagger())
    return blob.tags
from functools import partial

import nltk

from knx.text.postagger.base import map_paren, reverse_map_paren
from BS.knx.text.tokenizer import default_tokenizer as tokenizer

try:
    from textblob_aptagger import PerceptronTagger

    perceptron_tagger = PerceptronTagger()
    SYMBOLS = {'@', '#', '%', '^', '*', '+', '=', '~'}

    # Replace the original tag method to support tokenized text
    def _tag(self, corpus, tokenize=True):
        """Tags a string `corpus`."""
        # Assume untokenized corpus has \n between sentences and ' ' between words
        s_split = nltk.sent_tokenize if tokenize else lambda text: [text]
        w_split = tokenizer.tokenize if tokenize else lambda sent: sent

        def split_sents(corpus):
            for s in s_split(corpus):
                yield map(map_paren, w_split(s))

        prev, prev2 = self.START
        has_open_left_single_quote = False
        tokens = []
        for words in split_sents(corpus):
            context = self.START + [self._normalize(w)
from os import system, getcwd
from tweepy import OAuthHandler, API
from csv import reader
from codecs import iterdecode
import pickle
from string import punctuation

from tweet_tk.retweet_fetcher import retweet_cnt
from tweet_tk.bots import add_suspects, is_suspect
from tweet_tk.emoticons_parser import emoticons_score
from tweet_tk.tweet_sentiment import sentiment
from tweet_tk.tweets_to_df import tweet_json_to_df
from time import time, strftime, gmtime

tb = Blobber(pos_tagger=PerceptronTagger())

import bs4 as bs
import urllib.request
from pandas import DataFrame, Series


# Get ~100 most popular urls from wikipedia
def most_pop_urls_wiki():
    try:
        source = urllib.request.urlopen(
            'https://en.wikipedia.org/wiki/List_of_most_popular_websites').read()
        soup = bs.BeautifulSoup(source, 'lxml')
        table = soup.find(class_="wikitable sortable")
def make_recs(n, tweet_set):
    import fiona
    from textblob import TextBlob
    from textblob_aptagger import PerceptronTagger
    from textblob.sentiments import NaiveBayesAnalyzer
    records = []
    schema = {
        'geometry': 'Point',
        'properties': {
            'id': 'str',
            'tweet_id': 'str',
            'user_id': 'str',
            'created_at': 'str',
            'text': 'str',
            'pol_alc': 'str',
            'sent_alc': 'str',
            'pol_tb': 'str',
            'sub_tb': 'str',
            'sent_nltk': 'str',
            'sen_pos_n': 'str',
            'sen_neg_n': 'str',
            'pol_nltk': 'str',
            'sub_nltk': 'str',
            'category': 'str',
            'cat_conf': 'str',
            'cat_url': 'str'
        }
    }
    shp = fiona.open("D:\\data\\temp\\test" + str(n) + ".shp", 'w',
                     'ESRI Shapefile', schema, crs=from_epsg(4326))
    c = 0
    try:
        for tweet in tweet_set:
            if tweet.has_key('geo') and tweet['geo'].has_key('coordinates'):
                tweet['geo']['coordinates'] = [tweet['geo']['coordinates'][1],
                                               tweet['geo']['coordinates'][0]]
                timestamp = datetime.datetime.strptime(tweet['created_at'],
                                                       '%a %b %d %H:%M:%S +0000 %Y')
                time = timestamp.isoformat('T')
                try:
                    tb = TextBlob(tweet['text'].lower(), pos_tagger=PerceptronTagger())
                    nb = TextBlob(tweet['text'], analyzer=NaiveBayesAnalyzer())
                    pol_tb = str(tb.polarity)
                    sub_tb = str(tb.subjectivity)
                    pol_nb = str(nb.polarity)
                    sub_nb = str(nb.subjectivity)
                    sent_p_nb = str(nb.sentiment[1])
                    sent_n_nb = str(nb.sentiment[2])
                    sent_c_nb = str(nb.sentiment[0])
                except:
                    pol_tb = sub_tb = pol_nb = sub_nb = sent_p_nb = sent_n_nb = sent_c_nb = "NULL"
                sent = None
                pol = None
                sent_type = None
                cat = None
                catg = None
                cat_sc = None
                cat_url = None
                try:
                    sent = calc_sentiment(tweet['text'])
                    pol = sent['docSentiment']['score']
                    sent_type = sent['docSentiment']['type']
                    cat = api.category('text', tweet['text'])
                    catg = cat['category']
                    cat_sc = cat['score']
                    cat_url = cat['url']
                except:
                    pass
                if cat_url is None or len(cat_url) < 1:
                    cat_url = 'NULL'
                if pol is None or len(pol) < 1:
                    pol = 'NULL'
                if cat_sc is None or len(cat_sc) < 1:
                    cat_sc = 'NULL'
                if catg is None or len(catg) < 1:
                    catg = 'NULL'
                if sent_type is None or len(sent_type) < 1:
                    sent_type = 'NULL'
                rec = {
                    'geometry': tweet['geo'],
                    'properties': {
                        'tweet_id': str(tweet['id']),
                        'user_id': str(tweet['user']['id']),
                        'created_at': time,
                        'id': c,
                        'text': tweet['text'],
                        'pol_alc': pol,
                        'sent_alc': sent_type,
                        'category': catg,
                        'cat_conf': cat_sc,
                        'cat_url': cat_url,
                        'pol_tb': pol_tb,
                        'sub_tb': sub_tb,
                        'sent_nltk': sent_c_nb,
                        'sen_pos_n': sent_p_nb,
                        'sen_neg_n': sent_n_nb,
                        'pol_nltk': pol_nb,
                        'sub_nltk': sub_nb
                    }
                }
                print rec
                #print shp.validate_record(rec)
                #print shp.validate_record_geometry(rec)
                shp.write(rec)
                c += 1
    except:
        pass
    shp.close()
# TextRank, based on:
# http://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf

from itertools import tee, izip

from nltk import stem
from text.blob import TextBlob as tb
from textblob_aptagger import PerceptronTagger
import nltk.data
import numpy as np
import sys

TOKENIZER = nltk.data.load('tokenizers/punkt/english.pickle')
TAGGER = PerceptronTagger()
STEMMER = stem.porter.PorterStemmer()


def pos_tag(s):
    """high-performance part-of-speech tagger"""
    global TAGGER
    return TAGGER.tag(s)


def wrap_words(pair):
    """wrap each (word, tag) pair as an object with fully indexed metadata"""
    global STEMMER
    index = pair[0]
    result = []
    for word, tag in pair[1]:
        word = word.lower()
        stem = STEMMER.stem(word)
        if stem == "":