def test_tag(self):
    trained_tagger = PerceptronTagger()
    tokens = trained_tagger.tag(self.text)
    assert_equal([w for w, t in tokens], [
        'Simple', 'is', 'better', 'than', 'complex', '.', 'Complex', 'is',
        'better', 'than', 'complicated', '.'
    ])
Example #2
def main(args):
    f = open(args.filename)
    D = {}
    tag_set = set([])
    tb = Blobber(pos_tagger=PerceptronTagger())
    for i, line in enumerate(f):
        b1 = tb(line)
        for w, t in b1.tags:
            tag_set.add(t)
            if w not in D:
                D[w] = Counter()
            D[w][t] = float(D[w][t] + 1)

    sorted_pos_tags = sorted(list(tag_set))
    rows = []
    for w in D.keys():
        row = [w]
        pos_counts_word = np.array([float(D[w][t]) for t in sorted_pos_tags])
        pos_dist_word = pos_counts_word / float(np.sum(pos_counts_word))
        assert (np.isclose(np.sum(pos_dist_word), 1.0))
        row = row + list(pos_dist_word)
        rows.append(row)

    header = ['word'] + sorted_pos_tags
    print("Set of POS tags in sorted order", header)
    df = pd.DataFrame().from_records(rows, columns=header)
    print("Dumping the POS distribution.")
    df.to_csv(args.outputfile, index=None, encoding='utf-8')
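
The main() function above only assumes an object with filename and outputfile attributes (plus the module-level imports the excerpt omits: Counter, numpy, pandas, Blobber, PerceptronTagger). A minimal, purely illustrative argparse wrapper could look like this:

import argparse

if __name__ == '__main__':
    # Hypothetical CLI wiring; only the attribute names `filename` and
    # `outputfile` are taken from main() above.
    parser = argparse.ArgumentParser(
        description="Dump per-word POS-tag distributions to CSV.")
    parser.add_argument('filename', help="input text file, one entry per line")
    parser.add_argument('outputfile', help="path of the CSV file to write")
    main(parser.parse_args())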
Example #3
def ExtendText(fileName, tagger=PerceptronTagger()):
    with io.open(fileName, 'r') as w:
        text = TextBlob(w.read(), pos_tagger=tagger)
        extended_text = []
        for sent in text.sentences:
            for word in sent.pos_tags:
                penn_tags = ['JJ', 'NN', 'V']
                extending = False
                for tag in penn_tags:
                    if tag in word[1]:
                        extending = True
                        pos = tag[0].lower()
                        try:
                            # lesk() returns None when no sense is found; the
                            # resulting AttributeError is handled below.
                            l = lesk(sent.string, word[0].lower(), pos)
                            syns = l.lemma_names()
                            for syn in syns:
                                extended_text.append(syn)
                            break
                        except Exception:
                            extended_text.append(word[0].lower())
                if not extending:
                    extended_text.append(word[0].lower())
        extended_text = ' '.join([
            word for word in extended_text if word not in cachedStopWords
        ]).lstrip()
        return extended_text
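
A minimal usage sketch for ExtendText, assuming the surrounding module provides lesk (e.g. from nltk.wsd) and cachedStopWords; the file name below is a placeholder.

# Hypothetical usage; "essay.txt" is a placeholder path.
expanded = ExtendText("essay.txt")
print(expanded[:200])  # words plus WordNet synonyms for JJ/NN/V tokens, stop words removed
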
class TestPerceptronTagger(unittest.TestCase):
    def setUp(self):
        self.text = ("Simple is better than complex. "
                     "Complex is better than complicated.")
        self.tagger = PerceptronTagger(load=False)

    def test_init(self):
        tagger = PerceptronTagger(load=False)
        assert_true(isinstance(tagger, BaseTagger))

    def test_train(self):
        sentences = _read_tagged(_wsj_train)
        nr_iter = 5
        self.tagger.train(sentences, nr_iter=nr_iter)
        nr_words = sum(len(words) for words, tags in sentences)
        # Check that the model has 'ticked over' once per instance
        assert_equal(nr_words * nr_iter, self.tagger.model.i)
        # Check that the tagger has a class for every seen tag
        tag_set = set()
        for _, tags in sentences:
            tag_set.update(tags)
        assert_equal(len(tag_set), len(self.tagger.model.classes))
        for tag in tag_set:
            assert_true(tag in self.tagger.model.classes)

    @attr("slow")
    def test_tag(self):
        trained_tagger = PerceptronTagger()
        tokens = trained_tagger.tag(self.text)
        assert_equal([w for w, t in tokens], [
            'Simple', 'is', 'better', 'than', 'complex', '.', 'Complex', 'is',
            'better', 'than', 'complicated', '.'
        ])

    @attr("slow")
    def test_tag_textblob(self):
        trained_tagger = PerceptronTagger()
        blob = TextBlob(self.text, pos_tagger=trained_tagger)
        # Punctuation is excluded
        assert_equal([w for w, t in blob.tags], [
            'Simple', 'is', 'better', 'than', 'complex', 'Complex', 'is',
            'better', 'than', 'complicated'
        ])

    def test_loading_missing_file_raises_missing_corpus_exception(self):
        tagger = PerceptronTagger(load=False)
        assert_raises(MissingCorpusError, tagger.load, 'missing.pickle')
Example #5
class TestPerceptronTagger(unittest.TestCase):

    def setUp(self):
        self.text = ("Simple is better than complex. "
                     "Complex is better than complicated.")
        self.tagger = PerceptronTagger(load=False)

    def test_init(self):
        tagger = PerceptronTagger(load=False)
        assert_true(isinstance(tagger, BaseTagger))

    def test_train(self):
        sentences = _read_tagged(_wsj_train)
        nr_iter = 5
        self.tagger.train(sentences, nr_iter=nr_iter)
        nr_words = sum(len(words) for words, tags in sentences)
        # Check that the model has 'ticked over' once per instance
        assert_equal(nr_words * nr_iter, self.tagger.model.i)
        # Check that the tagger has a class for every seen tag
        tag_set = set()
        for _, tags in sentences:
            tag_set.update(tags)
        assert_equal(len(tag_set), len(self.tagger.model.classes))
        for tag in tag_set:
            assert_true(tag in self.tagger.model.classes)

    @attr("slow")
    def test_tag(self):
        trained_tagger = PerceptronTagger()
        tokens = trained_tagger.tag(self.text)
        assert_equal([w for w, t in tokens],
            ['Simple', 'is', 'better', 'than', 'complex', '.', 'Complex', 'is',
             'better', 'than', 'complicated', '.'])

    @attr("slow")
    def test_tag_textblob(self):
        trained_tagger = PerceptronTagger()
        blob = TextBlob(self.text, pos_tagger=trained_tagger)
        # Punctuation is excluded
        assert_equal([w for w, t in blob.tags],
            ['Simple', 'is', 'better', 'than', 'complex', 'Complex', 'is',
             'better', 'than', 'complicated'])

    def test_loading_missing_file_raises_missing_corpus_exception(self):
        tagger = PerceptronTagger(load=False)
        assert_raises(MissingCorpusException, tagger.load, 'missing.pickle')
def test_tag_textblob(self):
    trained_tagger = PerceptronTagger()
    blob = TextBlob(self.text, pos_tagger=trained_tagger)
    # Punctuation is excluded
    assert_equal([w for w, t in blob.tags], [
        'Simple', 'is', 'better', 'than', 'complex', 'Complex', 'is',
        'better', 'than', 'complicated'
    ])
Example #7
def pronounSub(string):
    tokens = TextBlob(string, pos_tagger=PerceptronTagger())
    for word, speech in tokens.tags:
        sub = ""
        if speech == "PRP":
            sub = "I"
        if speech == "PRP$":
            sub = "My"
        # Only substitute actual pronouns, and pass re.I via flags= (the fourth
        # positional argument of re.sub is the count, not a flag).
        if sub:
            string = re.sub(re.escape(word), sub, string, flags=re.I)
    return string
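
A small usage sketch for pronounSub after the fixes above; the exact output depends on the tagger, but a PRP token such as "She" should be rewritten to "I" and a PRP$ token such as "her" to "My".

# Hypothetical example; the result depends on the tags the tagger assigns.
print(pronounSub("She likes her coffee."))  # e.g. "I likes My coffee."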
Example #8
def get_perceptron_tagger(self):
    """
    Perform preprocessing (shallow parsing) with the state-of-the-art PerceptronTagger (98.8% accuracy).
    Note that it strips punctuation to avoid ambiguity.
    http://stevenloria.com/tutorial-state-of-the-art-part-of-speech-tagging-in-textblob/
    """
    from textblob import Blobber
    from textblob_aptagger import PerceptronTagger
    tb = Blobber(pos_tagger=PerceptronTagger())
    return tb
    def __init__(self, winSize, lex = False):
        self.winSize = winSize
        self.lex = lex
        self.lexFilename = "Lex" if lex else ""
        self.winMod = WindowProb("C:/MissingWord/post"+self.lexFilename+"ModComp"+str(self.winSize)+".pickle", compressed = True)
        self.winOrig = WindowProb("C:/MissingWord/post"+self.lexFilename+"Comp"+str(self.winSize)+".pickle", compressed = True)

        with open("toLexicalize.pickle", "rb") as f:
            self.toLexicalize = pickle.load(f)

        self.aptagger = PerceptronTagger()
Example #10
def posDist(text):
    text = " ".join(text)
    tokens = Tokenize.byWord(text)
    POStags = [tag for word, tag in TaggingTools.tagPOS(text)]
    possibleTags = PerceptronTagger().model.classes
    vector = {}
    total = 0
    for tag in possibleTags:
        vector[tag] = 0
    for tag in POStags:
        vector[tag] += 1
        total += 1
    for tag in possibleTags:
        vector[tag] = int(100 * vector[tag] / total)
    return vector
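
A hedged usage sketch for posDist; Tokenize and TaggingTools are project modules assumed to be importable, and the exact percentages depend on the tagger's output.

# Hypothetical call; posDist expects a list of strings and returns integer
# percentages for every tag class known to the PerceptronTagger model.
dist = posDist(["Simple is better than complex."])
print(dist.get('JJ', 0), dist.get('VBZ', 0))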
Example #11
def cal_sentiment(tweet_set):
    for tweet in tweet_set:
        tb = TextBlob(tweet['text'].lower(), pos_tagger=PerceptronTagger())
        nb = TextBlob(tweet['text'], analyzer=NaiveBayesAnalyzer())
        pol_tb = str(tb.polarity)
        sub_tb = str(tb.subjectivity)
        pol_nb = str(nb.polarity)
        sub_nb = str(nb.subjectivity)
        sent_p_nb = str(nb.sentiment[1])
        sent_n_nb = str(nb.sentiment[2])
        sent_c_nb = str(nb.sentiment[0])
        sent = [
            pol_tb, sub_tb, pol_nb, sub_nb, sent_p_nb, sent_n_nb, sent_c_nb
        ]
        # NOTE: returning inside the loop means only the first tweet in
        # tweet_set is scored.
        return sent
Example #12
def relevance(file):
    hvs = GetHighVectorLogSums('high')

    percepticon = PerceptronTagger()
    cat_dict = defaultdict(int)
    files = fio.recGetTextFiles('training')
    file_sum = 0
    extended_text = ExtendText(file, percepticon)
    word_count = 0
    with open(file, 'r') as f:
        word_count = len(f.read().split())
    for term in extended_text.split():
        if term in hvs.keys():
            file_sum += hvs[term]

    file_sum = file_sum / len(extended_text.split())
    return file_sum
Example #13
def GetVectors():
    essay_path = 'training'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    percepticon = PerceptronTagger()
    cat_dict = defaultdict(int)
    for f in files:
        extended_text = ExtendText(f, percepticon)
        name = ''
        cats = ['high', 'medium', 'low']
        for cat in cats:
            if cat in f:
                name = cat + str(cat_dict[cat])
                cat_dict[cat] += 1
        docs.append(Document(extended_text, name=name, top=None))
    m = Model(docs)
    #lsa = m.reduce(5)
    return m
Example #14
    def __init__(self, winSize=5, lex=False, compFile=False):
        self.lex = lex
        self.lexFilename = "lex" if lex else ""
        comp = "Comp" if lex else "comp"
        if not compFile:
            comp = ""
        self.winSize = winSize
        self.winOrig = WindowProb("C:/MissingWord/" + self.lexFilename + comp +
                                  str(self.winSize) + ".pickle",
                                  compressed=compFile)

        with open("lexTags.pickle", "rb") as f:
            self.allLexTags = pickle.load(
                f
            )  #inefficient to use a lexicalized set, but it will still work for unlexicalized models

        with open("toLexicalize.pickle", "rb") as f:
            self.toLexicalize = pickle.load(f)

        self.tagger = PerceptronTagger()
Example #15
def main(args):
    lines = ioutils.load_word_list(args.filename)
    # f = open(args.filename)
    D = {}
    tag_set = set([])
    tb = Blobber(pos_tagger=PerceptronTagger())
    for i, line in enumerate(lines):
        b1 = tb(line)
        for w, t in b1.tags:
            tag_set.add(t)
            if w not in D:
                D[w] = Counter()
            D[w][t] = float(D[w][t] + 1)
    # print D['fawn'].most_common(1)[0]
    # print D['yellow'].most_common(1)[0]

    sorted_pos_tags = sorted(list(tag_set))
    rows = []
    most_common_rows = []
    for w in D.keys():
        row = [w]
        pos_counts_word = np.array([float(D[w][t]) for t in sorted_pos_tags])
        pos_dist_word = pos_counts_word / float(np.sum(pos_counts_word))
        assert (np.isclose(np.sum(pos_dist_word), 1.0))
        row = row + list(pos_dist_word)
        rows.append(row)
        most_common_rows.append([
            w,
            np.max(pos_counts_word),
            sorted_pos_tags[np.argmax(pos_counts_word)]
        ])

    header = ['word'] + sorted_pos_tags
    print "Set of POS tags in sorted order", header
    df = pd.DataFrame().from_records(rows, columns=header)
    print "Dumping the POS distribution."
    df.to_csv(args.outputfile + ".csv", index=None, encoding='utf-8')
    print "Dumping most common pos tag"
    df2 = pd.DataFrame().from_records(most_common_rows,
                                      columns=['word', 'count', 'POS'])
    df2.to_csv(args.outputfile + "_pos.csv", index=None, encoding='utf-8')
def __init__(self):
    # create custom components
    self.naive_bayes_analyzer = NaiveBayesAnalyzer()
    self.conll_extractor = ConllExtractor()
    self.nltk_tagger = NLTKTagger()
    self.perceptron_tagger = PerceptronTagger()
    if DEV_ENV:
        return
    # train all components (default and custom)
    text = 'TextBlob blobs great!'
    default_blob = TextBlob(text)
    default_blob.sentiment
    default_blob.noun_phrases
    default_blob.pos_tags
    custom_blob = TextBlob(text,
                           analyzer=self.naive_bayes_analyzer,
                           np_extractor=self.conll_extractor,
                           pos_tagger=self.nltk_tagger)
    custom_blob.sentiment
    custom_blob.noun_phrases
    custom_blob.pos_tags
    custom2_blob = TextBlob(text, pos_tagger=self.perceptron_tagger)
    custom2_blob.pos_tags
Example #17
def parse_doc(json_iter, pos_tagger=None, force_encode=False):
    """parse one document to prep for TextRank"""

    global DEBUG, POS_TAGGER

    # set up the PoS tagger, defaults to PerceptronTagger from TextBlob
    if not pos_tagger:
        if not POS_TAGGER:
            POS_TAGGER = PerceptronTagger()
        pos_tagger = POS_TAGGER

    for meta in json_iter:
        base_idx = 0

        for graf_text in filter_quotes(meta["text"], is_email=False):
            if DEBUG:
                print("graf_text:", graf_text)

            grafs, new_base_idx = parse_graf(meta["id"], graf_text, base_idx,
                                             pos_tagger, force_encode)
            base_idx = new_base_idx

            for graf in grafs:
                yield graf
Example #18
def __init__(self):
    self.lex = self.load_lexicon()
    self.blobber = Blobber(pos_tagger=PerceptronTagger())
Example #19
def tagPOS(text):    
    blob = TextBlob(text, pos_tagger=PerceptronTagger())
    return blob.tags     
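
Usage is straightforward; because TextBlob's .tags property is used, punctuation is excluded from the result.

# Returns a list of (word, POS-tag) pairs for the sentence.
print(tagPOS("Simple is better than complex."))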
Example #20

def GetHighVectorLogSums(label):
    m = GetVectors()
    high_tf_sums = defaultdict(float)
    for corpuscle in m._documents:
        if label in corpuscle._name:
            for tf in corpuscle.vector:
                high_tf_sums[tf] -= math.log(corpuscle.vector[tf])
    return high_tf_sums


if __name__ == '__main__':
    hvs = GetHighVectorLogSums('high')

    percepticon = PerceptronTagger()
    cat_dict = defaultdict(int)
    files = fio.recGetTextFiles(r'C:\Users\William\Desktop\421_Final\training')
    file_sums = []
    for file in files:
        file_sum = 0
        extended_text = ExtendText(file, percepticon)
        word_count = 0
        with open(file, 'r') as f:
            word_count = len(f.read().split())
        for term in extended_text.split():
            #learn below weights through experimentation
            if term in hvs.keys():
                file_sum += hvs[term]
        file_sums.append(file_sum / len(extended_text.split()))
    print('mean' + str(mean(file_sums)))
Example #21
import pandas as pd
from textblob import TextBlob
from textblob_aptagger import PerceptronTagger

NEED_POS = [
    'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG',
    'VBN'
]


def remove_extra_tags(tags_list):
    return_tags_list = []
    for t in tags_list:
        if t[1] in NEED_POS:
            return_tags_list.append(t)
    return return_tags_list


if __name__ == '__main__':
    # file_name = '../../testReviews.csv'
    file_name = '../../MProductReviewsLatest.csv'
    reviews = pd.read_csv(file_name)
    reviews['postagged_body'] = reviews['Body'].map(
        lambda x: TextBlob(x, pos_tagger=PerceptronTagger()).tags)
    reviews['postagged_body'] = reviews['postagged_body'].map(
        lambda x: remove_extra_tags(x))
    reviews.to_csv('../../MProductReviewsLatestPOStagged.csv', sep='\t')
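
The remove_extra_tags helper above keeps only tags listed in NEED_POS, which is easy to verify on hand-written pairs (tags not in the list, such as 'VBZ' and 'IN', are dropped).

# Standalone check of remove_extra_tags with hand-written (word, tag) pairs.
sample = [('Simple', 'JJ'), ('is', 'VBZ'), ('than', 'IN'), ('code', 'NN')]
print(remove_extra_tags(sample))  # [('Simple', 'JJ'), ('code', 'NN')]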
Example #22
poly = fiona.open('D:\data\open_toronto\NEIGHBORHOODS_WGS84.shp')
#
for rex in poly:
    p = geometry.shape(rex['geometry'])
    xy = zip(p.boundary.xy[1], p.boundary.xy[0])
    users = toronto_tweets.find({
        'geo.coordinates': {
            "$near": xy[4]
        }
    }).distinct('user.id')
    for user_id in users:
        for tweet in toronto_tweets.find({'user.id': user_id}):

            if not tweet.has_key('pol_tb'):
                tb = TextBlob(tweet['text'].lower(),
                              pos_tagger=PerceptronTagger())
                nb = TextBlob(tweet['text'], analyzer=NaiveBayesAnalyzer())
                pol_tb = tb.polarity
                sub_tb = tb.subjectivity
                pol_nb = nb.polarity
                sub_nb = nb.subjectivity
                sent_p_nb = nb.sentiment[1]
                sent_n_nb = nb.sentiment[2]
                sent_c_nb = nb.sentiment[0]
                tweet['pol_tb'] = pol_tb
                tweet['sub_tb'] = sub_tb
                tweet['sent_nltk'] = sent_c_nb
                tweet['sen_pos_n'] = sent_p_nb
                tweet['sen_neg_n'] = sent_n_nb
                tweet['pol_nltk'] = pol_nb
                tweet['sub_nltk'] = sub_nb
Example #23
import pickle
from collections import Counter
import generateTagWindows
from textblob import TextBlob
from textblob_aptagger import PerceptronTagger
import math
from WindowProb import WindowProb
import statistics
import numpy
import lexicalizedTagWindows

WIN_SIZE = 5
WIN_OFFSET = int((WIN_SIZE - 1) / 2)

aptagger = PerceptronTagger()


def removeWord(tokens):
    removed = []
    for i in range(1, len(tokens) - 1):
        removed.append([token for ind, token in enumerate(tokens) if ind != i])
    return removed
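
removeWord leaves the first and last tokens in place and drops each interior token in turn, which is easy to see on a short list.

# Each interior position is deleted once; the first and last tokens are kept.
print(removeWord(['a', 'b', 'c', 'd']))
# [['a', 'c', 'd'], ['a', 'b', 'd']]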


def modLikelihood(allLexTags, modProb4, windowProb5, tags, tagIndex):
    probs = []
    for alt in allLexTags:
        win4 = generateTagWindows.makeWindow(tags,
                                             begin=tagIndex - 2,
                                             end=tagIndex + 2)
Example #24
 def benchmaking(self):
     test = [[(u'Pierre', u'NNP'), (u'Vinken', u'NNP'), (u',', u','), (u'61', u'CD'),
         (u'years', u'NNS'), (u'old', u'JJ'), (u',', u','), (u'will', u'MD'),
         (u'join', u'VB'), (u'the', u'DT'), (u'board', u'NN'), (u'as', u'IN'),
         (u'a', u'DT'), (u'nonexecutive', u'JJ'), (u'director', u'NN'),
         (u'Nov.', u'NNP'), (u'29', u'CD'), (u'.', u'.')],
     [(u'Mr.', u'NNP'), (u'Vinken', u'NNP'), (u'is', u'VBZ'), (u'chairman', u'NN'),
         (u'of', u'IN'), (u'Elsevier', u'NNP'), (u'N.V.', u'NNP'), (u',', u','),
         (u'the', u'DT'), (u'Dutch', u'NNP'), (u'publishing', u'VBG'),
         (u'group', u'NN'), (u'.', u'.'), (u'Rudolph', u'NNP'), (u'Agnew', u'NNP'),
         (u',', u','), (u'55', u'CD'), (u'years', u'NNS'), (u'old', u'JJ'),
         (u'and', u'CC'), (u'former', u'JJ'), (u'chairman', u'NN'), (u'of', u'IN'),
         (u'Consolidated', u'NNP'), (u'Gold', u'NNP'), (u'Fields', u'NNP'),
         (u'PLC', u'NNP'), (u',', u','), (u'was', u'VBD'), (u'named', u'VBN'),
         (u'a', u'DT'), (u'nonexecutive', u'JJ'), (u'director', u'NN'), (u'of', u'IN'),
         (u'this', u'DT'), (u'British', u'JJ'), (u'industrial', u'JJ'),
         (u'conglomerate', u'NN'), (u'.', u'.')],
     [(u'A', u'DT'), (u'form', u'NN'),
         (u'of', u'IN'), (u'asbestos', u'NN'), (u'once', u'RB'), (u'used', u'VBN'),
         (u'to', u'TO'), (u'make', u'VB'), (u'Kent', u'NNP'), (u'cigarette', u'NN'),
         (u'filters', u'NNS'), (u'has', u'VBZ'), (u'caused', u'VBN'), (u'a', u'DT'),
         (u'high', u'JJ'), (u'percentage', u'NN'), (u'of', u'IN'),
         (u'cancer', u'NN'), (u'deaths', u'NNS'),
         (u'among', u'IN'), (u'a', u'DT'), (u'group', u'NN'), (u'of', u'IN'),
         (u'workers', u'NNS'), (u'exposed', u'VBN'), (u'to', u'TO'), (u'it', u'PRP'),
         (u'more', u'RBR'), (u'than', u'IN'), (u'30', u'CD'), (u'years', u'NNS'),
         (u'ago', u'IN'), (u',', u','), (u'researchers', u'NNS'),
         (u'reported', u'VBD'), (u'.', u'.')]]
     """
         [(u'A', u'DT'), (u'forge', u'NN'),
         (u'is', u'VBZ'), (u'a', u'DT'), (u'type', u'NN'), (u'of', u'IN'),
         (u'hearth', u'JJ'), (u'used', u'VBN'), (u'for', u'IN'), (u'heating', u'NN'),
         (u'metals', u'NNS'), (u'.', u'.'), (u'or', u'CC'), (u'the', u'DT'),
         (u'workplace', u'NN'), (u'(', u'('), (u'smithy', u'JJ'),
         (u')', u')'), (u'where', u'WRB'),
         (u'such', u'JJ'), (u'a', u'DT'), (u'hearth', u'JJ'), (u'is', u'VBZ'),
         (u'located', u'VBN'), (u'.', u'.')]"""
     from textblob_aptagger import PerceptronTagger
     import nltk
     
     print("perceptron tagger accuracy based on conll2000: ",self.pos_accuracy([nltk.corpus.conll2000.tagged_words()[:30], 
                                                               nltk.corpus.conll2000.tagged_words()[30:60],
                                                               nltk.corpus.conll2000.tagged_words()[60:90],
                                                               nltk.corpus.conll2000.tagged_words()[90:120],
                                                               nltk.corpus.conll2000.tagged_words()[120:150],
                                                               nltk.corpus.conll2000.tagged_words()[300:330]], PerceptronTagger()))
     print("NLTK pos tagger accuracy based on conll2000: ", self.pos_accuracy([nltk.corpus.conll2000.tagged_words()[:30], 
                                                               nltk.corpus.conll2000.tagged_words()[30:60],
                                                               nltk.corpus.conll2000.tagged_words()[60:90],
                                                               nltk.corpus.conll2000.tagged_words()[90:120],
                                                               nltk.corpus.conll2000.tagged_words()[120:150],
                                                               nltk.corpus.conll2000.tagged_words()[300:330]], self.nltk_pos_tag()))
                    
     '''print("NLTK pos tagger accuracy based on brown corpus: ", self.pos_accuracy([nltk.corpus.brown.tagged_words()[:30], 
                                                               nltk.corpus.brown.tagged_words()[30:60],
                                                               nltk.corpus.brown.tagged_words()[60:90],
                                                               nltk.corpus.brown.tagged_words()[90:120],
                                                               nltk.corpus.brown.tagged_words()[120:150],
                                                               nltk.corpus.brown.tagged_words()[300:330]], self.nltk_pos_tag()))'''
     
     print("perceptron tagger accuracy based on test data: ",self.pos_accuracy(test, PerceptronTagger()))
     print("NLTK pos tagger accuracy based on test data: ", self.pos_accuracy(test, self.nltk_pos_tag()))
     '''print("NLTK pos tagger accuracy based on brown corpus: ", self.pos_accuracy([nltk.corpus.brown.tagged_words()[:30], 
Example #25
## buildtree.py
## Author: Yangfeng Ji
## Date: 09-10-2014
## Time-stamp: <yangfeng 09/29/2014 15:15:23>

from datastructure import *
from textblob import TextBlob
from textblob_aptagger import PerceptronTagger
from util import extractrelation
from maltparser import get_head_words

perceptron_tagger = PerceptronTagger()

def BFT(tree):
    """ Breadth-first treavsal on general RST tree

    :type tree: SpanNode instance
    :param tree: an general RST tree
    """
    queue = [tree]
    bft_nodelist = []
    while queue:
        node = queue.pop(0)
        bft_nodelist.append(node)
        queue += node.nodelist
    return bft_nodelist
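
A small sketch of BFT on a hypothetical stand-in node type; the real trees use SpanNode from the datastructure module imported above, but any object with a nodelist attribute works.

# Hypothetical stand-in for SpanNode, only to illustrate the traversal order.
class _Node(object):
    def __init__(self, name, children=None):
        self.name = name
        self.nodelist = children or []

root = _Node("root", [_Node("left"), _Node("right", [_Node("leaf")])])
print([n.name for n in BFT(root)])  # ['root', 'left', 'right', 'leaf']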


def BFTbin(tree):
    """ Breadth-first treavsal on binary RST tree
Example #26
def process_twitter(actionability_ranking: pd.DataFrame):
    """Filter actionable items.

    :param data:
    :return:
    """
    ### LOAD POS TAGGER TO HELP WITH ACTIONABILITY SCORING
    tb = Blobber(pos_tagger=PerceptronTagger())

    ### CREATE EMPTY LIST FOR SCORES FROM TEXT ANALYSIS
    language_scores = []

    ### WE DON'T WANT PAST-TENSE VERBS, AND ADVERBS INDICATE NEWS
    bad_verbs = ['VBZ', 'VBN', 'VBD', 'RB']
    good_verbs = ['VB', 'VBG', 'VBP', 'JJ']

    # Part of speech tag each tweet
    for tweet in actionability_ranking['tweet'].tolist():
        tagged = tb(tweet.lower())
        tag_list = [x[1] for x in tagged.tags]
        score = 0

        # Penalize tweets with structures that are known to be not what we're looking for
        # POS Tags available here: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
        if tag_list[:2] == [
                'NNP',
                'POS',
        ]:
            score -= 4
        if tag_list[:3] == ['NN', 'POS', 'NN']:
            score -= 10
        if tag_list[:3] == ['NN', 'NN', 'TO']:
            score -= 10
        if tag_list[:3] == ['NN', 'TO', 'VB']:
            score -= 10
        if tag_list[:3] == ['NN', 'NN', 'VBZ']:
            score -= 10
        if tag_list[:3] == ['NN', 'NN', 'JJ']:
            score -= 10
        if tag_list[:3] == ['NN', 'JJ', 'NN']:
            score -= 10

        if tag_list[:3] == ['JJ', 'NNS', 'VBP']:
            score -= 4
        elif tag_list[:3] == ['JJ', 'NN', 'VBG']:
            score -= 4
        elif tag_list[:3] == ['JJS', 'NN', 'JJ']:
            score -= 4
        elif tag_list[:3] == ['NN', 'NN', 'NN']:
            score -= 4
        elif tag_list[:3] == ['JJ', 'NN', 'NN']:
            score -= 4
        elif tag_list[:3] == ['JJ', 'NN', 'NNS']:
            score -= 4
        elif tag_list[:3] == ['JJ', 'NN', 'VB']:
            score -= 10
        elif tag_list[:3] == ['IN', 'JJ', 'NN']:
            score -= 10
        elif tag_list[0] == 'JJ':
            score += 3

        if tag_list[0] == 'VB':
            score += 4

        # Penalize tweets with words in them that indicate they are not useful
        if 'yesterday' in tweet.lower():
            score -= 10
        if 'last week' in tweet.lower():
            score -= 10
        if 'video' in tweet.lower():
            score -= 10
        if 'news search' in tweet.lower():
            score -= 10
        if 'town hall in facebook' in tweet.lower():
            score -= 20
        if 'townhall in facebook' in tweet.lower():
            score -= 20
        if "facebook's" in tweet.lower() and "town hall" in tweet.lower():
            score -= 20
        if 'cnn town hall' in tweet.lower():
            score -= 20
        if 'online  paper' in tweet.lower():
            score -= 20
        if 'join us' in tweet.lower():
            score -= 10
        if 'dreamhome' in tweet.lower():
            score -= 20
        if 'sales' in tweet.lower():
            score -= 20
        if 'relaxing' in tweet.lower():
            score -= 20
        if 'ice ice baby' in tweet.lower():
            score -= 20
        if 'interior design' in tweet.lower():
            score -= 20
        if 'canada' in tweet.lower():
            score -= 20
        if 'toyota' in tweet.lower():
            score -= 20
        if 'minister' in tweet.lower():
            score -= 20
        if 'uk' in tweet.lower():
            score -= 20
        if 'MP' in tweet:
            score -= 20
        if 'eu' in tweet.lower():
            score -= 20
        if 'england' in tweet.lower():
            score -= 20
        if 'furniture' in tweet.lower():
            score -= 20
        if 'kitchen' in tweet.lower():
            score -= 20
        if 'germany' in tweet.lower():
            score -= 20
        if 'south africa' in tweet.lower():
            score -= 20
        if 'News' in tweet:
            score -= 10
        if 'daily beast' in tweet.lower():
            score -= 20
        if '#design' in tweet.lower():
            score -= 20
        if '#interior' in tweet.lower():
            score -= 20
        if 'radicalisation' in tweet.lower():
            score -= 20
        if 'militancy' in tweet.lower():
            score -= 20
        if 'sharia' in tweet.lower():
            score -= 20
        if 'uncontrolled' in tweet.lower():
            score -= 20
        if 'enjoy a' in tweet.lower():
            score -= 20
        if tweet.lower().startswith('the latest'):
            score -= 20
        if 'wapo' in tweet.lower():
            score -= 20
        if 'nytimes' in tweet.lower():
            score -= 20
        if '5 things' in tweet.lower():
            score -= 20
        if 'poland' in tweet.lower():
            score -= 20
        if 'hungary' in tweet.lower():
            score -= 20
        if 'slovakia' in tweet.lower():
            score -= 20
        if 'czech' in tweet.lower():
            score -= 20
        if 'egypt' in tweet.lower():
            score -= 20
        if 'austria' in tweet.lower():
            score -= 20
        if 'germany' in tweet.lower():
            score -= 20
        if 'hiring' in tweet.lower():
            score -= 20
        if 'ice show' in tweet.lower():
            score -= 20
        if 'ice cream' in tweet.lower():
            score -= 20
        if 'secure border' in tweet.lower():
            score -= 20
        if 'hire thousands' in tweet.lower():
            score -= 20
        if 'demand congress hire' in tweet.lower():
            score -= 20
        if 'illegals' in tweet.lower():
            score -= 20
        if 'icecream' in tweet.lower():
            score -= 20
        if 'avalanche' in tweet.lower():
            score -= 20
        if 'ice cold' in tweet.lower():
            score -= 20
        if 'snow' in tweet.lower():
            score -= 20
        if 'alex jones' in tweet.lower():
            score -= 1000
        if 'viral' in tweet.lower():
            score -= 20
        if 'scotland' in tweet.lower():
            score -= 20
        if 'brexit' in tweet.lower():
            score -= 20
        if 'caretoclick' in tweet.lower():
            score -= 10
        if 'ministry' in tweet.lower():
            score -= 20
        if 'ugh' in tweet.lower():
            score -= 30
        if 'london' in tweet.lower():
            score -= 30
        if 'wales' in tweet.lower():
            score -= 30
        if '@youtube' in tweet.lower():
            score -= 20
        if 'breitbart' in tweet.lower():
            score -= 20
        if 'hypocrite' in tweet.lower():
            score -= 20
        if 'moron' in tweet.lower():
            score -= 20
        if '!!!' in tweet.lower():
            score -= 10
        if 'Take Action: Sign Petition' in tweet:
            score -= 50
        if 'petition' in tweet.lower():
            score -= 15
        if 'signandshareorg' in tweet.lower():
            score -= 10

        if re.findall(profanity_regex, tweet.lower()):
            score -= 50

        # Penalize tweets with tons of hashtags
        if tweet.count('#') > 2:
            score -= (tweet.count('#') - 2) * 5

        # Penalize tweets with tons of mentions
        if tweet.count('@') > 2:
            score -= (tweet.count('@') - 2) * 5

        # Reward tweets with good verbs and no bad verbs
        # PENALIZE TWEETS WITH BAD VERBS AND FEW GOOD VERBS
        verb_score = 0
        for tag in tag_list:
            if tag in bad_verbs:
                verb_score -= 2
            if tag in good_verbs:
                verb_score += 1
        score += int(verb_score * 1.0 / len(tag_list)) * 4

        # Reward polite tweets that encourage action
        if tweet.lower().startswith('please'):
            score += 10

        # Other score adjustments
        # Reward tweets that are longer
        score += int(len(tweet) / 60) * 4

        language_scores.append(score)

    ### ADD LANGUAGE SCORE TO PANDAS DF
    actionability_ranking['pos_score'] = np.asarray(language_scores)

    ### ADD ACTIONABILITY SCORE TO PANDAS DF BASED ON FIELDS WE EXTRACTED
    actionability_ranking['actionability_score'] = (
        np.where(actionability_ranking['tweet_cities'] == '', 0, 15) +
        np.where(actionability_ranking['tweet_states'] == '', 0, 15) +
        np.where(actionability_ranking['tweet_urls'] == '', 0, 5) +
        np.where(actionability_ranking['tweet_phone_numbers'] == '', 0, 20) +
        np.where(actionability_ranking['tweet_dates_ref'] == '', 0, 10) +
        np.where(actionability_ranking['tweet_legislator_names'] == '', 0, 15)
        + np.where(actionability_ranking['tweet_legislator_handles'] == '', 0,
                   15) +
        np.where(actionability_ranking['tweet'].str.startswith('@'), -10, 0) +
        np.where(actionability_ranking['tweet'].str.startswith('.@'), -10, 0))

    ### CALCULATE THE TOTAL SCORE
    actionability_ranking['total_score'] = (
        actionability_ranking['es_score'] +
        actionability_ranking['actionability_score'] +
        actionability_ranking['pos_score'])

    ### FILTER THE DF BY TOTAL SCORE AND ELASTIC SEARCH RELEVANCE
    filtered_data = actionability_ranking.loc[
        (actionability_ranking['total_score'] > 8.5)
        & (actionability_ranking['es_score'] > 7.0)]
    filtered_tweet_list = filtered_data['tweet'].tolist()
    filtered_score_list = filtered_data['total_score'].tolist()
    filtered_es_score_list = filtered_data['es_score'].tolist()

    ### DE-DUPLICATE USING EDIT-DISTANCE < 60 EDITS AS A FILTER
    ### THIS PART TAKES A WHILE: TWEETS^2 COMPARISONS. THAT'S WHY WE FILTER FIRST.
    distance_dict = {}

    for i, tweet in enumerate(filtered_tweet_list):
        for j, tweet2 in enumerate(filtered_tweet_list):
            tweet_ids = tuple(sorted([i, j]))
            if i == j:
                pass
            else:
                distance = editdistance.eval(tweet, tweet2)
                if distance <= 60:
                    distance_dict[tweet_ids] = distance

    ### FOR DUPLICATES, WE'LL TAKE THE MORE ACTIONABLE/RELEVANT OF THE TWO
    delete_indices = []

    for (i, j), v in distance_dict.items():
        if filtered_score_list[i] >= filtered_score_list[j]:
            delete_indices.append(j)
        else:
            delete_indices.append(i)

    delete_indices = list(set(delete_indices))

    ### WE DELETE ROWS WITH THE INDICES THAT WERE FOUND TO BE DUPLICATED AND LESS ACTIONABLE
    filtered_data.drop(filtered_data.index[delete_indices],
                       inplace=True,
                       errors='ignore')

    ### REMOVE SOME OF THE UN-NEEDED COLUMNS BEFORE PUSHING INTO DRUPAL
    final_data = filtered_data[[
        u'issue', u'action', u'id', u'es_score', u'total_score', u'tweet',
        u'tweet_timestamp', u'query_timestamp', u'tweet_user', u'tweet_cities',
        u'tweet_states', u'tweet_urls', u'tweet_phone_numbers',
        u'tweet_dates_ref', u'tweet_legislator_names',
        u'tweet_legislator_handles'
    ]]

    #final_data.to_csv('/Users/brosskatz/PycharmProjects/rzst/w210_imwithdata/imwithdata/data/static_data/final_data_example.csv')

    return final_data
Example #27
def get_textblob_tags(sentence):
    blob = TextBlob(sentence, pos_tagger=PerceptronTagger())
    return blob.tags
from functools import partial

import nltk
from knx.text.postagger.base import map_paren, reverse_map_paren

from BS.knx.text.tokenizer import default_tokenizer as tokenizer

try:
    from textblob_aptagger import PerceptronTagger
    perceptron_tagger = PerceptronTagger()

    SYMBOLS = {'@', '#', '%', '^', '*', '+', '=', '~'}

    # Replace the original tag method to support tokenized text


    def _tag(self, corpus, tokenize=True):
        """Tags a string `corpus`."""
        # Assume untokenized corpus has \n between sentences and ' ' between words
        s_split = nltk.sent_tokenize if tokenize else lambda text: [text]
        w_split = tokenizer.tokenize if tokenize else lambda sent: sent

        def split_sents(corpus):
            for s in s_split(corpus):
                yield map(map_paren, w_split(s))

        prev, prev2 = self.START
        has_open_left_single_quote = False
        tokens = []
        for words in split_sents(corpus):
            context = self.START + [self._normalize(w)
Example #29
def test_tag(self):
    trained_tagger = PerceptronTagger()
    tokens = trained_tagger.tag(self.text)
    assert_equal([w for w, t in tokens],
        ['Simple', 'is', 'better', 'than', 'complex', '.', 'Complex', 'is',
         'better', 'than', 'complicated', '.'])
Example #30
from os import system, getcwd
from tweepy import OAuthHandler, API
from csv import reader
from codecs import iterdecode
import pickle
from string import punctuation

from tweet_tk.retweet_fetcher import retweet_cnt
from tweet_tk.bots import add_suspects, is_suspect
from tweet_tk.emoticons_parser import emoticons_score
from tweet_tk.tweet_sentiment import sentiment
from tweet_tk.tweets_to_df import tweet_json_to_df

from time import time, strftime, gmtime

from textblob import Blobber
from textblob_aptagger import PerceptronTagger

tb = Blobber(pos_tagger=PerceptronTagger())

import bs4 as bs
import urllib.request
from pandas import DataFrame, Series


# Get ~100 most popular urls from wikipedia
def most_pop_urls_wiki():
    try:
        source = urllib.request.urlopen(
            'https://en.wikipedia.org/wiki/List_of_most_popular_websites'
        ).read()
        soup = bs.BeautifulSoup(source, 'lxml')

        table = soup.find(class_="wikitable sortable")
Example #31
def setUp(self):
    self.text = ("Simple is better than complex. "
                 "Complex is better than complicated.")
    self.tagger = PerceptronTagger(load=False)
Example #32
def make_recs(n, tweet_set):
    import fiona
    from textblob import TextBlob
    from textblob_aptagger import PerceptronTagger
    from textblob.sentiments import NaiveBayesAnalyzer
    records = []
    schema = {
        'geometry': 'Point',
        'properties': {
            'id': 'str',
            'tweet_id': 'str',
            'user_id': 'str',
            'created_at': 'str',
            'text': 'str',
            'pol_alc': 'str',
            'sent_alc': 'str',
            'pol_tb': 'str',
            'sub_tb': 'str',
            'sent_nltk': 'str',
            'sen_pos_n': 'str',
            'sen_neg_n': 'str',
            'pol_nltk': 'str',
            'sub_nltk': 'str',
            'category': 'str',
            'cat_conf': 'str',
            'cat_url': 'str'
        }
    }
    shp = fiona.open("D:\\data\\temp\\test" + str(n) + ".shp",
                     'w',
                     'ESRI Shapefile',
                     schema,
                     crs=from_epsg(4326))
    c = 0
    try:
        for tweet in tweet_set:
            if tweet.has_key('geo') and tweet['geo'].has_key('coordinates'):
                tweet['geo']['coordinates'] = [
                    tweet['geo']['coordinates'][1],
                    tweet['geo']['coordinates'][0]
                ]
                timestamp = datetime.datetime.strptime(
                    tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                time = timestamp.isoformat('T')
                try:
                    tb = TextBlob(tweet['text'].lower(),
                                  pos_tagger=PerceptronTagger())
                    nb = TextBlob(tweet['text'], analyzer=NaiveBayesAnalyzer())
                    pol_tb = str(tb.polarity)
                    sub_tb = str(tb.subjectivity)
                    pol_nb = str(nb.polarity)
                    sub_nb = str(nb.subjectivity)
                    sent_p_nb = str(nb.sentiment[1])
                    sent_n_nb = str(nb.sentiment[2])
                    sent_c_nb = str(nb.sentiment[0])
                except:
                    pol_tb = sub_tb = pol_nb = sub_nb = sent_p_nb = sent_n_nb = sent_c_nb = "NULL"
                sent = None
                pol = None
                sent_type = None
                cat = None
                catg = None
                cat_sc = None
                cat_url = None
                try:
                    sent = calc_sentiment(tweet['text'])
                    pol = sent['docSentiment']['score']
                    sent_type = sent['docSentiment']['type']
                    cat = api.category('text', tweet['text'])
                    catg = cat['category']
                    cat_sc = cat['score']
                    cat_url = cat['url']

                except:
                    pass

                if cat_url is None or len(cat_url) < 1:
                    cat_url = 'NULL'
                if pol is None or len(pol) < 1:
                    pol = 'NULL'
                if cat_sc is None or len(cat_sc) < 1:
                    cat_sc = 'NULL'
                if catg is None or len(catg) < 1:
                    catg = 'NULL'
                if sent_type is None or len(sent_type) < 1:
                    sent_type = 'NULL'
                rec = {
                    'geometry': tweet['geo'],
                    'properties': {
                        'tweet_id': str(tweet['id']),
                        'user_id': str(tweet['user']['id']),
                        'created_at': time,
                        'id': c,
                        'text': tweet['text'],
                        'pol_alc': pol,
                        'sent_alc': sent_type,
                        'category': catg,
                        'cat_conf': cat_sc,
                        'cat_url': cat_url,
                        'pol_tb': pol_tb,
                        'sub_tb': sub_tb,
                        'sent_nltk': sent_c_nb,
                        'sen_pos_n': sent_p_nb,
                        'sen_neg_n': sent_n_nb,
                        'pol_nltk': pol_nb,
                        'sub_nltk': sub_nb
                    }
                }
                print rec
                #print shp.validate_record(rec)
                #print shp.validate_record_geometry(rec)
                shp.write(rec)

                c += 1
    except:
        pass
    shp.close()
Example #33
# TextRank, based on:
# http://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf

from itertools import tee, izip
from nltk import stem
from text.blob import TextBlob as tb
from textblob_aptagger import PerceptronTagger
import nltk.data
import numpy as np
import sys

TOKENIZER = nltk.data.load('tokenizers/punkt/english.pickle')
TAGGER = PerceptronTagger()
STEMMER = stem.porter.PorterStemmer()


def pos_tag(s):
    """high-performance part-of-speech tagger"""
    global TAGGER
    return TAGGER.tag(s)


def wrap_words(pair):
    """wrap each (word, tag) pair as an object with fully indexed metadata"""
    global STEMMER
    index = pair[0]
    result = []
    for word, tag in pair[1]:
        word = word.lower()
        stem = STEMMER.stem(word)
        if stem == "":