Example #1
File: views.py Project: aczapata/twitter
import sentlex
import sentlex.sentanalysis


def lexicon_tweet(tweet):
    # Score the tweet with the SentiWordNet 3.0 lexicon via sentlex's
    # basic document-level classifier.
    SWN = sentlex.SWN3Lexicon()
    classifier = sentlex.sentanalysis.BasicDocSentiScore()
    classifier.classify_document(tweet,
                                 tagged=False,  # input is raw, untagged text
                                 L=SWN,
                                 a=True,   # score adjectives
                                 v=True,   # score verbs
                                 n=True,   # score nouns
                                 r=False,  # skip adverbs (WordNet POS 'r')
                                 negation=True,
                                 verbose=False)
    results = classifier.resultdata
    results_pos = results['resultpos']
    results_neg = results['resultneg']

    # No lexicon hits at all means the tweet carries no sentiment signal.
    if results_pos == 0 and results_neg == 0:
        sentiment = 'irrelevant'
    elif abs(results_pos - results_neg) < 0.05:
        sentiment = 'neutral'
    elif results_pos > results_neg:
        sentiment = 'positive'
    else:
        sentiment = 'negative'
    return sentiment
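
A minimal call sketch for context (assuming sentlex and its bundled SentiWordNet data are installed; outputs are illustrative):

print(lexicon_tweet('I love this phone, best purchase ever'))  # e.g. 'positive'
print(lexicon_tweet('meeting at 10am today'))                  # likely 'irrelevant'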
Example #2
    def setUp(self):
        # Stack the UIC and SentiWordNet 3.0 lexicons into one composite
        # lexicon, then pre-compile frequency data for get_freq() lookups.
        self.L1 = sentlex.UICLexicon()
        self.L2 = sentlex.SWN3Lexicon()
        self.L = sentlex.CompositeLexicon()

        self.L.add_lexicon(self.L1)
        self.L.add_lexicon(self.L2)
        self.L.compile_frequency()
Example #3
import sentlex


def comp():
    # Same composite construction as Example #2, as a plain factory function.
    L1 = sentlex.UICLexicon()
    L2 = sentlex.SWN3Lexicon()
    L = sentlex.CompositeLexicon()

    L.add_lexicon(L1)
    L.add_lexicon(L2)
    L.compile_frequency()
    return L
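
A quick lookup sketch; as in Examples #6 and #7, sentlex lookups return (positive, negative) score pairs. That CompositeLexicon exposes the same getadjective() call as SWN3Lexicon is an assumption here:

lex = comp()
pos, neg = lex.getadjective('good')  # assumed CompositeLexicon lookup, (positive, negative)
print(pos, neg)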
Example #4
    def runTest(self):
        L = sentlex.SWN3Lexicon()
        L.compile_frequency()

        # Expected relative corpus frequencies for a few probe words.
        baseline = [('bad', 0.0005451764705882353),
                    ('good', 0.002610137254901961),
                    ('the', 0.029449176470588236),
                    ('want', 0.0027591764705882354)]

        for (w, f) in baseline:
            self.assertTrue(
                L.get_freq(w) == f,
                'Incorrect freq found for %s (%.8f <> %.8f)' %
                (w, f, L.get_freq(w)))
Example #5
from itertools import chain
import pickle
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
from loadTuples import load, load2, load3
from sklearn import svm
from evalt import *
from collections import Counter
import sentlex

test_sents = load3("test")
#print train_sents
#print "sent =" +str(len(train_sents))
SWN = sentlex.SWN3Lexicon()

f = open("PredictedTags.pkl", 'rb')
Eventpredicted = pickle.load(f)
f.close()

# Module-level word counter; `global wordCnt` is only needed inside
# functions that assign to it, not here at module scope.
wordCnt = -1


def word2features(sent, i):
    """Get the features corresponding to the word at position i in a sentence.

    Args:
        sent: the sentence whose word is to be considered
        i: the position of the word in the sentence
    """
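
The excerpt ends inside the docstring, so the project's actual feature code is not shown. Purely as a hypothetical illustration of how the module-level SWN lexicon could feed word-level features (the function name and feature keys below are invented):

def word2features_sketch(sent, i):
    # Hypothetical sketch only -- not the project's implementation.
    word = sent[i][0]  # assumes sent is a list of (word, tag) pairs
    pos, neg = SWN.getadjective(word.lower())  # (positive, negative) lexicon scores
    return {
        'word.lower': word.lower(),
        'word.isupper': word.isupper(),
        'swn.pos': pos,
        'swn.neg': neg,
    }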
Example #6
    def transform(self, documents):
        import enchant
        import numpy as np
        import sentlex
        from feature_extraction import tokenize_document

        d = enchant.Dict("en_US")
        swn = sentlex.SWN3Lexicon()
        tokenized_documents = [tokenize_document(document) for document in documents]
        n_words = []
        n_chars = []
        # number of uppercase words
        all_caps = []
        n_bad = []
        exclamation = []
        addressing = []

        # words recognized by the en_US spell-check dictionary
        n_dwords = [sum(1 for word in document if d.check(word)) for document in tokenized_documents]

        sent_pos = []
        sent_neg = []
        n_you_re = []
        for comment in documents:
            n_words.append(len(comment.split()))
            n_chars.append(len(comment))
            all_caps.append(np.sum([w.isupper() for w in comment.split()]))
            n_bad.append(comment.count('fakeinsult'))
            exclamation.append(comment.count("!"))
            addressing.append(comment.count("@"))
            # `nlp` is assumed to be a spaCy pipeline loaded at module level.
            doc = nlp(comment)
            count = 0.
            pos_sum = 0.
            neg_sum = 0.
            for token in doc:
                if token.text == 'fakeinsult':
                    pos_sum += 0.
                    neg_sum += 1.
                    count += 1.
                    continue
                # Penn Treebank tags live in token.tag_; token.pos_ holds
                # coarse universal tags ('ADV', 'NOUN', ...) and never
                # starts with 'RB'/'NN'/'JJ'/'VB'.
                if token.tag_.startswith('RB'):
                    sentiment = swn.getadverb(token.text)
                elif token.tag_.startswith('NN'):
                    sentiment = swn.getnoun(token.text)
                elif token.tag_.startswith('JJ'):
                    sentiment = swn.getadjective(token.text)
                elif token.tag_.startswith('VB'):
                    sentiment = swn.getverb(token.text)
                else:
                    continue
                pos_sum += sentiment[0]
                neg_sum += sentiment[1]
                count += 1.
            if count != 0:
                pos_sum /= count
                neg_sum /= count
            sent_neg.append(neg_sum)
            sent_pos.append(pos_sum)
            matches = self.__matcher(doc)
            n_you_re.append(len(matches))

        # np.float was removed from NumPy; plain float is the equivalent dtype.
        allcaps_ratio = np.array(all_caps) / np.array(n_words, dtype=float)
        bad_ratio = np.array(n_bad) / np.array(n_words, dtype=float)
        dic_ratio = np.array(n_dwords) / np.array(n_words, dtype=float)

        # Note: sent_neg is computed but, as in the original, left out of
        # the returned feature matrix.
        return np.array([n_words, n_chars, n_dwords, n_you_re, exclamation, all_caps,
                         addressing, bad_ratio, n_bad, allcaps_ratio, dic_ratio,
                         sent_pos]).T
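
Both this transformer and the one in the next example average the (positive, negative) SentiWordNet scores over the count of content-word tokens found, so longer comments do not accumulate arbitrarily large sentiment totals.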
Example #7
    def transform(self, documents):
        import enchant
        import re
        import sentlex
        import numpy as np
        from pattern.en import tag as tagger

        d = enchant.Dict("en_US")
        SWN = sentlex.SWN3Lexicon()
        from feature_extraction import tokenize_document
        tokenized_documents = [
            tokenize_document(document) for document in documents
        ]
        n_words = [len(c.split()) for c in documents]
        #n_words = [len(document) for document in tokenized_documents]
        n_chars = [len(c) for c in documents]
        # words recognized by the en_US spell-check dictionary
        n_dwords = [
            sum(1 for word in document if d.check(word))
            for document in tokenized_documents
        ]

        sent_pos = []
        sent_neg = []
        for comment in documents:
            count = 0.
            pos_sum = 0.
            neg_sum = 0.
            # pattern.en's tagger yields Penn Treebank tags, so the
            # prefix checks below are mutually exclusive.
            for word, tag in tagger(comment.lower()):
                if word == 'fakeinsult':
                    pos_sum += 0.
                    neg_sum += 1.
                    count += 1.
                    continue
                if tag.startswith('RB'):
                    sentiment = SWN.getadverb(word)
                elif tag.startswith('NN'):
                    sentiment = SWN.getnoun(word)
                elif tag.startswith('JJ'):
                    sentiment = SWN.getadjective(word)
                elif tag.startswith('VB'):
                    sentiment = SWN.getverb(word)
                else:
                    continue
                pos_sum += sentiment[0]
                neg_sum += sentiment[1]
                count += 1.
            if count != 0:
                pos_sum /= count
                neg_sum /= count
            sent_neg.append(neg_sum)
            sent_pos.append(pos_sum)

        n_you_re = [
            len(re.findall(self.__you_re, document)) for document in documents
        ]
        n_you = [
            len(re.findall(self.__you, document)) for document in documents
        ]

        # number of uppercase words
        allcaps = [
            np.sum([w.isupper() for w in comment.split()])
            for comment in documents
        ]
        # longest word
        #max_word_len = [np.max([len(w) for w in c.split()]) for c in documents]
        # average word length
        #mean_word_len = [np.mean([len(w) for w in c.split()])
        #                                    for c in documents]
        # number badwords:
        n_bad = [
            np.sum([c.lower().count(w) for w in self.__badwords])
            for c in documents
        ]
        exclamation = [c.count("!") for c in documents]
        addressing = [c.count("@") for c in documents]

        # np.float was removed from NumPy; plain float is the equivalent dtype.
        allcaps_ratio = np.array(allcaps) / np.array(n_words, dtype=float)
        bad_ratio = np.array(n_bad) / np.array(n_words, dtype=float)
        dic_ratio = np.array(n_dwords) / np.array(n_words, dtype=float)

        # As above, sent_neg is computed but not part of the returned matrix.
        return np.array([
            n_words, n_chars, n_dwords, n_you_re, n_you, exclamation, allcaps,
            addressing, bad_ratio, n_bad, allcaps_ratio, dic_ratio, sent_pos
        ]).T
Example #8
import sentlex


def swn3():
    # Build a SentiWordNet 3.0 lexicon with frequency data pre-compiled,
    # so get_freq() lookups (see Example #4) work.
    L = sentlex.SWN3Lexicon()
    L.compile_frequency()
    return L
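
A quick frequency lookup against this fixture, mirroring the baseline values asserted in Example #4:

lex = swn3()
print(lex.get_freq('good'))  # relative corpus frequency, roughly 0.0026 per Example #4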