Example #1
def demo_liu_hu_lexicon(sentence):
    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [
        word.lower() for word in tokenizer.tokenize(sentence)
    ]

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1)  # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        return 'Positive'
    elif pos_words < neg_words:
        return 'Negative'
    elif pos_words == neg_words:
        return 'Neutral'
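
A minimal usage sketch for the function above, assuming the NLTK data has been downloaded and that 'superb' and 'dull' appear in the positive and negative Liu-Hu lexicons respectively:

    import nltk
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    nltk.download('opinion_lexicon')  # one-time download of the Liu-Hu lexicon

    print(demo_liu_hu_lexicon('The acting was superb'))           # 'Positive'
    print(demo_liu_hu_lexicon('The acting was superb but dull'))  # 'Neutral': one hit each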
Example #2
    def predictOpinionAbandoned(self, text):
        """
        Function that predicts whether the given text has a (positive, negative, neutral) opinion
        on the targets.
        :param text: Text that possibly has an opinion on the given targets
        :param targets: The targets on which the opinion has been expressed
        :return: 1: Positive. 0: Neutral. -1: Negative
        """
        tokenizer = treebank.TreebankWordTokenizer()
        pos_words = 1
        neg_words = 1
        tokenized_sent = [word.lower() for word in tokenizer.tokenize(text)]

        y = []

        for word in tokenized_sent:
            if word in self.pos_lexicon:
                pos_words += 1
                y.append(1)  # positive
            elif word in self.neg_lexicon:
                neg_words += 1
                y.append(-1)  # negative
            else:
                y.append(0)  # neutral

        if pos_words / neg_words > self.RATIO:
            print("Support.")
            return 1
        elif neg_words / pos_words > self.RATIO:
            print("Oppose.")
            return -1
        else:
            print("Neutral")
            return 0
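
Note that pos_words and neg_words start at 1 rather than 0, so the ratio tests can never divide by zero on text with no lexicon hits. A minimal sketch of the same thresholding logic in isolation, with a hypothetical RATIO of 2.0:

    def classify_by_ratio(pos_words, neg_words, ratio=2.0):
        # counts are pre-smoothed (both start at 1), so division is always safe
        if pos_words / neg_words > ratio:
            return 1   # support
        elif neg_words / pos_words > ratio:
            return -1  # oppose
        return 0       # neutral

    assert classify_by_ratio(1, 1) == 0  # lexicon-free text stays neutral
    assert classify_by_ratio(5, 1) == 1  # 5 / 1 > 2.0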
Example #3
def dlll_pos_neg_ratio(text):
    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(text)]

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1)  # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    print(pos_words, neg_words)
    ratio = sum(y) / len(y)
    if pos_words > neg_words:
        return ("Positive", ratio)
    elif pos_words < neg_words:
        return ("Negative", ratio)
    elif pos_words == neg_words:
        return ("Neutral", ratio)
Example #4
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1)  # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    # if plot == True:
    #     _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive'])

    return pos_words, neg_words
Example #5
def get_nltk_sentiment(sentence, method):
        
    if method == 'vader':
        sa = sentiment.vader.SentimentIntensityAnalyzer()
        output = sa.polarity_scores(str(sentence))

        return output['compound']
    
    elif method == 'liu':
        
        wordType = ''
        
        if "PERSON" in str(ne_chunk(pos_tag(word_tokenize(sentence)))):
            wordType = 'tag'
        
        tokenizer = treebank.TreebankWordTokenizer()
        pos_words = 0
        neg_words = 0
        tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]
        
        for word in tokenized_sent:
            if word in opinion_lexicon.positive():
                pos_words += 1
            elif word in opinion_lexicon.negative():
                neg_words += 1
                
        if pos_words > neg_words:
            return 'Positive'
        elif pos_words < neg_words:
            return 'Negative'
        elif pos_words == neg_words:
            if wordType == 'tag':
                return 'Positive'
            else:
                return 'Neutral'
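
This variant calls word_tokenize, pos_tag, ne_chunk and the VADER analyzer, so it needs several NLTK data packages beyond the opinion lexicon. A setup sketch:

    import nltk

    for pkg in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker',
                'words', 'opinion_lexicon', 'vader_lexicon'):
        nltk.download(pkg)  # data used by the tokenizer, tagger, NE chunker and both lexicons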
Example #6
def negopinion(sentence):
    tokenizer = treebank.TreebankWordTokenizer()
    neg1 = 0
    tokenized = [word.lower() for word in tokenizer.tokenize(sentence)]
    for word in tokenized:
        if word in opinion_lexicon.negative():
            neg1 += 1
    return neg1
Example #7
def posopinion(sentence):
    tokenizer = treebank.TreebankWordTokenizer()
    pos1 = 0
    tokenized = [word.lower() for word in tokenizer.tokenize(sentence)]
    for word in tokenized:
        if word in opinion_lexicon.positive():
            pos1 += 1
    return pos1
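
In Examples #6 and #7, opinion_lexicon.positive() and opinion_lexicon.negative() return list-like corpus views, so each membership test is a linear scan repeated for every token. Caching the lexicons as sets once (as Example #28 does) makes each lookup O(1); a minimal sketch:

    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    POS = set(opinion_lexicon.positive())  # built once, reused for every call
    NEG = set(opinion_lexicon.negative())
    _tokenizer = treebank.TreebankWordTokenizer()

    def opinion_counts(sentence):
        words = [w.lower() for w in _tokenizer.tokenize(sentence)]
        return (sum(w in POS for w in words),  # positive hits
                sum(w in NEG for w in words))  # negative hits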
Example #8
    def run(self):
        self.output().makedirs()
        self.tokenzier = treebank.TreebankWordTokenizer()
        self.stemmer = snowball.SnowballStemmer('english')
        self.vectorizer = CountVectorizer(ngram_range=(1, self.ngram_max),
                                          min_df=self.ngram_min_df)
        train_data = rf_dataset.Dataset().load('train', fold=None, as_df=True)
        test_data = rf_dataset.Dataset().load('test', fold=None, as_df=True)

        all_questions = np.concatenate([
            train_data.question1_clean.values,
            test_data.question1_clean.values,
            train_data.question2_clean.values, test_data.question2_clean.values
        ])

        print(colors.lightblue | 'Tokenizing')
        all_tokens = multiprocessing.Pool(4).map(self.vectorize_question,
                                                 all_questions)
        print(colors.lightblue | 'Finished tokenizing, now fitting')
        transformed_tokens = self.vectorizer.fit_transform(all_tokens)
        print(colors.lightblue | colors.bold | 'Gosh that takes a long time')
        transformed_tokens = transformed_tokens.tocsr()

        halfpt = transformed_tokens.shape[0] // 2
        assert halfpt == train_data.shape[0] + test_data.shape[0]
        q1s = transformed_tokens[:halfpt]
        q2s = transformed_tokens[halfpt:]

        train_q1s = q1s[:train_data.shape[0]]
        train_q2s = q2s[:train_data.shape[0]]
        test_q1s = q1s[train_data.shape[0]:]
        test_q2s = q2s[train_data.shape[0]:]
        nose.tools.assert_equal(test_q1s.shape[0], test_data.shape[0])
        nose.tools.assert_equal(test_q2s.shape[0], test_data.shape[0])
        nose.tools.assert_equal(train_q1s.shape[0], train_data.shape[0])
        nose.tools.assert_equal(train_q2s.shape[0], train_data.shape[0])

        self.write_mat_to(self.make_path('train_q1.pkl'), train_q1s)
        self.write_mat_to(self.make_path('train_q2.pkl'), train_q2s)
        self.write_mat_to(self.make_path('test_q1.pkl'), test_q1s)
        self.write_mat_to(self.make_path('test_q2.pkl'), test_q2s)

        diffs = sp.hstack([np.abs(q1s - q2s), q1s.multiply(q2s)]).tocsr()

        train_vecs = diffs[:train_data.shape[0]]
        test_vecs = diffs[train_data.shape[0]:]
        nose.tools.assert_equal(train_vecs.shape[0], train_data.shape[0])
        nose.tools.assert_equal(test_vecs.shape[0], test_data.shape[0])

        self.write_mat_to(self.make_path('train_mat.pkl'), train_vecs)
        self.write_mat_to(self.make_path('test_mat.pkl'), test_vecs)

        with self.output().open('w'):
            pass
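
The halfpt split works because of the order in which all_questions was concatenated: [train q1, test q1, train q2, test q2]. With n_train training rows and n_test test rows, rows [0, n_train + n_test) hold question-1 vectors and the rest hold question-2 vectors, which is exactly what the assertion halfpt == train_data.shape[0] + test_data.shape[0] verifies.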
Example #9
def idf_embedding(data):
    idf_corpus = []
    tok = treebank.TreebankWordTokenizer()

    for i, c in enumerate(data):
        for u in c.utterances:
            idf_corpus.append(c.title)
            idf_corpus.append(u.utterance)

    vectorizer = TfidfVectorizer(tokenizer=tok.tokenize, stop_words='english')
    vectorizer.fit(idf_corpus)
    return vectorizer
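
A usage sketch for the fitted vectorizer returned above (the conversations variable is hypothetical, assumed to be objects with .title and .utterances as iterated in the function):

    vectorizer = idf_embedding(conversations)  # fit IDF on titles and utterances
    X = vectorizer.transform(['thanks , that fixed my problem !'])
    print(X.shape)  # (1, vocabulary_size)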
Example #10
def getPositiveWords(sentence):
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    x = list()  # x axis for the plot

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            x.append(word)
    return x
Example #11
    def run(self):
        self.output().makedirs()
        tqdm.pandas(tqdm)
        self.tokenzier = treebank.TreebankWordTokenizer()
        self.stemmer = snowball.SnowballStemmer('english')
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=50)
        train, merge, valid = Dataset().load()

        logging.info('Vectorizing train')
        train_mat, q1, q2 = self.fit(train)
        train_cols = train_mat.shape[1]
        train_q1_cols, train_q2_cols = q1.shape[1], q2.shape[1]
        scipy.io.mmwrite('cache/tfidf/train.mtx', train_mat)
        scipy.io.mmwrite('cache/tfidf/train_q1.mtx', q1)
        scipy.io.mmwrite('cache/tfidf/train_q2.mtx', q2)
        del train, train_mat

        logging.info('Vectorizing valid')
        valid_mat, q1, q2 = self.transform(valid)
        assert valid_mat.shape[1] == train_cols
        assert q1.shape[1] == train_q1_cols and q2.shape[1] == train_q2_cols
        scipy.io.mmwrite('cache/tfidf/valid.mtx', valid_mat)
        scipy.io.mmwrite('cache/tfidf/valid_q1.mtx', q1)
        scipy.io.mmwrite('cache/tfidf/valid_q2.mtx', q2)
        del valid, valid_mat

        logging.info('Vectorizing merge')
        merge_mat, q1, q2 = self.transform(merge)
        assert merge_mat.shape[1] == train_cols
        assert q1.shape[1] == train_q1_cols and q2.shape[1] == train_q2_cols
        scipy.io.mmwrite('cache/tfidf/merge.mtx', merge_mat)
        scipy.io.mmwrite('cache/tfidf/merge_q1.mtx', q1)
        scipy.io.mmwrite('cache/tfidf/merge_q2.mtx', q2)
        del merge, merge_mat

        logging.info('Vectorizing test')
        test = Dataset().load_test()
        test_mat, q1, q2 = self.transform(test)
        assert test_mat.shape[1] == train_cols
        assert q1.shape[1] == train_q1_cols and q2.shape[1] == train_q2_cols
        scipy.io.mmwrite('cache/tfidf/test.mtx', test_mat)
        scipy.io.mmwrite('cache/tfidf/test_q1.mtx', q1)
        scipy.io.mmwrite('cache/tfidf/test_q2.mtx', q2)

        assert self.load_dataset('test').shape[1] == train_cols

        with self.output().open('w') as f:
            pass
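
The .mtx files written here can be read back with scipy.io.mmread, which returns a COO matrix; a sketch of the round trip:

    import scipy.io

    train_mat = scipy.io.mmread('cache/tfidf/train.mtx').tocsr()  # mmread returns COO; convert for row slicing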
Example #12
def combine_sentimental(conversations):
    x = []
    conv_count = len(conversations)
    analyzer = SentimentIntensityAnalyzer()
    tok = treebank.TreebankWordTokenizer()

    for i, c in enumerate(conversations):
        for u in c.utterances:
            x.append([
                thank(u),
                e_mark(u),
                feedback(u), *sentiment_score(analyzer, u),
                *opinion_lex(tok, u)
            ])
        print('\r>>>> {}/{} done...'.format((i + 1), conv_count), end='')
    return np.asarray(x)
Example #13
def acisWordAnalysis(sentence):
    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]
    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
        elif word in opinion_lexicon.negative():
            neg_words += 1

    if pos_words > neg_words:
        return "Positive"
    elif pos_words < neg_words:
        return "Negative"
    elif pos_words == neg_words:
        return "Neutral"
Example #14
    def run(self):
        self.tokenzier = treebank.TreebankWordTokenizer()
        self.kvecs = gensim.models.KeyedVectors.load_word2vec_format(w2v_file)

        train_data = rf_dataset.Dataset().load_all(
            'train', as_df=True)[['question1_clean', 'question2_clean']]
        test_data = rf_dataset.Dataset().load_all(
            'test', as_df=True)[['question1_clean', 'question2_clean']]

        all_data = pandas.concat([train_data, test_data], axis=0)

        distances = list(
            tqdm(multiprocessing.Pool().imap(self.vectorize,
                                             zip(all_data['question1_clean'],
                                                 all_data['question2_clean']),
                                             chunksize=50_000),
                 total=all_data.shape[0],
                 desc='vectorizing the words'))
Example #15
    def treebank_tokenizer(self, review):
        tokenizer = treebank.TreebankWordTokenizer()
        if self.features in [1, 2]:
            tokens = [
                process_word(word.lower())
                for word in tokenizer.tokenize(self.data[review]['Content'])
            ]
        else:
            tokens = [
                word.lower()
                for word in tokenizer.tokenize(self.data[review]['Content'])
            ]

        tags = nltk.pos_tag(tokens)

        if self.features in [2, 3]:
            tags = self.ngrams(tokens, tags)

        return tokens, tags
Example #16
def simple_sentiment(text):
    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0

    tokenized_sent = [word.lower() for word in tokenizer.tokenize(text)]

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
        elif word in opinion_lexicon.negative():
            neg_words += 1

    if pos_words > neg_words:
        return 'Positive'
    elif pos_words < neg_words:
        return 'Negative'
    elif pos_words == neg_words:
        return 'Neutral'
Example #17
    def run(self):
        self.output().makedirs()
        kvecs = gensim.models.KeyedVectors.load_word2vec_format(w2v_file)
        train_dataset = rf_dataset.Dataset().load_all('train', as_df=True)
        test_dataset = rf_dataset.Dataset().load_all('test', as_df=True)
        self.tokenzier = treebank.TreebankWordTokenizer()

        all_words = pandas.concat([
            train_dataset.question1_clean.str.lower(),
            train_dataset.question2_clean.str.lower(),
            test_dataset.question1_clean.str.lower(),
            test_dataset.question2_clean.str.lower(),
        ])

        tokenizer = Tokenizer(num_words=250_000)
        tokenizer.fit_on_texts(all_words)
        all_seqs = tokenizer.texts_to_sequences(all_words)
        all_padded_seqs = pad_sequences(all_seqs, 32)

        train_seqs = all_padded_seqs[:train_dataset.shape[0] * 2]
        test_seqs = all_padded_seqs[train_dataset.shape[0] * 2:]
        nose.tools.assert_equal(test_seqs.shape[0], test_dataset.shape[0] * 2)

        train_q1 = train_seqs[:train_dataset.shape[0]]
        train_q2 = train_seqs[train_dataset.shape[0]:]
        test_q1 = test_seqs[:test_dataset.shape[0]]
        test_q2 = test_seqs[test_dataset.shape[0]:]

        np.savez_compressed(self.make_path('train.npz'),
                            q1=train_q1,
                            q2=train_q2)
        np.savez_compressed(self.make_path('test.npz'), q1=test_q1, q2=test_q2)

        embedding_matrix = np.zeros((250_000, 300))
        for word, ix in tokenizer.word_index.items():
            # word_index covers the full vocabulary, so guard against indices
            # beyond the num_words cap before writing into the matrix
            if ix < 250_000 and word in kvecs:
                embedding_matrix[ix, :] = kvecs[word]
        np.savez_compressed(self.make_path('embedding.npz'),
                            data=embedding_matrix)

        with self.output().open('w'):
            pass
Example #18
    def __init__(self, lemmatize=lambda x: x, boundaries=False,
                 **kwargs):
        try:
            import nltk.tokenize.treebank as tb
            self.tokenize = tb.TreebankWordTokenizer().tokenize
        except ImportError:
            print("Could not import NLTK tokenizer. Tokenizing on space instead")
            self.tokenize = lambda x: x.split(" ")
        self.lemmatize = lemmatize
        self.boundaries = boundaries
        if 'preLemmatized' in kwargs:
            def get_lemma(x):
                x = x.split(kwargs['preLemmatized'])
                return x[-1]
            def get_lemmata(xs):
                return [get_lemma(x) for x in xs]
            self.lemmatize = get_lemmata


        self._set_parameters(**kwargs)
Example #19
def getPolarity_lex(sentence):
    """
    Polarity of the sentences, conventional Liu and Hu Opinion Lexicon
    Takes in a sentence and returns the sentiment of the sentence by counting the no of positive and negitive
    and negitive words and by reversing the sentiment if the words NO or NOT are present
    """
    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    x = list(range(len(tokenized_sent)))
    y = []
    isNegation = False
    negationWords = [
        'no', 'not', 'never', 'none', 'hardly', 'rarely', 'scarcely'
    ]

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1)  # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

        if word in negationWords:
            isNegation = True

    if pos_words > neg_words and isNegation == True:
        return 'neg'
    elif pos_words > neg_words:
        return 'pos'
    elif pos_words < neg_words and isNegation == True:
        return 'pos'
    elif pos_words < neg_words:
        return 'neg'
    elif pos_words == neg_words:
        return 'neutral'
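
Note that a single negation word flips the polarity of the entire sentence: for "this movie is not bad", the only lexicon hit is the negative word 'bad', but 'not' sets isNegation, so the pos_words < neg_words branch is inverted and the function returns 'pos'.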
Example #20
    def run(self):
        self.output().makedirs()
        tqdm.pandas(tqdm)
        self.tokenzier = treebank.TreebankWordTokenizer()
        self.stemmer = snowball.SnowballStemmer('english')
        self.vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=50)
        train, merge, valid = Dataset().load()

        logging.info('Vectorizing train')
        train_mat, q1, q2 = self.fit(train)
        scipy.io.mmwrite('cache/count/train.mtx', train_mat)
        scipy.io.mmwrite('cache/count/train_q1.mtx', q1)
        scipy.io.mmwrite('cache/count/train_q2.mtx', q2)
        del train, train_mat

        logging.info('Vectorizing valid')
        valid_mat, q1, q2 = self.transform(valid)
        scipy.io.mmwrite('cache/count/valid.mtx', valid_mat)
        scipy.io.mmwrite('cache/count/valid_q1.mtx', q1)
        scipy.io.mmwrite('cache/count/valid_q2.mtx', q2)

        del valid, valid_mat

        logging.info('Vectorizing merge')
        merge_mat, q1, q2 = self.transform(merge)
        scipy.io.mmwrite('cache/count/merge.mtx', merge_mat)
        scipy.io.mmwrite('cache/count/merge_q1.mtx', q1)
        scipy.io.mmwrite('cache/count/merge_q2.mtx', q2)
        del merge, merge_mat

        logging.info('Vectorizing test')
        test = Dataset().load_test()
        test_mat, q1, q2 = self.transform(test)
        scipy.io.mmwrite('cache/count/test.mtx', test_mat)
        scipy.io.mmwrite('cache/count/test_q1.mtx', q1)
        scipy.io.mmwrite('cache/count/test_q2.mtx', q2)

        with self.output().open('w') as f:
            pass
Example #21
def liu_hu_opinion_lexicon(sentence: str) -> str:
    '''
    Modified version of the Liu Hu opinion lexicon algorithm for sentiment
    analysis on sentences.
    Reference: https://www.nltk.org/_modules/nltk/sentiment/util.html#demo_liu_hu_lexicon
    
    The function has been modified to return the values instead of printing.
    
    Returns:
    --------
    Sentiment of a sentence, classified as 'Positive', 'Negative' or 'Neutral'
    '''

    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words, neg_words = 0, 0
    y = []
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1)  # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        return 'Positive'
    elif pos_words < neg_words:
        return 'Negative'
    elif pos_words == neg_words:
        return 'Neutral'
Example #22
def tokenize_tree():

    tokenizer = treebank.TreebankWordTokenizer()

    def tokenize_edu(edu_node):
        if edu_node.text:
            edu_node.text = tokenizer.tokenize(edu_node.text,
                                               convert_parentheses=True,
                                               return_str=True)

    for rstf in glob.glob(FLAGS.rst_path + "/*.dis"):
        if rstf.endswith("dis"):
            basename = rstf.rsplit("/", 1)[1].split(".")[0]
            if basename.startswith("wsj"):
                print("Tokenizing", basename, file=logs)
                rstlines = " ".join(
                    [line.strip() for line in open(rstf).readlines()])
                rstt = RSTTree.parse(rstlines)

                rstt.postorder_visit(tokenize_edu)

                tgtfile = FLAGS.rst_path + "/" + basename + ".out.dis.tok"
                prettystr = rstt.pretty_str() + "\n"
                with open(tgtfile, "w") as f:
                    f.write(prettystr)
Example #23
def tokenize_by_treebank_word(text):
    tokenizer = treebank.TreebankWordTokenizer()
    return tokenizer.tokenize(text)
Example #24
    def run(self):
        global _independent_transformers

        self.tokenzier = treebank.TreebankWordTokenizer()
        self.stemmer = snowball.SnowballStemmer('english')

        train_data = rf_dataset.Dataset().load_all(
            'train', as_df=True)[['question1_clean', 'question2_clean']]
        test_data = rf_dataset.Dataset().load_all(
            'test', as_df=True)[['question1_clean', 'question2_clean']]

        all_data = pandas.concat([train_data, test_data], axis=0)
        all_q1 = list(all_data['question1_clean'])
        all_t1 = list(
            tqdm(multiprocessing.Pool().imap(self.tokenize,
                                             all_q1,
                                             chunksize=5000),
                 total=len(all_q1),
                 desc='Tokenizing: 1'))

        all_q2 = list(all_data['question2_clean'])
        all_t2 = list(
            tqdm(multiprocessing.Pool().imap(self.tokenize,
                                             all_q2,
                                             chunksize=5000),
                 total=len(all_q2),
                 desc='Tokenizing: 2'))

        all_indep_dists = list(
            tqdm(multiprocessing.Pool().imap(transform,
                                             zip(all_q1, all_q2, all_t1,
                                                 all_t2),
                                             chunksize=5000),
                 total=len(all_q1),
                 desc='Computing distances'))
        all_df = pandas.DataFrame(all_indep_dists)

        print('Loading dependent transforms')
        dependent_transformers = {
            'word_mover': WordMoverDistance(),
            'sentiment': SentimentDifference()
        }
        print('Finished loading!')

        for name, fn in dependent_transformers.items():
            dist = [
                fn(q1, q2, t1, t2)
                for q1, q2, t1, t2 in tqdm(zip(all_q1, all_q2, all_t1, all_t2),
                                           total=len(all_q1),
                                           desc=name)
            ]
            if isinstance(dist[0], dict):
                frame = pandas.DataFrame.from_dict(dist, orient='columns')
                for col in frame:
                    all_df[name + '_' + col] = frame[col]
            else:
                all_df[name] = dist

        self.output().makedirs()
        train_dists = all_df.iloc[:train_data.shape[0]]
        test_dists = all_df.iloc[train_data.shape[0]:]
        train_dists.to_msgpack(_train_loc)
        test_dists.to_msgpack(_test_loc)

        little_cls = ensemble.ExtraTreesClassifier(n_estimators=200, n_jobs=-1)
        little_cls.fit(
            train_dists.clip(-10000, 10000).values,
            rf_dataset.Dataset().load_all('train',
                                          as_df=True).is_duplicate.values)
        print(
            pandas.Series(little_cls.feature_importances_,
                          train_dists.columns).sort_values())

        with self.output().open('w') as f:
            f.write(
                str(
                    pandas.Series(little_cls.feature_importances_,
                                  train_dists.columns).sort_values()))
            f.write("\n")
Example #25
def sentiment(body):
    from nltk.tokenize import treebank
    stripped = Markup(body).striptags()
    tokenizer = treebank.TreebankWordTokenizer()
    return sum([word_score.get(word.lower(), 0) for word in tokenizer.tokenize(stripped)])
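
word_score is assumed here to be a module-level dict mapping lowercase words to numeric scores; a minimal sketch of how it might be built from the Liu-Hu lexicon:

    from nltk.corpus import opinion_lexicon

    word_score = {w: 1 for w in opinion_lexicon.positive()}
    word_score.update({w: -1 for w in opinion_lexicon.negative()})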
Example #26
print(s1_comments)

s1_comments = s1_comments.lower()
words = s1_comments.split()

letters_only = re.sub("[^a-zA-Z]", " ", s1_comments)

stops = set(stopwords.words("english"))

meaningful_words = [w for w in words if w not in stops]

print(meaningful_words)

sentence = ''.join(s1_comments)
tokenizer = treebank.TreebankWordTokenizer()
pos_words = 0
neg_words = 0
tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

x = list(range(len(tokenized_sent)))  # x axis for the plot
y = []

for word in tokenized_sent:
    if word in opinion_lexicon.positive():
        pos_words += 1
        y.append(1)  # positive
    elif word in opinion_lexicon.negative():
        neg_words += 1
        y.append(-1)  # negative
    else:
        y.append(0)  # neutral
Example #27
def neuopinion(sentence):
    tokenizer = treebank.TreebankWordTokenizer()
    tokenized = [word.lower() for word in tokenizer.tokenize(sentence)]
    total = len(tokenized)
    return total
Example #28
    def __init__(self):
        self.pos = set(opinion_lexicon.positive())
        self.neg = set(opinion_lexicon.negative())
        self.tok = treebank.TreebankWordTokenizer()
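
A sketch of a classify method that could sit alongside this constructor, using the cached sets for O(1) lookups (the method name is hypothetical):

    def classify(self, sentence):
        words = [w.lower() for w in self.tok.tokenize(sentence)]
        # positive hits minus negative hits across all tokens
        score = sum((w in self.pos) - (w in self.neg) for w in words)
        if score > 0:
            return 'Positive'
        elif score < 0:
            return 'Negative'
        return 'Neutral'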