Example #1
 def get_tweet_JsonFiles(self, json_file2=None):
     if json_file2 is None:
         all_tweet_samples = twitter_samples.fileids()
         json_file = all_tweet_samples[2]  # json file with the 20k tweet sample
         tweet_string = twitter_samples.strings(json_file)
         return tweet_string
     tweet_string = json_file2
     return tweet_string
Example #2
 def test_corpus_twitter_method_returns_correct_result(self):
     self.assertEqual(twitter_samples.fileids(), [
         'negative_tweets.json', 'positive_tweets.json',
         'tweets.20150430-223406.json'
     ])
     self.assertEqual(
         twitter_samples.strings('negative_tweets.json')[0],
         'hopeless for tmr :(')
     self.assertEqual(
         twitter_samples.strings('positive_tweets.json')[0],
         '#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'
     )
Example #3
 def validate(self,classifier):
     """Test the accuracy of a given classifier against a test dataset with labels.
     Args:
         classifier: (Bayesian, DecisionTree, SVC, LinearSVC) classifier to use on the data
     Returns:
         None
     """
     tweets =  twitter_samples.fileids()
     pos_tweets = twitter_samples.tokenized(tweets[1])
     neg_tweets = twitter_samples.tokenized(tweets[0])
     pos_testing = pos_tweets[len(pos_tweets) * 7 // 8:]
     neg_testing = neg_tweets[len(neg_tweets) * 7 // 8:]
     pos_test = [(self.train_feats(f), 'positive') for f in pos_testing]
     neg_test = [(self.train_feats(f), 'negative') for f in neg_testing]
     testfeats = pos_test + neg_test
     print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testfeats))*100)
Example #4
 def parseTweets(self):
     """Parses tweets and extracts features from it
     Args:
         none
     Returns:
         extract_feats: list of words with frequency of each word occuring in both positive and negative classes.
     """
     tweets = twitter_samples.fileids()
     pos_tweets = twitter_samples.tokenized(tweets[1])
     neg_tweets = twitter_samples.tokenized(tweets[0])
     pos_training = pos_tweets[:len(pos_tweets) * 7 // 8]
     neg_training = neg_tweets[:len(neg_tweets) * 7 // 8]
     pos_feats = [(self.extract_feats(f), 'positive') for f in pos_training]
     neg_feats = [(self.extract_feats(f), 'negative') for f in neg_training]
     train_feats = pos_feats + neg_feats
     print('[-] Feature Extraction Finished')
     return train_feats
Example #5
'''
This script imports twitter_samples from nltk.corpus, stopwords from
nltk.corpus, PorterStemmer from nltk.stem.porter, ngrams from nltk, and re.
It then loads a corpus of sample tweets about Brexit. The tweets are cleaned
to remove special characters, hashtags, and Twitter user IDs. The tweet text
is then cleaned, tokenized, and stemmed. Finally, we compute frequency
distributions to find the most frequently used words and n-grams in the tweets.
'''

import nltk
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import ngrams
import re

twitter_samples.fileids()
tweets = twitter_samples.strings(twitter_samples.fileids()[-1])
porter_stemmer = PorterStemmer()

def clean_text(tweet):
    tweet = tweet.strip()
    pattern = r"(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(RT)"
    cleaned_tweet = ' '.join(re.sub(pattern, " ", tweet).split())
    wordTokens = nltk.word_tokenize(cleaned_tweet)
    wordTokens = [token.lower() for token in wordTokens if len(token) > 1]
    stops = set(stopwords.words("english"))
    wordTokens = [token for token in wordTokens if token not in stops]
    cleaned_tweet = ' '.join(wordTokens)
    return cleaned_tweet

# Remove special characters, stopwords, twitter IDs, and hashtags.
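
# The excerpt above stops after clean_text(). A minimal sketch of the
# frequency-distribution step the docstring describes, reusing the `tweets`,
# `clean_text` and `porter_stemmer` names defined above (the variable names
# below are illustrative assumptions, not from the original):
all_tokens = []
for raw_tweet in tweets:
    cleaned = clean_text(raw_tweet)
    all_tokens.extend(porter_stemmer.stem(tok) for tok in cleaned.split())

unigram_freq = nltk.FreqDist(all_tokens)
bigram_freq = nltk.FreqDist(ngrams(all_tokens, 2))
print(unigram_freq.most_common(20))
print(bigram_freq.most_common(20))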
Example #6
lm_classifier = lmModel.fit(training_features, training_labels)
predictions = lm_classifier.predict(test_features)
print("Precision of LinearRegression is")
precision = calculate_precision(predictions, test_gold_labels)
print("Test data\t" + str(precision))
# Real-time testing
real_time_test(lm_classifier, vocabulary_mv)

### twitter sentiment analyzer
import re

import nltk
from nltk.corpus import twitter_samples as ts  # `ts` alias used below
from nltk.stem.porter import PorterStemmer

nltk.download('words')

porter = PorterStemmer()
ts.fileids()
twitt_str = ts.strings('tweets.20150430-223406.json')
twitt_token = ts.tokenized('tweets.20150430-223406.json')
#print(twitt_str)
print(len(twitt_token))

# Note: str(twitt_str) turns the whole list of tweet strings into one big
# string (brackets and quotes included) before tokenizing it.
data_words = nltk.word_tokenize(str(twitt_str))

data_words = [
    word.lower() for word in data_words if word.isalpha()
]
stemmed = [porter.stem(data_word) for data_word in data_words]
print(stemmed)
print(len(stemmed))

wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
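
# The excerpt ends before `wordlist` is used. One plausible use (an assumption,
# in the spirit of the NLTK book's "unusual words" example, not shown in the
# original) is to flag stemmed tokens that are not ordinary dictionary words:
unusual = sorted(set(stemmed) - set(wordlist))
print(unusual[:50])
print(len(unusual), "of", len(set(stemmed)), "distinct stems are not dictionary words")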
Example #7
# -*- coding: utf-8 -*-
"""
Created on Sat Oct  1 17:09:48 2016

@author: Admin
"""
import nltk

from nltk.corpus import twitter_samples
twitter_samples.fileids()
"""Accessing json file of positive tweets"""
positive = nltk.corpus.twitter_samples.raw("positive_tweets.json")
positive.__str__()

postwts = nltk.word_tokenize(positive)
"""Length of all the positive tweets"""
len(set(postwts))

from nltk.corpus import twitter_samples
twitter_samples.fileids()
"""Accessing json file of negative tweets"""
tweet = twitter_samples.raw("negative_tweets.json")
tweet.__str__()

tokens = nltk.word_tokenize(tweet)
"""Length of all the negative tweets"""
len(set(tokens))
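
# If the quantity of interest is the number of tweets rather than distinct
# tokens, twitter_samples.strings() returns one string per tweet (this check is
# not part of the original snippet):
print(len(twitter_samples.strings("positive_tweets.json")))  # 5000
print(len(twitter_samples.strings("negative_tweets.json")))  # 5000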
Example #8
#! /usr/bin/env python

from nltk.corpus import twitter_samples
import json
import csv

print twitter_samples.fileids()

pos_output_file = "pos_tweets_list.txt"
neg_output_file = "neg_tweets_list.txt"


def clean_up_files(filename):
    data = list()
    with open(filename, 'r') as f:
        for line in f:
            if len(line) > 1:
                data.append(line)

    with open(filename, 'w') as f:
        for line in data:
            f.write(line)


pos_tweets_file = twitter_samples.abspath(twitter_samples.fileids()[1])
pos_tweets_output = open(pos_output_file, 'w+')
with open(pos_tweets_file, 'r') as tf:
    for line in tf:
        x = json.loads(line)
        tweet = x['text'].encode('UTF-8')
        if '\n' not in tweet:
Example #9
def twitterClass():
    global wordFeatures
    tknzr = TweetTokenizer(strip_handles=True)
    onlyWords = re.compile('^[a-zA-Z]+$')
    # print
    if not os.path.exists(os.path.join(os.getcwd(), 'semtiment_classifier.pickle')):
        print(twitter_samples.fileids())
        # print movie_reviews.fileids()
        # print

        tknzr = TweetTokenizer(strip_handles=True)
        onlyWords = re.compile('^[a-zA-Z]+$')
        labeledTweets = []

        for it in twitter_samples.docs('negative_tweets.json'):
            tokens = []
            for token in tknzr.tokenize(it['text']):
                if onlyWords.match(token) is not None:
                    tokens.append(token.lower())
            labeledTweets.append((tokens, "negative"))
            # print [token for token in tknzr.tokenize(it['text']) if onlyWords.match(token) is not None]

        for it in twitter_samples.docs('positive_tweets.json'):
            tokens = []
            for token in tknzr.tokenize(it['text']):
                if onlyWords.match(token) is not None:
                    tokens.append(token.lower())
            labeledTweets.append((tokens, "positive"))

        # print  labeledTweets
        wordFeatures = get_word_features(get_words_in_tweets(labeledTweets))
        # Note: the else-branch below reloads wordFeatures from
        # 'wordFeatures.json', but this branch never writes that file.
        print("training")
        training = classUtil.apply_features(extract_features, labeledTweets)
        # print training

        sentimentClassifier = NaiveBayesClassifier.train(training)
        print "done training"
        f = open('semtiment_classifier.pickle', 'wb')
        pickle.dump(sentimentClassifier, f)
        f.close()
    else:
        fin = open('wordFeatures.json', "r")
        wordFeatures = json.load(fin)
        fin.close()
        print(wordFeatures)
        f = open('semtiment_classifier.pickle', 'rb')
        classifier = pickle.load(f)  # type: nltk.classify.naivebayes.NaiveBayesClassifier
        f.close()
        # text,created_at
        tweets = []

        onlyWords = re.compile('^[a-zA-Z]+$')
        labeledTweets = []
        for row in csv.DictReader(open('datafiles/trump.csv')):
            text = row['text']
            features = []
            for token in tknzr.tokenize(text):
                if onlyWords.match(token) is not None:
                    features.append(token.lower())
            print(row['created_at'])
            tweets.append({
                "created_at": row['created_at'],
                "text": text,
                "classification": classifier.classify(extract_features(features))
            })
        classification = open('trumpClassified.json', 'w+')
        classification.write(json.dumps(tweets, indent=2))
        classification.close()
        tweets = []
        labeledTweets = []
        for row in csv.DictReader(open('datafiles/clinton.csv')):
            text = row['text']
            features = []
            for token in tknzr.tokenize(text):
                if onlyWords.match(token) is not None:
                    features.append(token.lower())
            print(row['created_at'])
            tweets.append({
                "created_at": row['created_at'],
                "text": text,
                "classification": classifier.classify(extract_features(features))
            })
        classification = open('clintonClassified.json', 'w+')
        classification.write(json.dumps(tweets, indent=2))
        classification.close()
Example #10
import nltk
from nltk.corpus import twitter_samples

nltk.download('twitter_samples')

print("Files: ", twitter_samples.fileids())

tweets = twitter_samples.strings('tweets.20150430-223406.json')
print("Total tweets: ", len(tweets))
for tweet in tweets[:10]:
    print(tweet)
Example #11
from nltk.corpus import twitter_samples, stopwords
from nltk.tokenize import word_tokenize
import re
from collections import Counter
import plotly.plotly as py
import plotly.graph_objs as go


print(twitter_samples.fileids())
stop_words = set(stopwords.words('english'))
negTweets = twitter_samples.strings('negative_tweets.json')
posTweets = twitter_samples.strings('positive_tweets.json')
negWords = []
posWords = []
for tweet in posTweets:
    posWords.extend(w for w in word_tokenize(re.sub(r'RT |@\S*|#\S+|http\S+|\n-|w/|[\.]{2,}', '', tweet)) if w not in stop_words)
for tweet in negTweets:
    negWords.extend(w for w in word_tokenize(re.sub(r'RT |@\S*|#\S+|http\S+|\n-|w/|[\.]{2,}', '', tweet)) if w not in stop_words)

results = {}
posWords=Counter(posWords)
negWords=Counter(negWords)
for word in posWords:
	if word in negWords:
		results[word] = posWords[word] - negWords[word]
	else:
		results[word] = posWords[word]
for word in negWords:
	if not word in results:
		results[word] = 0 - negWords[word]
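
# The plotly imports above are never used in this excerpt. A rough sketch of
# how the per-word scores in `results` might be charted with the legacy
# plotly.plotly online API that the imports suggest (the filename and the
# choice of the 20 most polarised words are assumptions, not from the original):
top = sorted(results.items(), key=lambda kv: kv[1])
extremes = top[:10] + top[-10:]  # 10 most negative and 10 most positive words
bar = go.Bar(x=[w for w, _ in extremes], y=[s for _, s in extremes])
py.plot([bar], filename='tweet-word-sentiment')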
Example #12
from nltk.twitter import Query, Streamer, Twitter, TweetViewer, TweetWriter, credsfromfile
from nltk.corpus import twitter_samples
#retrieving twitter data and interfacing with API
oauth = credsfromfile()
#tw = Twitter()
#tw.tweets(to_screen=False, limit=25)

#sample public twitter stream
#client = Streamer(**oauth)
#client.register(TweetViewer(limit=10))
#client.sample()

#client = Query(**oauth)
#tweets = client.search_tweets(keywords='nltk', limit = 50)
#tweet = next(tweets)
#from pprint import pprint
#pprint(tweet, depth = 1)

print(twitter_samples.fileids())

strings = twitter_samples.strings('tweets.20150430-223406.json')
for string in strings[:15]:
    print(string)
Example #13
 def __init__(self):
     self.number_id = 41
     self.source_id = "twitter_samples"
     self.titles = [name for name in twitter_samples.fileids()]
     self.data = [twitter_samples.raw(name) for name in self.titles]
Example #14
from nltk.corpus import twitter_samples
from nltk.twitter.common import json2csv

#corpus twitter_sample tweets ~20k
jsonfile = twitter_samples.fileids()[-1]

#absolute path for the file: #input_file = os.path.abspath(jsonfile)=>returns virtualenv file path
input_file = twitter_samples.abspath(jsonfile) #returns system /usr/share/ path

#with open(input_file) as fp:
	#json2csv(fp,'tweets_text.csv',['text']) #json2csv(pointer, nameoffile, [feature1,feature2,feature3])

#next step: pick the attributes to import, load the CSV into a pandas DataFrame, apply stemming to the tweet texts, and save the result (see the sketch after this example).
with open(input_file) as fp:
	json2csv(fp, 'tweets_dataframe.csv',['id','text','user.favourites_count','user.id','lang','user.followers_count','user.verified','truncated'])
#json, csv
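
# Sketch of the follow-up step described above: load the exported CSV into a
# pandas DataFrame and stem the tweet texts. pandas and PorterStemmer are extra
# imports assumed here, and the sketch assumes json2csv wrote the column
# headers; it is not part of the original example.
import pandas as pd
from nltk.stem.porter import PorterStemmer

df = pd.read_csv('tweets_dataframe.csv')
stemmer = PorterStemmer()
df['stemmed_text'] = df['text'].apply(
    lambda t: ' '.join(stemmer.stem(w) for w in str(t).split()))
df.to_csv('tweets_dataframe_stemmed.csv', index=False)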
Example #15
from nltk.corpus import twitter_samples, TwitterCorpusReader
import numpy as np
import matplotlib.pyplot as plt
from NBClassifier import NBClassifier
from SCClassifier import SCClassifier
from BGClassifier import BGClassifier
from sklearn.metrics import roc_curve, auc
import os
import pickle

# settings
fileIds = twitter_samples.fileids()
root = twitter_samples.root

negReader = TwitterCorpusReader(root, fileIds[0])
negTwt = []
posReader = TwitterCorpusReader(root, fileIds[1])
posTwt = []
for tweet in negReader.docs():
    negTwt.append((tweet['text']))
for tweet in posReader.docs():
    posTwt.append((tweet['text']))

posInd = np.random.permutation(len(posTwt))
negInd = np.random.permutation(len(negTwt))

X_1 = np.array([])
X_2 = np.array([])
X_3 = np.array([])
Y = np.array([])
NB_auc = np.zeros((5, 1))
Example #16
        tokenTweet = tknzr.tokenize(tweet)
        j = 0
        k = 0
        while j < (len(tokenTweet) - k):
            #print j
            if tokenTweet[j][0] == "#":
                tokenTweet[j] = tokenTweet[j][1:]
            elif tokenTweet[j][0] == "@":
                del tokenTweet[j]
                j-=1
                k+=1
            j+=1
            
        info.append((word_feats(tokenTweet), classification))

ids = twitter_samples.fileids()
neg = 0
pos = 1
negtweets = "negtweets.txt"
postweets = "postweets.txt"

#tags unused
neginfo = []
posinfo = []
negtags = []
postags = []
getTweetTokens('neg', ids[0], neginfo, negtags)
getTweetTokens('pos', ids[1], posinfo, postags)

##for i in range(2):
##    print str(posinfo[i])
Example #17
'''
This is sentiment analysis from nltk samples and corpora
Twitter samples will be used as data
'''

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize
import random

# Collect data
print(twitter_samples.fileids())


def create_word_features(words):
    useful_words = [
        word for word in words if word not in stopwords.words('english')
    ]
    my_dict = dict([(word, True) for word in useful_words])
    return my_dict


## tweets collection
neg_strings = twitter_samples.strings('negative_tweets.json')
neg_tweets = []
for i, string in enumerate(neg_strings):
    # clean out smileys in strings
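    # The excerpt is cut off here. A hedged guess at the intended cleanup,
    # assuming the loop strips the ':(' / ':-(' smileys the negative corpus was
    # collected with and reuses create_word_features() defined above:
    no_smiley = string.replace(':(', '').replace(':-(', '')
    words = word_tokenize(no_smiley)
    neg_tweets.append((create_word_features(words), 'negative'))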
Example #18
import nltk
from collections import defaultdict
from nltk.corpus import twitter_samples
from nltk.tokenize import TweetTokenizer

# First, open the NLTK data downloader (the corpora below can also be fetched individually)
nltk.download()

# Secondly, download the twitter_samples corpus
nltk.download('twitter_samples')

# Finally we download the stopwords
nltk.download('stopwords')

# The twitter_samples corpus contains 3 files: 5,000 positive tweets, 5,000 negative tweets, and 20,000 mixed positive and negative tweets.
# For this project we will only use the 10,000-tweet subset already available in the nltk.corpus module,
# i.e. the files of 5,000 positive and 5,000 negative tweets.

print("Different type of tweet =>", twitter_samples.fileids())
pos_tweets = twitter_samples.strings('positive_tweets.json')
print("Len of POSITIVE tweet", len(pos_tweets))  #output : 5000
neg_tweets = twitter_samples.strings('negative_tweets.json')
print("Len of NEGATIVE tweet", len(neg_tweets))  #output : 5000
all_tweets = twitter_samples.strings('tweets.20150430-223406.json')
print("Length of TOTAL tweet from tweets.20150430-223406.json",
      len(all_tweets))  #output : 20000

# We import the TweetTokenizer module and then tokenize (split the text into a list of tokens); a usage check follows the constructor below.
tweet_tokenizer = TweetTokenizer(preserve_case=False,
                                 strip_handles=True,
                                 reduce_len=True)
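
# Quick check of the tokenizer on the first positive tweet (illustrative only;
# strip_handles drops the @mentions and preserve_case=False lowercases the rest):
print(tweet_tokenizer.tokenize(pos_tweets[0]))
# ['#followfriday', 'for', 'being', 'top', 'engaged', 'members', ...]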

# Denoise the tweets by removing tickers like $GE, the RT retweet marker, hyperlinks, the '#' of hashtags,
# stopwords (a, and, the, is, are, etc.), emoticons and punctuation, and then reduce each word to its
# stem/base form with the Porter stemming algorithm, as sketched below.
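
# Minimal sketch of the cleaning step described above, assuming the usual
# re / string / stopwords / PorterStemmer tools; this is one reasonable
# implementation, not necessarily the original project's code.
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stop_words = stopwords.words('english')
stemmer = PorterStemmer()

def clean_tweet(tweet):
    tweet = re.sub(r'\$\w+', '', tweet)         # tickers / cash-tags like $GE
    tweet = re.sub(r'^RT[\s]+', '', tweet)      # retweet marker
    tweet = re.sub(r'https?://\S+', '', tweet)  # hyperlinks
    tweet = re.sub(r'#', '', tweet)             # keep hashtag text, drop '#'
    tokens = tweet_tokenizer.tokenize(tweet)    # TweetTokenizer defined above
    return [stemmer.stem(tok) for tok in tokens
            if tok not in stop_words
            and not all(ch in string.punctuation for ch in tok)]

print(clean_tweet(pos_tweets[0]))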