def get_tweet_JsonFiles(self, json_file2=None):
    if json_file2 is None:
        all_tweet_samples = twitter_samples.fileids()
        json_file = all_tweet_samples[2]  # json file (the 20k-tweet sample)
        tweet_string = twitter_samples.strings(json_file)
        return tweet_string
    tweet_string = json_file2
    return tweet_string
def test_corpus_twitter_method_returns_correct_result(self):
    self.assertEqual(twitter_samples.fileids(), [
        'negative_tweets.json', 'positive_tweets.json',
        'tweets.20150430-223406.json'
    ])
    self.assertEqual(
        twitter_samples.strings('negative_tweets.json')[0],
        'hopeless for tmr :(')
    self.assertEqual(
        twitter_samples.strings('positive_tweets.json')[0],
        '#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'
    )
def validate(self, classifier):
    """Test the accuracy of a given classifier against a labelled test dataset.

    Args:
        classifier: (Bayesian, DecisionTree, SVC, LinearSVC) classifier used to
            classify the data.

    Returns:
        None
    """
    tweets = twitter_samples.fileids()
    pos_tweets = twitter_samples.tokenized(tweets[1])
    neg_tweets = twitter_samples.tokenized(tweets[0])
    # Hold out the last eighth of each class for testing (integer indices).
    pos_testing = pos_tweets[len(pos_tweets) * 7 // 8:]
    neg_testing = neg_tweets[len(neg_tweets) * 7 // 8:]
    pos_test = [(self.train_feats(f), 'positive') for f in pos_testing]
    neg_test = [(self.train_feats(f), 'negative') for f in neg_testing]
    testfeats = pos_test + neg_test
    print("Classifier accuracy percent:",
          nltk.classify.accuracy(classifier, testfeats) * 100)
def parseTweets(self):
    """Parses tweets and extracts features from them.

    Args:
        none

    Returns:
        train_feats: list of (features, label) pairs covering the words
            occurring in both the positive and negative classes.
    """
    tweets = twitter_samples.fileids()
    pos_tweets = twitter_samples.tokenized(tweets[1])
    neg_tweets = twitter_samples.tokenized(tweets[0])
    # Use the first seven eighths of each class for training (integer indices).
    pos_training = pos_tweets[:len(pos_tweets) * 7 // 8]
    neg_training = neg_tweets[:len(neg_tweets) * 7 // 8]
    pos_feats = [(self.extract_feats(f), 'positive') for f in pos_training]
    neg_feats = [(self.extract_feats(f), 'negative') for f in neg_training]
    train_feats = pos_feats + neg_feats
    print('[-] Feature Extraction Finished')
    return train_feats
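# Both validate() and parseTweets() call self.extract_feats / self.train_feats,
# which are not shown in these snippets. A minimal sketch of the kind of
# word-presence feature mapper they presumably are (the class name
# _FeatureSketch is hypothetical; this is an assumption, not the original code):
class _FeatureSketch:
    def extract_feats(self, words):
        # Map a tokenized tweet to the {'word': True, ...} dict shape that
        # nltk.classify classifiers expect.
        return {word.lower(): True for word in words}

    # validate() uses self.train_feats; presumably the same mapping, so the
    # training and test features are built identically.
    train_feats = extract_feats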
'''
This script imports nltk, twitter_samples and stopwords from nltk.corpus,
PorterStemmer from nltk.stem.porter, ngrams from nltk, and re. It then loads
a corpus of sample tweets about Brexit. The tweets are cleaned to remove
special characters, hashtags, and Twitter user IDs. The tweet text is then
cleaned, tokenized, and stemmed. Finally, we compute frequency distributions
to try to determine the most frequently used words and phrases in the tweets.
'''
import nltk
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import ngrams
import re

twitter_samples.fileids()
tweets = twitter_samples.strings(twitter_samples.fileids()[-1])

porter_stemmer = PorterStemmer()


def clean_text(tweet):
    # Remove special characters, stopwords, twitter IDs, and hashtags.
    tweet = tweet.strip()
    pattern = r"(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(RT)"
    cleaned_tweet = ' '.join(re.sub(pattern, " ", tweet).split())
    wordTokens = nltk.word_tokenize(cleaned_tweet)
    wordTokens = [token.lower() for token in wordTokens if len(token) > 1]
    stops = set(stopwords.words("english"))
    wordTokens = [token for token in wordTokens if token not in stops]
    cleaned_tweet = ' '.join(wordTokens)
    return cleaned_tweet
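# The docstring mentions computing frequency distributions over the cleaned
# tweets, but that part of the script is not included above. A minimal sketch
# of what it could look like with the imported ngrams helper (an assumption,
# not the original code):
cleaned_tweets = [clean_text(tweet) for tweet in tweets]

all_tokens = []
for cleaned in cleaned_tweets:
    all_tokens.extend(cleaned.split())

# Most common single words and bigrams across the corpus.
word_freq = nltk.FreqDist(all_tokens)
bigram_freq = nltk.FreqDist(ngrams(all_tokens, 2))
print(word_freq.most_common(20))
print(bigram_freq.most_common(20))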
lm_classifier = lmModel.fit(training_features, training_labels)
predictions = lm_classifier.predict(test_features)
print("Precision of LinearRegression is")
precision = calculate_precision(predictions, test_gold_labels)
print("Test data\t" + str(precision))

# Real-time testing
real_time_test(lm_classifier, vocabulary_mv)

### Twitter sentiment analyzer
import re
nltk.download('words')
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

ts.fileids()
twitt_str = ts.strings('tweets.20150430-223406.json')
twitt_token = ts.tokenized('tweets.20150430-223406.json')
#print(twitt_str)
print(len(twitt_token))

data_words = nltk.word_tokenize(str(twitt_str))
data_words = [word.lower() for word in data_words if word.isalpha()]
stemmed = [porter.stem(data_word) for data_word in data_words]
print(stemmed)
print(len(stemmed))

wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
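# The snippet stops right after building wordlist, so its intended use is not
# shown. One plausible continuation (an assumption, not from the source) is to
# keep only the stems that are real lowercase English words:
english_words = set(wordlist)  # set membership is much faster than a list
known_stems = [w for w in stemmed if w in english_words]
print(len(known_stems), "of", len(stemmed), "stems are dictionary words")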
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 1 17:09:48 2016

@author: Admin
"""
import nltk
from nltk.corpus import twitter_samples

twitter_samples.fileids()

"""Accessing json file of positive tweets"""
positive = nltk.corpus.twitter_samples.raw("positive_tweets.json")
positive.__str__()
postwts = nltk.word_tokenize(positive)

"""Number of unique tokens in the positive tweets"""
len(set(postwts))

from nltk.corpus import twitter_samples
twitter_samples.fileids()

"""Accessing json file of negative tweets"""
tweet = twitter_samples.raw("negative_tweets.json")
tweet.__str__()
tokens = nltk.word_tokenize(tweet)

"""Number of unique tokens in the negative tweets"""
len(set(tokens))
#! /usr/bin/env python
from nltk.corpus import twitter_samples
import json
import csv

print twitter_samples.fileids()

pos_output_file = "pos_tweets_list.txt"
neg_output_file = "neg_tweets_list.txt"


def clean_up_files(filename):
    data = list()
    with open(filename, 'r') as f:
        for line in f:
            if len(line) > 1:
                data.append(line)
    with open(filename, 'w') as f:
        for line in data:
            f.write(line)


pos_tweets_file = twitter_samples.abspath(twitter_samples.fileids()[1])
pos_tweets_output = open(pos_output_file, 'w+')
with open(pos_tweets_file, 'r') as tf:
    for line in tf:
        x = json.loads(line)
        tweet = x['text'].encode('UTF-8')
        if '\n' not in tweet:
def twitterClass():
    global wordFeatures
    tknzr = TweetTokenizer(strip_handles=True)
    onlyWords = re.compile('^[a-zA-Z]+$')
    # print
    if not os.path.exists(os.path.join(os.getcwd(), 'semtiment_classifier.pickle')):
        print twitter_samples.fileids()
        # print movie_reviews.fileids()
        # print
        tknzr = TweetTokenizer(strip_handles=True)
        onlyWords = re.compile('^[a-zA-Z]+$')
        labeledTweets = []
        for it in twitter_samples.docs('negative_tweets.json'):
            tokens = []
            for token in tknzr.tokenize(it['text']):
                if onlyWords.match(token) is not None:
                    tokens.append(token.lower())
            labeledTweets.append((tokens, "negative"))
            # print [token for token in tknzr.tokenize(it['text']) if onlyWords.match(token) is not None]
        for it in twitter_samples.docs('positive_tweets.json'):
            tokens = []
            for token in tknzr.tokenize(it['text']):
                if onlyWords.match(token) is not None:
                    tokens.append(token.lower())
            labeledTweets.append((tokens, "positive"))
        # print labeledTweets
        wordFeatures = get_word_features(get_words_in_tweets(labeledTweets))
        print "training"
        training = classUtil.apply_features(extract_features, labeledTweets)
        # print training
        sentimentClassifier = NaiveBayesClassifier.train(training)
        print "done training"
        f = open('semtiment_classifier.pickle', 'wb')
        pickle.dump(sentimentClassifier, f)
        f.close()
    else:
        fin = open('wordFeatures.json', "r")
        wordFeatures = json.load(fin)
        fin.close()
        print wordFeatures

    f = open('semtiment_classifier.pickle', 'rb')
    classifier = pickle.load(f)  # type: nltk.classify.naivebayes.NaiveBayesClassifier
    f.close()

    # text,created_at
    tweets = []
    onlyWords = re.compile('^[a-zA-Z]+$')
    labeledTweets = []
    for row in csv.DictReader(open('datafiles/trump.csv')):
        text = row['text']
        features = []
        for token in tknzr.tokenize(text):
            if onlyWords.match(token) is not None:
                features.append(token.lower())
        print row['created_at']
        tweets.append({
            "created_at": row['created_at'],
            "text": text,
            "classification": classifier.classify(extract_features(features))
        })
    classification = open('trumpClassified.json', 'w+')
    classification.write(json.dumps(tweets, indent=2))
    classification.close()

    tweets = []
    labeledTweets = []
    for row in csv.DictReader(open('datafiles/clinton.csv')):
        text = row['text']
        features = []
        for token in tknzr.tokenize(text):
            if onlyWords.match(token) is not None:
                features.append(token.lower())
        print row['created_at']
        tweets.append({
            "created_at": row['created_at'],
            "text": text,
            "classification": classifier.classify(extract_features(features))
        })
    classification = open('clintonClassified.json', 'w+')
    classification.write(json.dumps(tweets, indent=2))
    classification.close()
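# The helpers get_word_features, get_words_in_tweets and extract_features are
# referenced above but not shown. A minimal sketch of the classic NLTK pattern
# they usually follow (an assumption, not this project's actual code):
import nltk

def get_words_in_tweets(labeled_tweets):
    # Flatten the (tokens, label) pairs into one list of words.
    all_words = []
    for (words, sentiment) in labeled_tweets:
        all_words.extend(words)
    return all_words

def get_word_features(all_words):
    # Build the feature vocabulary from all observed words.
    return list(nltk.FreqDist(all_words).keys())

def extract_features(document):
    # Boolean "contains(word)" features over the global wordFeatures list.
    document_words = set(document)
    return {'contains(%s)' % word: (word in document_words)
            for word in wordFeatures}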
import nltk
from nltk.corpus import twitter_samples

nltk.download('twitter_samples')

print("Files: ", twitter_samples.fileids())

tweets = twitter_samples.strings('tweets.20150430-223406.json')
print("Total tweets: ", len(tweets))

for tweet in tweets[:10]:
    print(tweet)
from nltk.corpus import twitter_samples, stopwords
from nltk.tokenize import word_tokenize
import re
from collections import Counter
import plotly.plotly as py
import plotly.graph_objs as go

print twitter_samples.fileids()

stop_words = set(stopwords.words('english'))
negTweets = twitter_samples.strings('negative_tweets.json')
posTweets = twitter_samples.strings('positive_tweets.json')

negWords = []
posWords = []
# Strip retweet markers, handles, hashtags, URLs and other noise, then
# tokenize and drop stopwords.
noise = r'RT |@\S*|#\S+|http\S+|\n-|w/|[\.]{2,}'
for tweet in posTweets:
    posWords.extend(w for w in word_tokenize(re.sub(noise, '', tweet))
                    if w not in stop_words)
for tweet in negTweets:
    negWords.extend(w for w in word_tokenize(re.sub(noise, '', tweet))
                    if w not in stop_words)

# Score each word: positive occurrences minus negative occurrences.
results = {}
posWords = Counter(posWords)
negWords = Counter(negWords)
for word in posWords:
    if word in negWords:
        results[word] = posWords[word] - negWords[word]
    else:
        results[word] = posWords[word]
for word in negWords:
    if word not in results:
        results[word] = 0 - negWords[word]
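# plotly is imported above but the plotting code is not included. A minimal
# sketch of one way to chart the most polarized words with the imported
# objects (an assumption; note that py.plot targets the legacy Plotly online
# service and needs credentials, while plotly.offline.plot is a local
# alternative):
ranked = sorted(results.items(), key=lambda item: item[1])
extremes = ranked[:20] + ranked[-20:]  # 20 most negative and 20 most positive
words = [w for w, score in extremes]
scores = [score for w, score in extremes]

data = [go.Bar(x=words, y=scores)]
py.plot(data, filename='tweet-word-polarity')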
from nltk.twitter import Query, Streamer, Twitter, TweetViewer, TweetWriter, credsfromfile
from nltk.corpus import twitter_samples

# Retrieving twitter data and interfacing with the API
oauth = credsfromfile()

#tw = Twitter()
#tw.tweets(to_screen=False, limit=25)  # sample public twitter stream

#client = Streamer(**oauth)
#client.register(TweetViewer(limit=10))
#client.sample()

#client = Query(**oauth)
#tweets = client.search_tweets(keywords='nltk', limit=50)
#tweet = next(tweets)
#from pprint import pprint
#pprint(tweet, depth=1)

print twitter_samples.fileids()
strings = twitter_samples.strings('tweets.20150430-223406.json')
for string in strings[:15]:
    print(string)
def __init__(self):
    self.number_id = 41
    self.source_id = "twitter_samples"
    self.titles = [name for name in twitter_samples.fileids()]
    self.data = [twitter_samples.raw(name) for name in self.titles]
from nltk.corpus import twitter_samples
from nltk.twitter.common import json2csv

# Corpus twitter_samples: the ~20k mixed-tweet json file
jsonfile = twitter_samples.fileids()[-1]

# Absolute path for the file:
#   input_file = os.path.abspath(jsonfile)  => returns a virtualenv file path
input_file = twitter_samples.abspath(jsonfile)  # returns the system /usr/share/ path

#with open(input_file) as fp:
#    json2csv(fp, 'tweets_text.csv', ['text'])
# json2csv(pointer, nameoffile, [feature1, feature2, feature3])

# Think about the attributes to be imported, convert to pandas, make a
# dataframe, apply stemming to the tweet texts, and save them.
with open(input_file) as fp:
    json2csv(fp, 'tweets_dataframe.csv',
             ['id', 'text', 'user.favourites_count', 'user.id', 'lang',
              'user.followers_count', 'user.verified', 'truncated'])
#json, csv
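# The comment above describes the follow-up steps (load the CSV into pandas,
# stem the tweet texts, save the result) without showing them. A minimal
# sketch under those assumptions, assuming json2csv wrote the field names as
# the CSV header row (not the original code):
import pandas as pd
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

porter = PorterStemmer()

df = pd.read_csv('tweets_dataframe.csv')

# Stem every token of the tweet text and keep the result in a new column.
df['stemmed_text'] = df['text'].apply(
    lambda t: ' '.join(porter.stem(tok) for tok in word_tokenize(str(t))))

df.to_csv('tweets_dataframe_stemmed.csv', index=False)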
from nltk.corpus import twitter_samples, TwitterCorpusReader
import numpy as np
import matplotlib.pyplot as plt
from NBClassifier import NBClassifier
from SCClassifier import SCClassifier
from BGClassifier import BGClassifier
from sklearn.metrics import roc_curve, auc
import os
import pickle

# settings
fileIds = twitter_samples.fileids()
root = twitter_samples.root

negReader = TwitterCorpusReader(root, fileIds[0])
negTwt = []
posReader = TwitterCorpusReader(root, fileIds[1])
posTwt = []

for tweet in negReader.docs():
    negTwt.append(tweet['text'])
for tweet in posReader.docs():
    posTwt.append(tweet['text'])

posInd = np.random.permutation(len(posTwt))
negInd = np.random.permutation(len(negTwt))

X_1 = np.array([])
X_2 = np.array([])
X_3 = np.array([])
Y = np.array([])
NB_auc = np.zeros((5, 1))
tokenTweet = tknzr.tokenize(tweet)
    j = 0
    k = 0
    while j < (len(tokenTweet) - k):
        #print j
        if tokenTweet[j][0] == "#":
            tokenTweet[j] = tokenTweet[j][1:]
        elif tokenTweet[j][0] == "@":
            del tokenTweet[j]
            j -= 1
            k += 1
        j += 1
    info.append((word_feats(tokenTweet), classification))


ids = twitter_samples.fileids()

neg = 0
pos = 1
negtweets = "negtweets.txt"
postweets = "postweets.txt"

# tags unused
neginfo = []
posinfo = []
negtags = []
postags = []

getTweetTokens('neg', ids[0], neginfo, negtags)
getTweetTokens('pos', ids[1], posinfo, postags)

##for i in range(2):
##    print str(posinfo[i])
'''
This is sentiment analysis from nltk samples and corpora
Twitter samples will be used as data
'''
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize
import random

# Collect data
print(twitter_samples.fileids())


def create_word_features(words):
    useful_words = [
        word for word in words if word not in stopwords.words('english')
    ]
    my_dict = dict([(word, True) for word in useful_words])
    return my_dict


## tweets collection
neg_strings = twitter_samples.strings('negative_tweets.json')
neg_tweets = []
for i, string in enumerate(neg_strings):
    # clean out smileys in strings
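# The loop above is cut off at the "clean out smileys" comment. A minimal
# sketch of how that loop might look in full (the emoticon regex and the reuse
# of create_word_features are assumptions, not the original code):
import re

# Strip common ASCII emoticons such as :) :-( ;D before tokenizing.
smiley_pattern = re.compile(r"[:;=8][\-o\*\']?[\)\(\]\[dDpP/\\]")

for string in neg_strings:
    cleaned = smiley_pattern.sub('', string)
    words = word_tokenize(cleaned)
    neg_tweets.append((create_word_features(words), 'negative'))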
from collections import defaultdict

# First, open the NLTK downloader (NLTK itself is installed separately)
nltk.download()
# Secondly, download the twitter_samples corpus
nltk.download('twitter_samples')
# Finally, download the stopwords
nltk.download('stopwords')

# The twitter_samples corpus contains 3 files: 5,000 positive tweets, 5,000
# negative tweets, and 20,000 mixed positive and negative tweets.
# For this project we will only be using the 10,000-tweet subset already
# available in the nltk.corpus module, i.e. the files of 5,000 positive and
# 5,000 negative tweets.
print("Different type of tweet =>", twitter_samples.fileids())

pos_tweets = twitter_samples.strings('positive_tweets.json')
print("Len of POSITIVE tweet", len(pos_tweets))  # output: 5000

neg_tweets = twitter_samples.strings('negative_tweets.json')
print("Len of NEGATIVE tweet", len(neg_tweets))  # output: 5000

all_tweets = twitter_samples.strings('tweets.20150430-223406.json')
print("Length of TOTAL tweet from tweets.20150430-223406.json", len(all_tweets))  # output: 20000

# We import the TweetTokenizer module first and then tokenize (split the text into a list of tokens)
tweet_tokenizer = TweetTokenizer(preserve_case=False,
                                 strip_handles=True,
                                 reduce_len=True)

# Denoise the tweets by removing $GE-style stock tickers, the RT marker,
# hyperlinks, the '#' symbol, stopwords (a, and, the, is, are, etc.), emoticons
# and punctuation, and then reduce each word to its stem/base form using the
# Porter stemming algorithm.
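# The comment above describes the denoising and stemming step without showing
# it. A minimal sketch of one way to implement it (the helper name remove_noise
# and the exact regexes are assumptions, not the project's code; emoticon
# handling is omitted):
import re
import string

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stop_words = stopwords.words('english')
stemmer = PorterStemmer()

def remove_noise(tweet):
    # Strip stock tickers ($GE), the RT marker, hyperlinks and the '#' sign.
    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?://\S+', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    cleaned = []
    for token in tweet_tokenizer.tokenize(tweet):
        # Drop stopwords and punctuation, then stem whatever is left.
        if token not in stop_words and token not in string.punctuation:
            cleaned.append(stemmer.stem(token))
    return cleaned

print(remove_noise(pos_tweets[0]))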