def create_and_train():
    # Corpus loads; only the cleaned token lists below feed the model.
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    positive_cleaned_tokens_list = tokenize('positive_tweets.json')
    negative_cleaned_tokens_list = tokenize('negative_tweets.json')

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    # 10,000 tweets in total: 7,000 for training, the rest held out.
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)
    return classifier
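# A minimal usage sketch, assuming the helpers above plus NLTK's word_tokenize
# are in scope; the sample tweet is made up.
classifier = create_and_train()
custom_tokens = word_tokenize("Loving the new release, great work!")
print(classifier.classify(dict([token, True] for token in custom_tokens)))
# prints "Positive" or "Negative"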
def trainmodel():
    pos_tweets = twitter_samples.strings('positive_tweets.json')
    neg_tweets = twitter_samples.strings('negative_tweets.json')
    random.seed(1)

    print("Training Phase")
    pos_tweets_set = []
    for tweet in pos_tweets:
        tweet = process_data(cleanText(tweet))
        pos_tweets_set.append((tweet, 'pos'))
    print("POS added in positive tweets")

    neg_tweets_set = []
    for tweet in neg_tweets:
        tweet = process_data(cleanText(tweet))
        neg_tweets_set.append((tweet, 'neg'))
    print("NEG added in negative tweets")

    shuffle(pos_tweets_set)
    shuffle(neg_tweets_set)

    # test_set = pos_tweets_set[:100] + neg_tweets_set[:100]
    train_set = pos_tweets_set[100:2000] + neg_tweets_set[100:2000]

    print("Training started by naive bayes classifier")
    classifier = NaiveBayesClassifier(train_set)
    print("Training finished")
    # accuracy = classifier.accuracy(test_set)
    # print(accuracy * 100)
    # print("Server started with model accuracy: " + str(accuracy * 100))
    return classifier
def fetch_twitter_samples():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    return positive_tweets, negative_tweets, text
def twitter_data_training(self):
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    feature_set = []

    # Cap each class at 2,000 tweets; each tweet contributes one feature dict
    # labelled with its own class and one labelled with the opposite class.
    i = 0
    for tweets in positive_tweets:
        i += 1
        print("twitterpos%s" % i)
        if i == 2000:
            break
        words = self.clean_words(wordpunct_tokenize(tweets))
        feature_set.append((self.create_feature_set(words)[0], 'pos'))
        feature_set.append((self.create_feature_set(words)[1], 'neg'))

    i = 0
    for tweets in negative_tweets:
        i += 1
        print("twitterneg%s" % i)
        if i == 2000:
            break
        words = self.clean_words(wordpunct_tokenize(tweets))
        feature_set.append((self.create_feature_set(words)[0], 'neg'))
        feature_set.append((self.create_feature_set(words)[1], 'pos'))

    random.shuffle(feature_set)
    training_set = feature_set[:8000]
    return training_set
def SentimentML(df):
    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')

    # 4,000 tweets of each class for training, the remaining 1,000 for testing.
    test_pos = all_positive_tweets[4000:]
    train_pos = all_positive_tweets[:4000]
    test_neg = all_negative_tweets[4000:]
    train_neg = all_negative_tweets[:4000]

    train_x = train_pos + train_neg
    test_x = test_pos + test_neg
    train_y = np.append(np.ones((len(train_pos), 1)),
                        np.zeros((len(train_neg), 1)), axis=0)
    test_y = np.append(np.ones((len(test_pos), 1)),
                       np.zeros((len(test_neg), 1)), axis=0)

    freqs = build_freqs(train_x, train_y)

    # Three features per tweet (see the sketch below this function).
    X = np.zeros((len(train_x), 3))
    for i in range(len(train_x)):
        X[i, :] = extract_features(train_x[i], freqs)
    Y = train_y

    J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
    tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)

    sentiment_l = []
    for text in df['totaltext'].tolist():
        y_hat = predict_tweet(text, freqs, theta)
        if y_hat > 0.5:
            sentiment_l.append('Positive sentiment')
        else:
            sentiment_l.append('Negative sentiment')
    df.insert(6, 'sentiment_description', sentiment_l)
    return df
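# build_freqs, extract_features, gradientDescent and predict_tweet are defined
# elsewhere in this project. As a rough illustration of why X has exactly three
# columns, a common formulation (an assumption here, not necessarily this
# project's exact code) maps each tweet to
# [bias, sum of positive word counts, sum of negative word counts]:
import numpy as np

def extract_features_sketch(tweet_tokens, freqs):
    """Hypothetical 3-feature extractor: [1, pos_freq_sum, neg_freq_sum]."""
    x = np.zeros(3)
    x[0] = 1.0  # bias term
    for word in tweet_tokens:
        x[1] += freqs.get((word, 1.0), 0)  # count of word among positive tweets
        x[2] += freqs.get((word, 0.0), 0)  # count of word among negative tweets
    return x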
def lemm(words):
    lemmatizer = WordNetLemmatizer()
    words_lemm = []
    for word in words:
        words_lemm.append(lemmatizer.lemmatize(word))
    text_tagged = pos_tag(words_lemm)
    # print(text_tagged)

    # Keep negation- and degree-bearing words that would otherwise be
    # discarded as stopwords.
    stopset = set(stopwords.words('english')) - set(
        ('over', 'under', 'below', 'more', 'most', 'no', 'not', 'only',
         'such', 'few', 'so', 'too', 'very', 'just', 'any', 'once'))

    from nltk.corpus import twitter_samples
    print(stopset)
    pos_tweets = twitter_samples.strings('positive_tweets.json')
    print(pos_tweets)
    neg_tweets = twitter_samples.strings('negative_tweets.json')
    print(neg_tweets)
    if len(pos_tweets) == len(neg_tweets):
        print("Same length")
    else:
        print("Different lengths")
def __init__(self):
    pos = twitter_samples.strings('positive_tweets.json')
    neg = twitter_samples.strings('negative_tweets.json')
    self.x = pos + neg
    self.y = np.append(np.ones(len(pos)), np.zeros(len(neg)))
    self.freqs = self.count_tweets()
    self.logprior, self.loglikelihood = self.NB_train()
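# count_tweets and NB_train are this class's own methods. Assuming the usual
# Naive Bayes parameterisation (a scalar logprior plus a loglikelihood dict
# keyed by word), prediction reduces to a sum of log-ratios. This is a sketch
# under that assumption, not the class's actual predict method:
def nb_predict_sketch(tweet_words, logprior, loglikelihood):
    """Score > 0 means the model leans positive, < 0 negative."""
    score = logprior
    for word in tweet_words:
        score += loglikelihood.get(word, 0.0)  # unseen words contribute nothing
    return score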
def trainRandomForest():
    stop_words = stopwords.words('english')
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')

    X = positive_tweets + negative_tweets
    positives = np.ones([len(positive_tweets), 1])
    negatives = np.zeros([len(negative_tweets), 1])
    y = np.concatenate([positives, negatives])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, shuffle=True)

    # Tokenize, strip noise, vectorize with TF-IDF, then fit a random forest.
    pipe = Pipeline([
        ('tokenize', FunctionTransformer(tokenizeIt)),
        ('noise', FunctionTransformer(removeIt)),
        ('tfidf', TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7)),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=1))
    ])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))

    with open('trainedpipe.pkl', 'wb') as f:
        pickle.dump(pipe, f)
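# Usage sketch: reload the pickled pipeline written above and score new,
# made-up tweets (labels come back as 1.0 = positive, 0.0 = negative),
# assuming the project's tokenizeIt/removeIt helpers are importable.
import pickle

with open('trainedpipe.pkl', 'rb') as f:
    pipe = pickle.load(f)
print(pipe.predict(["what a great day :)", "this app keeps crashing"]))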
def deal_trainset(stop_words, english_punctuations):
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
    for pos_sen in positive_tweets:
        positive_cleaned_tokens_list.append(
            Cleaner(Tokenization(pos_sen), stop_words, english_punctuations))
    for neg_sen in negative_tweets:
        negative_cleaned_tokens_list.append(
            Cleaner(Tokenization(neg_sen), stop_words, english_punctuations))
    print(positive_cleaned_tokens_list)

    # Convert each token list into the {token: True} format NLTK classifiers expect.
    pos_model = []
    for pos_sen in positive_cleaned_tokens_list:
        pos_model.append(dict([word, True] for word in pos_sen))
    pos_dataset = [(pos_dict, "Positive") for pos_dict in pos_model]

    neg_model = []
    for neg_sen in negative_cleaned_tokens_list:
        neg_model.append(dict([word, True] for word in neg_sen))
    neg_dataset = [(neg_dict, "Negative") for neg_dict in neg_model]

    return pos_dataset, neg_dataset
def trainNaiveBayesClassifier(self):
    from nltk.corpus import twitter_samples
    pos_tweets = twitter_samples.strings('positive_tweets.json')
    neg_tweets = twitter_samples.strings('negative_tweets.json')

    # Positive tweets feature set.
    pos_tweets_set = []
    for tweet in pos_tweets:
        pos_tweets_set.append((self.bag_of_words(tweet), 'pos'))

    # Negative tweets feature set.
    neg_tweets_set = []
    for tweet in neg_tweets:
        neg_tweets_set.append((self.bag_of_words(tweet), 'neg'))

    # Randomize pos_tweets_set and neg_tweets_set; doing so yields a
    # different accuracy result every time the program runs.
    shuffle(pos_tweets_set)
    shuffle(neg_tweets_set)

    test_set = pos_tweets_set[:1000] + neg_tweets_set[:1000]
    train_set = pos_tweets_set[1000:] + neg_tweets_set[1000:]

    self.classifier = NaiveBayesClassifier.train(train_set)
def preprocess_data(self):
    self.pos_tweets = twitter_samples.strings('positive_tweets.json')
    self.neg_tweets = twitter_samples.strings('negative_tweets.json')
    self.text = twitter_samples.strings('tweets.20150430-223406.json')

    self.pos_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    self.neg_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    self.pos_cleaned_tokens_list = [
        self.remove_noise(tokens) for tokens in self.pos_tweet_tokens
    ]
    self.neg_cleaned_tokens_list = [
        self.remove_noise(tokens) for tokens in self.neg_tweet_tokens
    ]

    self.all_pos_words = self.get_all_words(self.pos_cleaned_tokens_list)
    self.all_neg_words = self.get_all_words(self.neg_cleaned_tokens_list)
    self.freq_dist_pos = FreqDist(self.all_pos_words)
    self.freq_dist_neg = FreqDist(self.all_neg_words)

    self.pos_tokens_for_model = self.get_tweets_for_model(self.pos_cleaned_tokens_list)
    self.neg_tokens_for_model = self.get_tweets_for_model(self.neg_cleaned_tokens_list)

    self.pos_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in self.pos_tokens_for_model]
    self.neg_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in self.neg_tokens_for_model]

    self.dataset = self.pos_dataset + self.neg_dataset
    random.shuffle(self.dataset)

    # 50/50 split between training and test data.
    mid = len(self.dataset) // 2
    self.train_data = self.dataset[:mid]
    self.test_data = self.dataset[mid:]
def bayes_classifier(filename):
    """Naive Bayes classifier with add-1 smoothing for the specified file of tweets."""
    output_file = open('shooting.txt', 'w', encoding='utf-8')
    num_pos_predictions = 0
    num_neg_predictions = 0

    k = (pos_fd + neg_fd).B()  # total number of bins (vocabulary size)
    pos_count = len(twitter_samples.strings('positive_tweets.json'))  # number of positive tweets
    neg_count = len(twitter_samples.strings('negative_tweets.json'))  # number of negative tweets
    # Work in log space to cope with very small probabilities.
    log_prior_pos = math.log(pos_count / (pos_count + neg_count))
    log_prior_neg = math.log(neg_count / (pos_count + neg_count))

    # Open the specified JSON file and load each line as a separate tweet.
    tweets = []
    with open(filename) as f:
        for line in f:
            tweets.append(json.loads(line))

    # Perform the sentiment analysis on each tweet.
    for tweet in tweets:
        total_log_prob_pos = log_prior_pos
        total_log_prob_neg = log_prior_neg
        tokens = tweet_tokenizer.tokenize(tweet["text"])  # raw tweet text
        for token in tokens:
            # Add-1 smoothed class-conditional probability: (count + 1) / (N + V).
            total_log_prob_neg += math.log((neg_fd[token] + 1) / (neg_fd.N() + k))
            total_log_prob_pos += math.log((pos_fd[token] + 1) / (pos_fd.N() + k))
        if total_log_prob_pos > total_log_prob_neg:  # more likely to be positive
            num_pos_predictions += 1
            print('pos', file=output_file)  # record to the output file
        else:
            num_neg_predictions += 1
            print('neg', file=output_file)

    print('\nnumber of positive tweets: ', num_pos_predictions, file=output_file)
    print('number of negative tweets: ', num_neg_predictions, file=output_file)
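# A tiny worked example of add-1 smoothing with made-up counts: if a token is
# seen 20 times among N = 10,000 tokens of a class and the shared vocabulary
# has V = 5,000 bins, the smoothed probability stays strictly positive even
# for unseen tokens.
import math

count, N, V = 20, 10000, 5000
p_smoothed = (count + 1) / (N + V)  # 21 / 15000 = 0.0014
p_unseen = (0 + 1) / (N + V)        # unseen tokens keep a non-zero floor
print(math.log(p_smoothed), math.log(p_unseen))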
def driver():
    # String variables of the dataset.
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    stop_words = stopwords.words('english')

    # Tokenized variables of the dataset.
    # tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    # Cleaning the noise in the tweet tokens.
    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    # Frequency distribution of the cleaned words.
    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)

    # Creating positive and negative feature dictionaries.
    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    # Creating the final dataset for training.
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]
    dataset = positive_dataset + negative_dataset

    # Shuffling for pseudo-randomness, to avoid ordering bias.
    random.shuffle(dataset)

    # Dividing the shuffled data into train and test sets.
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    # Initializing the classifier.
    classifier = NaiveBayesClassifier.train(train_data)

    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    print(custom_tweet,
          classifier.classify(dict([token, True] for token in custom_tokens)))
def main_code(analysis_input):
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    # print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)
    # print("Accuracy is:", classify.accuracy(classifier, test_data))
    # print(classifier.show_most_informative_features(10))

    custom_tokens = remove_noise(word_tokenize(analysis_input))
    pos_or_neg = str(classifier.classify(dict([token, True] for token in custom_tokens)))
    return pos_or_neg
def train_model():
    """
    Trains a Naive Bayes sentiment classifier using the twitter_samples
    dataset from NLTK. Each tweet is tokenized and cleaned to produce a
    training dataset for the machine learning model.

    Returns
    -------
    NaiveBayesClassifier
    """
    # Load dataset from nltk data.
    positive_tweets = twitter_samples.strings("positive_tweets.json")
    negative_tweets = twitter_samples.strings("negative_tweets.json")

    # Retrieve English stop words.
    stop_words = stopwords.words("english")

    # Tweet tokenization.
    positive_tweet_tokens = twitter_samples.tokenized("positive_tweets.json")
    negative_tweet_tokens = twitter_samples.tokenized("negative_tweets.json")

    # Token cleaning.
    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    # Extract words from tokens and build a frequency distribution.
    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    # Create labelled datasets.
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    # Merge the individual datasets into a single training set; note that all
    # 10,000 tweets are used for training, with no held-out test split.
    dataset = positive_dataset + negative_dataset
    train_data = dataset

    classifier = NaiveBayesClassifier.train(train_data)
    return classifier
def all_data():
    if os.path.exists(r'C:\Users\baiyang01\AppData\Roaming\nltk_data\corpora\twitter_samples'):
        print('Files already exist.')
    else:
        print("I'm downloading the file")
        nltk.download('twitter_samples')

    # Select the sets of positive and negative tweets.
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    return positive_tweets, negative_tweets
def calibrate():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    stop_words = stopwords.words('english')

    # Files downloaded from setup.py, used to calibrate the classifier for
    # sentiment analysis.
    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    # print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")  # calibrating positive
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")  # calibrating negative
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    global classifier
    classifier = NaiveBayesClassifier.train(train_data)  # trains the data!
    print("Calibration complete!")
    print("Accuracy is:", classify.accuracy(classifier, test_data))
def __get_labeled_tweets(self):
    """
    Get labeled tweets from nltk.
    :return: cleaned list of lists that contain the tokens for each tweet
    """
    pos_samples = twitter_samples.strings('positive_tweets.json')[:self.__sample_size]
    neg_samples = twitter_samples.strings('negative_tweets.json')[:self.__sample_size]
    # print(pos_samples + neg_samples)  # show raw tweet samples from nltk
    return self.algorithm.process_tweets(pos_samples + neg_samples)
def upload():
    pos_tweets = twitter_samples.strings('positive_tweets.json')
    neg_tweets = twitter_samples.strings('negative_tweets.json')
    all_tweets = twitter_samples.strings('tweets.20150430-223406.json')

    # pos_tweets_set and neg_tweets_set are module-level lists.
    for tweet in pos_tweets:
        pos_tweets_set.append((bag_of_words(tweet), 'pos'))
    for tweet in neg_tweets:
        neg_tweets_set.append((bag_of_words(tweet), 'neg'))

    # Report the total count in the Tkinter text widget.
    text.delete('1.0', END)
    text.insert(END, "NLTK Total No Of Tweets Found : " +
                str(len(pos_tweets_set) + len(neg_tweets_set)) + "\n")
def load_data():
    nltk.download('twitter_samples')
    nltk.download('stopwords')
    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')
    X = all_positive_tweets + all_negative_tweets
    y = np.append(np.ones((len(all_positive_tweets), 1)),
                  np.zeros((len(all_negative_tweets), 1)), axis=0)
    return X, y
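# Usage sketch: the stock corpus ships 5,000 tweets per class, so X should
# hold 10,000 strings and y a (10000, 1) array of 1.0/0.0 labels.
X, y = load_data()
print(len(X), y.shape)  # expected: 10000 (10000, 1)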
def test_corpus_twitter_method_returns_correct_result(self):
    self.assertEqual(twitter_samples.fileids(), [
        'negative_tweets.json', 'positive_tweets.json',
        'tweets.20150430-223406.json'
    ])
    self.assertEqual(
        twitter_samples.strings('negative_tweets.json')[0],
        'hopeless for tmr :(')
    self.assertEqual(
        twitter_samples.strings('positive_tweets.json')[0],
        '#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'
    )
def train_social():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    # tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    # Classifier - TODO: add persistence.
    classifier = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    print(classifier.show_most_informative_features(100))

    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    print(custom_tweet,
          classifier.classify(dict([token, True] for token in custom_tokens)))
    return classifier
def train():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    print(classifier.show_most_informative_features(10))

    # Persist the trained classifier.
    with open('classifier.pickle', 'wb') as f:
        pickle.dump(classifier, f)
def __init__(self):
    nltk.download('twitter_samples')
    nltk.download('stopwords')
    nltk.download('wordnet')
    pos_tweets = twitter_samples.strings('positive_tweets.json')
    neg_tweets = twitter_samples.strings('negative_tweets.json')
    self.stopwords_english = stopwords.words('english')
    self.lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    self.classifier = self.generateModel(pos_tweets, neg_tweets)
    with open('melbourne_suburbs.geojson') as f:
        self.geoJs = json.load(f)
def ml_model():
    pos_tweets = twitter_samples.strings('positive_tweets.json')
    pos_tweets = vocab_gen(pos_tweets, 'pos')
    neg_tweets = twitter_samples.strings('negative_tweets.json')
    neg_tweets = vocab_gen(neg_tweets, 'neg')
    # Note: 'neutral_tweets.json' is not part of the stock twitter_samples
    # corpus; it must be a project-supplied file added to the nltk_data directory.
    neu_tweets = twitter_samples.strings('neutral_tweets.json')
    neu_tweets = vocab_gen(neu_tweets, 'neu')

    test_set = pos_tweets[:1000] + neg_tweets[:1000] + neu_tweets[:1000]
    train_set = pos_tweets[1000:] + neg_tweets[1000:] + neu_tweets[1000:]

    classifier = NaiveBayesClassifier.train(train_set)
    accuracy = classify.accuracy(classifier, test_set)
    print(accuracy)
    joblib.dump(classifier, 'twitter_sent.pkl')
def train_twtr_classifier():
    # Reuse a previously saved classifier if one exists.
    if os.path.isfile(SAVED_CLSR_LOC):
        return load_classifier()

    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    stop_words = stopwords.words('english')

    pos_twt_toks = twitter_samples.tokenized('positive_tweets.json')
    neg_twt_toks = twitter_samples.tokenized('negative_tweets.json')
    positive_cleaned_tokens_list = [
        remove_noise(toks, stop_words) for toks in pos_twt_toks
    ]
    negative_cleaned_tokens_list = [
        remove_noise(toks, stop_words) for toks in neg_twt_toks
    ]

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    save_classifier(classifier)
    return classifier
def __init__(self):
    # Gather data.
    positive = twitter_samples.strings('positive_tweets.json')
    negative = twitter_samples.strings('negative_tweets.json')
    self.stop_words = list(set(stopwords.words('english')))
    positive_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tokens = twitter_samples.tokenized('negative_tweets.json')

    # Clean the data.
    positive_clean = []
    negative_clean = []
    for token in positive_tokens:
        positive_clean.append(self.clean(token))
    for token in negative_tokens:
        negative_clean.append(self.clean(token))
    positive_model_tokens = self.final_token_generator(positive_clean)
    negative_model_tokens = self.final_token_generator(negative_clean)

    # Use the generator output to make labelled datasets.
    positive_dataset = [(token, "Positive") for token in positive_model_tokens]
    negative_dataset = [(token, "Negative") for token in negative_model_tokens]
    dataset = positive_dataset + negative_dataset

    # Shake it all about.
    random.shuffle(dataset)
    random.shuffle(dataset)
    random.shuffle(dataset)

    # Split them up.
    training = dataset[:7000]
    testing = dataset[7000:]

    # Train the classifier.
    self.classifier = NaiveBayesClassifier.train(training)
def __init__(self):
    pos_tweets = [(x, 'Positive')
                  for x in twitter_samples.strings('positive_tweets.json')]
    neg_tweets = [(x, 'Negative')
                  for x in twitter_samples.strings('negative_tweets.json')]
    full_dataset = pos_tweets + neg_tweets
    random.shuffle(full_dataset)
    dataset_size = len(full_dataset)

    # Have to divide the dataset: larger datasets can result in a SIGKILL,
    # probably due to limited memory in the docker container (needs further
    # investigation).
    train_size = dataset_size // 5
    train_dataset = full_dataset[:train_size]
    self.nb_classifier = NaiveBayesClassifier(train_dataset)
class preprocessing:
    # Select the lists of positive and negative tweets.
    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')

    # Concatenate the lists: the positive tweets followed by the negative ones.
    tweets = all_positive_tweets + all_negative_tweets

    # Make a numpy array representing the labels of the tweets.
    labels = np.append(np.ones(len(all_positive_tweets)),
                       np.zeros(len(all_negative_tweets)))

    # Preprocess every tweet.
    processed_tweets = []
    for tweet in tweets:
        processed_tweets.append(process_tweet(tweet))

    # Create the frequency dictionary.
    freqs = build_freqs(processed_tweets, labels)

    # List representing our table of word counts; each element is a sublist
    # with this pattern: [<word>, <positive_count>, <negative_count>].
    data = []
    # Assumption: iterate over every word that appears in freqs (this snippet
    # did not define the word list `keys` itself).
    keys = {word for (word, _) in freqs}
    for word in keys:
        # Initialize positive and negative counts.
        pos = 0
        neg = 0
        # Retrieve the number of positive counts.
        if (word, 1) in freqs:
            pos = freqs[(word, 1)]
        # Retrieve the number of negative counts.
        if (word, 0) in freqs:
            neg = freqs[(word, 0)]
        # Append the word counts to the table.
        data.append([word, pos, neg])

    def get_preprocessed_data():
        return preprocessing.processed_tweets

    def get_freqs_dict():
        return preprocessing.freqs

    def get_freqs_table():
        return preprocessing.data
def data_preprocess():
    # Get the sets of positive and negative tweets.
    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')

    # Split the data into two pieces, one for training and one for testing
    # (validation set).
    test_pos = all_positive_tweets[4000:]
    train_pos = all_positive_tweets[:4000]
    test_neg = all_negative_tweets[4000:]
    train_neg = all_negative_tweets[:4000]

    train_x = train_pos + train_neg
    test_x = test_pos + test_neg

    # Build labels from the actual split sizes, avoiding assumptions about
    # the length of all_positive_tweets.
    train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
    test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

    return train_x, train_y, test_x, test_y
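# Usage sketch: with 5,000 tweets per class in the stock corpus, the split
# above yields 8,000 training and 2,000 test examples.
train_x, train_y, test_x, test_y = data_preprocess()
print(len(train_x), train_y.shape, len(test_x), test_y.shape)
# expected: 8000 (8000,) 2000 (2000,)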
def corpusreader_demo():
    """
    Use :class:`TwitterCorpusReader` to read a file of tweets, and print out

    * some full tweets in JSON format;
    * some raw strings from the tweets (i.e., the value of the `text` field); and
    * the result of tokenizing the raw strings.
    """
    from nltk.corpus import twitter_samples as tweets

    print()
    print("Complete tweet documents")
    print(SPACER)
    for tweet in tweets.docs("tweets.20150430-223406.json")[:1]:
        print(json.dumps(tweet, indent=1, sort_keys=True))

    print()
    print("Raw tweet strings:")
    print(SPACER)
    for text in tweets.strings("tweets.20150430-223406.json")[:15]:
        print(text)

    print()
    print("Tokenized tweet strings:")
    print(SPACER)
    for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]:
        print(toks)
def getFeatures(numWordsToUse):
    # These emoticons cover 98.6% of the training data.
    emoticons = [':)', '(:', ': )', '( :', '=)', '(=', '= )', '( =',
                 ':D', ': D', ':p', ': p',
                 ':(', '):', ': (', ') :', '=(', ')=', '= (', ') =',
                 ':-)', '(-:', ':- )', '( -:', ':-(', ')-:', ':- (', ') -:']
    emoticons = set(emoticons)

    # NLTK ships its own twitter corpus with positive and negative messages.
    positiveTweets = twitter_samples.strings('positive_tweets.json')
    negativeTweets = twitter_samples.strings('negative_tweets.json')
    positiveSentiment = [1 for x in positiveTweets]
    negativeSentiment = [0 for x in negativeTweets]

    tweets = positiveTweets + negativeTweets
    sentiment = positiveSentiment + negativeSentiment

    tokenizedTweets, cleanedSentiment = utils.tokenize(tweets, sentiment)

    # Strip emoticons from each tweet and count how many lines contained one.
    cleanedTweets = []
    linesCleaned = 0
    for tweet in tokenizedTweets:
        replacedEmoticon = 0
        cleanedTweet = []
        for word in tweet:
            if word not in emoticons:
                cleanedTweet.append(word)
            else:
                replacedEmoticon = 1
        cleanedTweets.append(cleanedTweet)
        linesCleaned += replacedEmoticon

    global popularWords
    formattedTweets, sentiment, popularWords = utils.nlpFeatureEngineering(
        cleanedTweets, cleanedSentiment, 0, numWordsToUse, 'counts')

    # Transform the list of dictionaries into a sparse matrix.
    sparseFeatures = dv.fit_transform(formattedTweets)
    return sparseFeatures, sentiment
def __init__(self):
    twitter = twitter_samples.strings('tweets.20150430-223406.json')
    news = brown.words(categories='news')
    twitter = Tweet.Tweet(text=' '.join(twitter)).stem()
    news = Tweet.Tweet(text=' '.join(news)).stem()
    self.twitter_freq = nltk.FreqDist(twitter)
    self.tsum = len(twitter)
    self.news_freq = nltk.FreqDist(news)
    self.nsum = len(news)
def main():
    """
    Replace the twitter_samples corpus with other corpora, or use your own
    text file like so:
        f = open(filename)
        t = f.read()
    """
    from nltk.corpus import twitter_samples

    words = []
    for sentence in twitter_samples.strings():
        words += nltk.word_tokenize(sentence) + ["."]
    chain = WordChain(words)
    print(chain.build_sentence())
def getFeatures(numWordsToUse):
    # NLTK ships its own twitter corpus with positive and negative messages.
    positiveTweets = twitter_samples.strings('positive_tweets.json')
    negativeTweets = twitter_samples.strings('negative_tweets.json')
    positiveSentiment = [1 for x in positiveTweets]
    negativeSentiment = [0 for x in negativeTweets]

    tweets = positiveTweets + negativeTweets
    sentiment = positiveSentiment + negativeSentiment

    tokenizedTweets, cleanedSentiment = utils.tokenize(tweets, sentiment)

    global popularWords
    formattedTweets, sentiment, popularWords = utils.nlpFeatureEngineering(
        tokenizedTweets, cleanedSentiment, 0, numWordsToUse, 'counts')

    # Transform the list of dictionaries into a sparse matrix.
    sparseFeatures = dv.fit_transform(formattedTweets)
    return sparseFeatures, sentiment
testing = allFeatures[len(allFeatures) // 2:]
subject_classifier = nltk.NaiveBayesClassifier.train(training)

# Load (label, subject) pairs from the filter file.
with open('filter.txt', 'r') as fh:
    labelSubjects = []
    for line in fh:
        labelSubject = line.split(',')
        label = labelSubject[0].strip()
        subject = labelSubject[1].strip()
        labelSubjects.append((label, subject))

# Sentiment classifier.
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')
pos_tuples = [(string, 'positive') for string in pos_tweets]
neg_tuples = [(string, 'negative') for string in neg_tweets]

# Filter out words that are user names, shorter than 3 characters,
# or http addresses.
tweets = []
for (words, sentiment) in pos_tuples + neg_tuples:
    words_filtered = [e.lower() for e in words.split()
                      if len(e) >= 3 and e[0] != '@' and e[0:4] != 'http']
    tweets.append((words_filtered, sentiment))

word_features = get_word_features(get_words_in_tweets(tweets))
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.corpus import twitter_samples
import unicodedata
from pymongo import MongoClient

strings_negative = twitter_samples.strings('negative_tweets.json')
strings_positive = twitter_samples.strings('positive_tweets.json')

# Normalize to ASCII and split into tokens, labelling negatives -1 and positives 1.
neg_docs = [(unicodedata.normalize('NFKD', sent).encode('ascii', 'ignore').split(), -1)
            for sent in strings_negative]
pos_docs = [(unicodedata.normalize('NFKD', sent).encode('ascii', 'ignore').split(), 1)
            for sent in strings_positive]
# print(len(neg_docs), len(pos_docs))
# print(neg_docs[0])

train_neg_docs = neg_docs[:5000]
train_pos_docs = pos_docs[:5000]
training_docs = train_neg_docs + train_pos_docs

sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
training_set = sentim_analyzer.apply_features(training_docs)

trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os

from nltk.twitter import Twitter

# nltk.twitter expects the TWITTER environment variable to point at a
# directory holding the credentials file.
var = os.environ
os.environ["TWITTER"] = "C:/Users/admin/Documents/twitter-files"

tw = Twitter()
tw.tweets(keywords='algeria, algerie', limit=10)  # sample from the public stream

from nltk.corpus import twitter_samples

strings = twitter_samples.strings('tweets.20150430-223406.json')
for string in strings[:15]:
    print(string)
## Example:
# 1) Scrape tweets from Twitter that have #bucs, #buccaneers, or #siegetheday
#    in their text and are in English
# 2) Save these tweets as a row to a .csv file
import csv

import twitterscraper

with open('nflbucs.csv', 'a', newline='', encoding='utf-8') as fil:
    writer = csv.writer(fil)
    for tweet in twitterscraper.query_tweets(
            "%23bucs%20OR%20%23buccaneers%20OR%20%23siegetheday%20lang%3Aen%20include%3Aretweets",
            1000):
        writer.writerow(tweet)

#### Train classifier based on tweet data
# 0) Load data and setup
from nltk.corpus import twitter_samples

## Take a sample of the data.
twitter_samples.strings('positive_tweets.json')[1]
twitter_samples.strings('negative_tweets.json')[1]

## Create function word_feats() to turn a token list into a feature dictionary.
def word_feats(words):
    return dict([(word, True) for word in words])

# 1) a) Tokenize tweets from the sample data
#    b) Use word_feats() to create a dictionary out of the tokenized words
#    c) Create list variables of positive and negative features from the
#       dictionaries in (b), appending the 'pos' or 'neg' label
import nltk

posfeats = [(word_feats(nltk.TweetTokenizer(preserve_case=False).tokenize(row)), 'pos')
            for row in twitter_samples.strings('positive_tweets.json')]
len(posfeats)  # check length - equivalent to the number of tweets

negfeats = [(word_feats(nltk.TweetTokenizer(preserve_case=False).tokenize(row)), 'neg')
            for row in twitter_samples.strings('negative_tweets.json')]
len(negfeats)  # check length - equivalent to the number of tweets
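# 2) A plausible next step (a sketch, not part of the original snippet):
#    split the feature lists into train/test sets and train NLTK's Naive
#    Bayes classifier on them, reporting held-out accuracy.
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

trainfeats = posfeats[:4000] + negfeats[:4000]
testfeats = posfeats[4000:] + negfeats[4000:]
classifier = NaiveBayesClassifier.train(trainfeats)
print('accuracy:', accuracy(classifier, testfeats))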