def data_set(self):
    """Build the training and testing tweet sets from the NLTK samples.

    The positive and negative sample files are converted to CSV files in
    the current working directory, parsed with ``parse_tweets_set`` and
    split per class with ``split_train_test``.

    :return: a ``(training_tweets, testing_tweets)`` tuple, each being the
        positive split followed by the negative split.
    """
    # NOTE(review): the halved value is written back to the instance, so
    # every call to this method halves ``self.n_tweets`` again -- confirm
    # that callers only invoke it once.
    if self.n_tweets is not None:
        self.n_tweets = int(self.n_tweets / 2)

    fields = ['id', 'text']

    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields,
                        limit=self.n_tweets)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields,
                        limit=self.n_tweets)

    neg_docs = parse_tweets_set(negative_csv, label='neg')
    pos_docs = parse_tweets_set(positive_csv, label='pos')

    # Each class is split separately to keep a balanced, uniform class
    # distribution in both the train and the test set.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs
    return training_tweets, testing_tweets
def setUp(self):
    """Prepare the fixture: the first 100 lines of the sample tweets file.

    Sets ``self.infile`` (list of raw lines), ``self.msg`` (assertion
    message) and ``self.subdir`` (the ``files`` directory next to this
    module).
    """
    sample_path = twitter_samples.abspath("tweets.20150430-223406.json")
    # The ``with`` block closes the handle; no explicit close() is needed.
    with open(sample_path) as infile:
        self.infile = [next(infile) for _ in range(100)]
    self.msg = "Test and reference files are not the same"
    self.subdir = os.path.join(os.path.dirname(__file__), 'files')
def load_twitter():
    """Export the sample tweets to CSV and parse them per polarity.

    Both sample files are written in full (``limit=None``) as CSV files in
    the current working directory and parsed with a case-folding
    ``TweetTokenizer``.

    :return: a ``(pos_docs, neg_docs)`` tuple.
    """
    lowercasing_tokenizer = TweetTokenizer(preserve_case=False)
    columns = ['id', 'text']

    pos_source = twitter_samples.abspath("positive_tweets.json")
    pos_target = 'positive_tweets.csv'
    json2csv_preprocess(pos_source, pos_target, columns, limit=None)

    neg_source = twitter_samples.abspath("negative_tweets.json")
    neg_target = 'negative_tweets.csv'
    json2csv_preprocess(neg_source, neg_target, columns, limit=None)

    negatives = parse_tweets_set(neg_target, label='neg',
                                 word_tokenizer=lowercasing_tokenizer)
    positives = parse_tweets_set(pos_target, label='pos',
                                 word_tokenizer=lowercasing_tokenizer)
    return positives, negatives
def read_single_tweets(path):
    """Return the tokenized English tweets stored in the JSON file *path*.

    The ``text`` field of every tweet is first exported to
    ``<dirname>_text/<basename>.csv``; each row detected as English is then
    passed through ``clean_and_tokenize``.

    :param path: path of a tweets JSON file.
    :return: list with one ``clean_and_tokenize`` result per English tweet;
        empty when *path* is not a file or processing fails.
    """
    word_list = []
    if not os.path.isfile(path):
        return word_list

    # NOTE(review): routing an arbitrary absolute path through the corpus
    # reader's abspath() looks unintended -- confirm it is needed.
    input_tweets = twitter_samples.abspath(os.path.abspath(path))
    output_tweets = os.path.join(
        os.path.dirname(path) + '_text', os.path.basename(path) + '.csv')
    os.makedirs(os.path.dirname(output_tweets), exist_ok=True)

    try:
        with open(input_tweets) as fp:
            json2csv(fp, output_tweets, ['text'])
        # Presumably json2csv is NLTK's, which writes UTF-8 by default;
        # read it back the same way. newline='' is what the csv module
        # expects for files handed to a reader.
        with open(output_tweets, 'r', encoding='utf-8', newline='') as fp:
            for row in csv.DictReader(fp):
                try:
                    tweet = row['text']
                    if detect(tweet) == 'en':
                        word_list.append(clean_and_tokenize(tweet))
                except lang_detect_exception.LangDetectException:
                    # Language could not be determined: skip this tweet.
                    continue
    except Exception:
        # Best effort: report the offending file and return what was
        # collected so far. Unlike a bare ``except``, this lets
        # KeyboardInterrupt / SystemExit propagate.
        print(path)
    return word_list
def demo_vader_tweets(n_instances=None, output=None):
    """
    Classify 10000 positive and negative tweets using Vader approach.

    Tweets whose Vader ``compound`` score is greater than 0 are labelled
    'pos', all others 'neg'. Accuracy plus per-label precision, recall and
    F-measure are printed and, if requested, written with
    ``output_markdown``.

    :param n_instances: the number of total tweets that have to be classified.
    :param output: the output file where results have to be reported.
    """
    from collections import defaultdict
    from nltk.corpus import twitter_samples
    from nltk.sentiment import SentimentIntensityAnalyzer
    from nltk.metrics import (accuracy as eval_accuracy,
                              precision as eval_precision,
                              recall as eval_recall,
                              f_measure as eval_f_measure)

    # From here on n_instances is the per-class limit (half of the total).
    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields,
                        strip_off_emoticons=False, limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields,
                        strip_off_emoticons=False, limit=n_instances)

    pos_docs = parse_tweets_set(positive_csv, label='pos')
    neg_docs = parse_tweets_set(negative_csv, label='neg')

    # Each class is split separately to keep a balanced, uniform class
    # distribution. Only the test portions are scored; the training
    # portions are not used by this demo.
    _, test_pos_docs = split_train_test(pos_docs)
    _, test_neg_docs = split_train_test(neg_docs)
    testing_tweets = test_pos_docs + test_neg_docs

    vader_analyzer = SentimentIntensityAnalyzer()
    gold_results = defaultdict(set)
    test_results = defaultdict(set)
    acc_gold_results = []
    acc_test_results = []
    labels = set()
    for i, (text, label) in enumerate(testing_tweets):
        labels.add(label)
        gold_results[label].add(i)
        acc_gold_results.append(label)
        score = vader_analyzer.polarity_scores(text)['compound']
        observed = 'pos' if score > 0 else 'neg'
        acc_test_results.append(observed)
        test_results[observed].add(i)

    metrics_results = {}
    if labels:
        # Accuracy does not depend on the label, so compute it once. The
        # guard keeps it from being evaluated when nothing was classified.
        metrics_results['Accuracy'] = eval_accuracy(acc_gold_results,
                                                    acc_test_results)
    for label in labels:
        precision_score = eval_precision(gold_results[label],
                                         test_results[label])
        metrics_results['Precision [{0}]'.format(label)] = precision_score
        recall_score = eval_recall(gold_results[label], test_results[label])
        metrics_results['Recall [{0}]'.format(label)] = recall_score
        f_measure_score = eval_f_measure(gold_results[label],
                                         test_results[label])
        metrics_results['F-measure [{0}]'.format(label)] = f_measure_score

    for result in sorted(metrics_results):
        print('{0}: {1}'.format(result, metrics_results[result]))

    if output:
        output_markdown(output, Approach='Vader', Dataset='labeled_tweets',
                        Instances=n_instances, Results=metrics_results)
def demo_tweets(trainer, n_instances=None, output=None):
    """
    Train and test a classifier (e.g. Naive Bayes) on 10000 tweets,
    tokenized using TweetTokenizer.
    Features are composed of:
        - 1000 most frequent unigrams
        - 100 top bigrams (using BigramAssocMeasures.pmi)

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total tweets that have to be used for
        training and testing. Tweets will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.tokenize import TweetTokenizer
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import twitter_samples

    # Other TweetTokenizer customizations that can be tried here:
    #   TweetTokenizer(preserve_case=True, strip_handles=True)
    #   TweetTokenizer(reduce_len=True, strip_handles=True)
    tokenizer = TweetTokenizer(preserve_case=False)

    # From here on n_instances is the per-class limit (half of the total).
    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields,
                        limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields,
                        limit=n_instances)

    neg_docs = parse_tweets_set(negative_csv, label='neg',
                                word_tokenizer=tokenizer)
    pos_docs = parse_tweets_set(positive_csv, label='pos',
                                word_tokenizer=tokenizer)

    # Each class is split separately to keep a balanced, uniform class
    # distribution in both the train and the test set.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = list(sentim_analyzer.all_words(training_tweets))

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)

    # Add bigram collocation features
    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats(
        [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12)
    sentim_analyzer.add_feat_extractor(extract_bigram_feats,
                                       bigrams=bigram_collocs_feats)

    training_set = sentim_analyzer.apply_features(training_tweets)
    test_set = sentim_analyzer.apply_features(testing_tweets)

    classifier = sentim_analyzer.train(trainer, training_set)
    # Not every classifier exposes this report; fall back to a notice.
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='labeled_tweets',
                        Classifier=type(classifier).__name__,
                        Tokenizer=tokenizer.__class__.__name__,
                        Feats=extr, Results=results, Instances=n_instances)
def demo_vader_tweets(n_instances=None, output=None):
    """
    Classify 10000 positive and negative tweets using Vader approach.

    Tweets whose Vader ``compound`` score is greater than 0 are labelled
    'pos', all others 'neg'. Accuracy plus per-label precision, recall and
    F-measure are printed and, if requested, written with
    ``output_markdown``.

    :param n_instances: the number of total tweets that have to be classified.
    :param output: the output file where results have to be reported.
    """
    from collections import defaultdict
    from nltk.corpus import twitter_samples
    from nltk.sentiment import SentimentIntensityAnalyzer
    from nltk.metrics import (accuracy as eval_accuracy,
                              precision as eval_precision,
                              recall as eval_recall,
                              f_measure as eval_f_measure)

    # From here on n_instances is the per-class limit (half of the total).
    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields,
                        strip_off_emoticons=False, limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields,
                        strip_off_emoticons=False, limit=n_instances)

    pos_docs = parse_tweets_set(positive_csv, label='pos')
    neg_docs = parse_tweets_set(negative_csv, label='neg')

    # Each class is split separately to keep a balanced, uniform class
    # distribution. Only the test portions are scored; the training
    # portions are not used by this demo.
    _, test_pos_docs = split_train_test(pos_docs)
    _, test_neg_docs = split_train_test(neg_docs)
    testing_tweets = test_pos_docs + test_neg_docs

    vader_analyzer = SentimentIntensityAnalyzer()
    gold_results = defaultdict(set)
    test_results = defaultdict(set)
    acc_gold_results = []
    acc_test_results = []
    labels = set()
    for i, (text, label) in enumerate(testing_tweets):
        labels.add(label)
        gold_results[label].add(i)
        acc_gold_results.append(label)
        score = vader_analyzer.polarity_scores(text)['compound']
        observed = 'pos' if score > 0 else 'neg'
        acc_test_results.append(observed)
        test_results[observed].add(i)

    metrics_results = {}
    if labels:
        # Accuracy does not depend on the label, so compute it once. The
        # guard keeps it from being evaluated when nothing was classified.
        metrics_results['Accuracy'] = eval_accuracy(acc_gold_results,
                                                    acc_test_results)
    for label in labels:
        precision_score = eval_precision(gold_results[label],
                                         test_results[label])
        metrics_results['Precision [{0}]'.format(label)] = precision_score
        recall_score = eval_recall(gold_results[label], test_results[label])
        metrics_results['Recall [{0}]'.format(label)] = recall_score
        f_measure_score = eval_f_measure(gold_results[label],
                                         test_results[label])
        metrics_results['F-measure [{0}]'.format(label)] = f_measure_score

    for result in sorted(metrics_results):
        print('{0}: {1}'.format(result, metrics_results[result]))

    if output:
        output_markdown(output, Approach='Vader', Dataset='labeled_tweets',
                        Instances=n_instances, Results=metrics_results)
def demo_tweets(trainer, n_instances=None, output=None):
    """
    Train and test a classifier (e.g. Naive Bayes) on 10000 tweets,
    tokenized using TweetTokenizer.
    Features are composed of:
        - 1000 most frequent unigrams
        - 100 top bigrams (using BigramAssocMeasures.pmi)

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total tweets that have to be used for
        training and testing. Tweets will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.tokenize import TweetTokenizer
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import twitter_samples

    # Other TweetTokenizer customizations that can be tried here:
    #   TweetTokenizer(preserve_case=True, strip_handles=True)
    #   TweetTokenizer(reduce_len=True, strip_handles=True)
    tokenizer = TweetTokenizer(preserve_case=False)

    # From here on n_instances is the per-class limit (half of the total).
    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields,
                        limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields,
                        limit=n_instances)

    neg_docs = parse_tweets_set(negative_csv, label='neg',
                                word_tokenizer=tokenizer)
    pos_docs = parse_tweets_set(positive_csv, label='pos',
                                word_tokenizer=tokenizer)

    # Each class is split separately to keep a balanced, uniform class
    # distribution in both the train and the test set.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = list(sentim_analyzer.all_words(training_tweets))

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)

    # Add bigram collocation features
    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats(
        [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12)
    sentim_analyzer.add_feat_extractor(extract_bigram_feats,
                                       bigrams=bigram_collocs_feats)

    training_set = sentim_analyzer.apply_features(training_tweets)
    test_set = sentim_analyzer.apply_features(testing_tweets)

    classifier = sentim_analyzer.train(trainer, training_set)
    # Not every classifier exposes this report; fall back to a notice.
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='labeled_tweets',
                        Classifier=type(classifier).__name__,
                        Tokenizer=tokenizer.__class__.__name__,
                        Feats=extr, Results=results, Instances=n_instances)
def infile():
    """Return the first 100 lines of the sample tweets file as a list."""
    sample_path = twitter_samples.abspath("tweets.20150430-223406.json")
    with open(sample_path) as handle:
        head = [next(handle) for _ in range(100)]
    return head
# -*- coding: utf-8 -*- ''' Created on 2017-02-11 16:07:41 提取Twitter数据实体 @author: zhoujiagen ''' # 字段的说明: # https://dev.twitter.com/overview/api/tweets # https://dev.twitter.com/overview/api/entities # https://dev.twitter.com/overview/api/users from nltk.corpus import twitter_samples # from nltk.twitter.common import json2csv import json input_file = twitter_samples.abspath("tweets.20150430-223406.json") # 准备数据 # # 图数据模型 # 节点: User, Word, URL, Hashtag # 关系: mentions_user, retweets_user, follows_user, mentions_hashtag, uses_word, mentions_url # user.screen_name(user), user.name, user.location # text # entities.hashtags # entities.user_mentions(mentions) # entities.urls # retweeted def extract_tweet_info(tweet): result = {}
neg_output_file = "neg_tweets_list.txt"


def clean_up_files(filename):
    """Rewrite *filename* in place, keeping only lines longer than one character.

    A line holding just a newline has length 1, so empty lines are dropped.
    """
    data = list()
    with open(filename, 'r') as f:
        for line in f:
            if len(line) > 1:
                data.append(line)
    with open(filename, 'w') as f:
        for line in data:
            f.write(line)


# Dump the text of every tweet in the corpus file at fileids()[1], one per
# line, into pos_output_file.
# presumably fileids()[1] is the positive sample file -- verify the ordering
pos_tweets_file = twitter_samples.abspath(twitter_samples.fileids()[1])
pos_tweets_output = open(pos_output_file, 'w+')
with open(pos_tweets_file, 'r') as tf:
    for line in tf:
        x = json.loads(line)
        # NOTE(review): on Python 3 encode() returns bytes, and the str
        # membership test / concatenation below would raise TypeError; this
        # reads like Python 2 code -- confirm the target version.
        tweet = x['text'].encode('UTF-8')
        if '\n' not in tweet:
            tweet += '\n'
        # Skip very short tweets (length counted after the newline handling).
        if (len(tweet) > 4):
            pos_tweets_output.write(tweet)
pos_tweets_output.close()
# Same export for the corpus file at fileids()[0], into neg_output_file.
neg_tweets_file = twitter_samples.abspath(twitter_samples.fileids()[0])
neg_tweets_output = open(neg_output_file, 'w+')
with open(neg_tweets_file, 'r') as tf:
oauth = credsfromfile()
n = 10  # number of tweets to fetch
username = '******'

# Query
client = Query(**oauth)  # historical data
client.register(TweetWriter())  # write the fetched tweets out
client.user_tweets(username, n)  # fetch n tweets of the user

# Translation of the note below: use json2csv to extract the tweets data
# (text field). The abspath of input_file must be adjusted to match the path
# the Query above wrote its data to.
''' 使用 json2csv 存取 tweets 資料 (text欄位) input_file 的 abspath 需參考上述 Query 寫入資料的路徑做修改 '''
input_file = twitter_samples.abspath('/Users/youngmihuang/twitter-files/tweets.20180726-155316.json')
with open(input_file) as fp:
    json2csv(fp, 'tweets_text.csv', ['text'])

# Read the exported CSV back
data = pd.read_csv('tweets_text.csv')
for line in data.text:
    print('Trump tweets content: ')
    print(line)

# Tokenize; only the first five tokenized tweets are printed
tokenized = twitter_samples.tokenized(input_file)
for tok in tokenized[:5]:
    print('tokenized: ')
    print(tok)
def setUp(self):
    """Prepare the fixture: the first 100 lines of the sample tweets file.

    Sets ``self.infile`` (list of raw lines), ``self.msg`` (assertion
    message) and ``self.subdir`` (the ``files`` directory next to this
    module).
    """
    sample_path = twitter_samples.abspath("tweets.20150430-223406.json")
    # The ``with`` block closes the handle; no explicit close() is needed.
    with open(sample_path) as infile:
        self.infile = [next(infile) for _ in range(100)]
    self.msg = "Test and reference files are not the same"
    self.subdir = os.path.join(os.path.dirname(__file__), 'files')
from nltk.corpus import twitter_samples
from nltk.twitter.common import json2csv

# The twitter_samples corpus file with ~20k tweets; taken as the last file id.
jsonfile = twitter_samples.fileids()[-1]

# Absolute path for the file:
# input_file = os.path.abspath(jsonfile)  => returns virtualenv file path
input_file = twitter_samples.abspath(jsonfile) # returns system /usr/share/ path

# Earlier text-only export, kept for reference:
# with open(input_file) as fp:
#     json2csv(fp,'tweets_text.csv',['text'])
# Usage: json2csv(pointer, nameoffile, [feature1,feature2,feature3])
# TODO: think about the attributes to be imported, convert to pandas, make a
# dataframe, apply stemming to tweet texts, save them.
with open(input_file) as fp:
    json2csv(fp, 'tweets_dataframe.csv',['id','text','user.favourites_count','user.id','lang','user.followers_count','user.verified','truncated']) # json in, csv out