Example #1
    def data_set(self):
        if self.n_tweets is not None:
            self.n_tweets = int(self.n_tweets / 2)
        tokenizer = TweetTokenizer()
        fields = ['id', 'text']
        positive_json = twitter_samples.abspath("positive_tweets.json")
        positive_csv = 'positive_tweets.csv'
        json2csv_preprocess(positive_json, positive_csv, fields, limit=self.n_tweets)

        negative_json = twitter_samples.abspath("negative_tweets.json")
        negative_csv = 'negative_tweets.csv'
        json2csv_preprocess(negative_json, negative_csv, fields, limit=self.n_tweets)

        # Pass the tokenizer so the tweet texts are actually tokenized.
        neg_docs = parse_tweets_set(negative_csv, label='neg', word_tokenizer=tokenizer)
        pos_docs = parse_tweets_set(positive_csv, label='pos', word_tokenizer=tokenizer)

        # Split positive and negative instances separately to keep a balanced,
        # uniform class distribution in both the train and test sets.
        train_pos_docs, test_pos_docs = split_train_test(pos_docs)
        train_neg_docs, test_neg_docs = split_train_test(neg_docs)

        training_tweets = train_pos_docs + train_neg_docs
        testing_tweets = test_pos_docs + test_neg_docs

        return training_tweets, testing_tweets
Example #2
    def setUp(self):
        with open(twitter_samples.abspath(
                "tweets.20150430-223406.json")) as infile:
            self.infile = [next(infile) for x in range(100)]
        # The with-statement closes the file automatically; no explicit close() is needed.
        self.msg = "Test and reference files are not the same"
        self.subdir = os.path.join(os.path.dirname(__file__), 'files')
Example #3
def load_twitter():
    tokenizer = TweetTokenizer(preserve_case=False)
    fields = ['id', 'text']

    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields, limit=None)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields, limit=None)

    neg_docs = parse_tweets_set(negative_csv,
                                label='neg',
                                word_tokenizer=tokenizer)
    pos_docs = parse_tweets_set(positive_csv,
                                label='pos',
                                word_tokenizer=tokenizer)

    return pos_docs, neg_docs
Example #4
def read_single_tweets(path):
    word_list = []
    if os.path.isfile(path):
        input_tweets = twitter_samples.abspath(os.path.abspath(path))
        output_tweets = os.path.join(
            os.path.dirname(path) + '_text',
            os.path.basename(path) + '.csv')
        os.makedirs(os.path.dirname(output_tweets), exist_ok=True)
        try:
            with open(input_tweets) as fp:
                json2csv(fp, output_tweets, ['text'])
            with open(output_tweets, 'r') as fp:
                reader = csv.DictReader(fp)
                for row in reader:
                    try:
                        tweet = row['text']
                        if detect(tweet) == 'en':
                            word_list.append(clean_and_tokenize(tweet))
                    except lang_detect_exception.LangDetectException:
                        continue
        except Exception:
            # Report the file that failed to process, but keep going.
            print(path)
    return word_list
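
The helper `clean_and_tokenize` is referenced but not shown in this excerpt. A minimal sketch of such a helper, assuming NLTK's TweetTokenizer and simple URL stripping (the original project's version may differ):

import re
from nltk.tokenize import TweetTokenizer

_tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

def clean_and_tokenize(tweet):
    # Remove URLs, then lowercase, strip @handles and tokenize with TweetTokenizer.
    tweet = re.sub(r'https?://\S+', '', tweet)
    return _tweet_tokenizer.tokenize(tweet)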
Example #5
def demo_vader_tweets(n_instances=None, output=None):
    """
    Classify 10000 positive and negative tweets using the Vader approach.

    :param n_instances: the number of total tweets that have to be classified.
    :param output: the output file where results have to be reported.
    """
    from collections import defaultdict
    from nltk.corpus import twitter_samples
    from nltk.sentiment import SentimentIntensityAnalyzer
    from nltk.metrics import (accuracy as eval_accuracy, precision as
                              eval_precision, recall as eval_recall, f_measure
                              as eval_f_measure)

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json,
                        positive_csv,
                        fields,
                        strip_off_emoticons=False,
                        limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json,
                        negative_csv,
                        fields,
                        strip_off_emoticons=False,
                        limit=n_instances)

    pos_docs = parse_tweets_set(positive_csv, label='pos')
    neg_docs = parse_tweets_set(negative_csv, label='neg')

    # We split positive and negative instances separately to keep a balanced,
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    vader_analyzer = SentimentIntensityAnalyzer()

    gold_results = defaultdict(set)
    test_results = defaultdict(set)
    acc_gold_results = []
    acc_test_results = []
    labels = set()
    num = 0
    for i, (text, label) in enumerate(testing_tweets):
        labels.add(label)
        gold_results[label].add(i)
        acc_gold_results.append(label)
        score = vader_analyzer.polarity_scores(text)['compound']
        if score > 0:
            observed = 'pos'
        else:
            observed = 'neg'
        num += 1
        acc_test_results.append(observed)
        test_results[observed].add(i)
    metrics_results = {}
    for label in labels:
        accuracy_score = eval_accuracy(acc_gold_results, acc_test_results)
        metrics_results['Accuracy'] = accuracy_score
        precision_score = eval_precision(gold_results[label],
                                         test_results[label])
        metrics_results['Precision [{0}]'.format(label)] = precision_score
        recall_score = eval_recall(gold_results[label], test_results[label])
        metrics_results['Recall [{0}]'.format(label)] = recall_score
        f_measure_score = eval_f_measure(gold_results[label],
                                         test_results[label])
        metrics_results['F-measure [{0}]'.format(label)] = f_measure_score

    for result in sorted(metrics_results):
        print('{0}: {1}'.format(result, metrics_results[result]))

    if output:
        output_markdown(output,
                        Approach='Vader',
                        Dataset='labeled_tweets',
                        Instances=n_instances,
                        Results=metrics_results)
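
A possible way to run this demo (the output filename is illustrative, not from the source):

if __name__ == '__main__':
    # Load 2000 labelled tweets (1000 positive, 1000 negative) and evaluate Vader on the test split.
    demo_vader_tweets(n_instances=2000, output='vader_tweets_results.md')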
Example #6
def demo_tweets(trainer, n_instances=None, output=None):
    """
    Train and test Naive Bayes classifier on 10000 tweets, tokenized using
    TweetTokenizer.
    Features are composed of:
        - 1000 most frequent unigrams
        - 100 top bigrams (using BigramAssocMeasures.pmi)

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total tweets that have to be used for
        training and testing. Tweets will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.tokenize import TweetTokenizer
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import twitter_samples, stopwords

    # Different customizations for the TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False)
    # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
    # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)

    neg_docs = parse_tweets_set(negative_csv,
                                label='neg',
                                word_tokenizer=tokenizer)
    pos_docs = parse_tweets_set(positive_csv,
                                label='pos',
                                word_tokenizer=tokenizer)

    # We split positive and negative instances separately to keep a balanced,
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    # stopwords = stopwords.words('english')
    # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords]
    all_words = [word for word in sentim_analyzer.all_words(training_tweets)]

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)

    # Add bigram collocation features
    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats(
        [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12)
    sentim_analyzer.add_feat_extractor(extract_bigram_feats,
                                       bigrams=bigram_collocs_feats)

    training_set = sentim_analyzer.apply_features(training_tweets)
    test_set = sentim_analyzer.apply_features(testing_tweets)

    classifier = sentim_analyzer.train(trainer, training_set)
    # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            'Your classifier does not provide a show_most_informative_features() method.'
        )
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output,
                        Dataset='labeled_tweets',
                        Classifier=type(classifier).__name__,
                        Tokenizer=tokenizer.__class__.__name__,
                        Feats=extr,
                        Results=results,
                        Instances=n_instances)
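
A typical invocation passes the `train` method of an NLTK classifier as the trainer (a sketch; the output filename is illustrative):

from nltk.classify import NaiveBayesClassifier

# Train Naive Bayes on the training split of 8000 tweets and report the evaluation metrics.
demo_tweets(NaiveBayesClassifier.train, n_instances=8000, output='nb_tweets_results.md')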
Example #7
File: util.py Project: DrDub/nltk
def demo_vader_tweets(n_instances=None, output=None):
    """
    Classify 10000 positive and negative tweets using the Vader approach.

    :param n_instances: the number of total tweets that have to be classified.
    :param output: the output file where results have to be reported.
    """
    from collections import defaultdict
    from nltk.corpus import twitter_samples
    from nltk.sentiment import SentimentIntensityAnalyzer
    from nltk.metrics import (accuracy as eval_accuracy, precision as eval_precision,
        recall as eval_recall, f_measure as eval_f_measure)

    if n_instances is not None:
        n_instances = int(n_instances/2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields, strip_off_emoticons=False,
                        limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields, strip_off_emoticons=False,
                        limit=n_instances)

    pos_docs = parse_tweets_set(positive_csv, label='pos')
    neg_docs = parse_tweets_set(negative_csv, label='neg')

    # We split positive and negative instances separately to keep a balanced,
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs+train_neg_docs
    testing_tweets = test_pos_docs+test_neg_docs

    vader_analyzer = SentimentIntensityAnalyzer()

    gold_results = defaultdict(set)
    test_results = defaultdict(set)
    acc_gold_results = []
    acc_test_results = []
    labels = set()
    num = 0
    for i, (text, label) in enumerate(testing_tweets):
        labels.add(label)
        gold_results[label].add(i)
        acc_gold_results.append(label)
        score = vader_analyzer.polarity_scores(text)['compound']
        if score > 0:
            observed = 'pos'
        else:
            observed = 'neg'
        num += 1
        acc_test_results.append(observed)
        test_results[observed].add(i)
    metrics_results = {}
    for label in labels:
        accuracy_score = eval_accuracy(acc_gold_results,
            acc_test_results)
        metrics_results['Accuracy'] = accuracy_score
        precision_score = eval_precision(gold_results[label],
            test_results[label])
        metrics_results['Precision [{0}]'.format(label)] = precision_score
        recall_score = eval_recall(gold_results[label],
            test_results[label])
        metrics_results['Recall [{0}]'.format(label)] = recall_score
        f_measure_score = eval_f_measure(gold_results[label],
            test_results[label])
        metrics_results['F-measure [{0}]'.format(label)] = f_measure_score

    for result in sorted(metrics_results):
        print('{0}: {1}'.format(result, metrics_results[result]))

    if output:
        output_markdown(output, Approach='Vader', Dataset='labeled_tweets',
            Instances=n_instances, Results=metrics_results)
Example #8
File: util.py Project: DrDub/nltk
def demo_tweets(trainer, n_instances=None, output=None):
    """
    Train and test Naive Bayes classifier on 10000 tweets, tokenized using
    TweetTokenizer.
    Features are composed of:
        - 1000 most frequent unigrams
        - 100 top bigrams (using BigramAssocMeasures.pmi)

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total tweets that have to be used for
        training and testing. Tweets will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.tokenize import TweetTokenizer
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import twitter_samples, stopwords

    # Different customizations for the TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False)
    # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
    # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)

    if n_instances is not None:
        n_instances = int(n_instances/2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)

    neg_docs = parse_tweets_set(negative_csv, label='neg', word_tokenizer=tokenizer)
    pos_docs = parse_tweets_set(positive_csv, label='pos', word_tokenizer=tokenizer)

    # We split positive and negative instances separately to keep a balanced,
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs+train_neg_docs
    testing_tweets = test_pos_docs+test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    # stopwords = stopwords.words('english')
    # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords]
    all_words = [word for word in sentim_analyzer.all_words(training_tweets)]

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Add bigram collocation features
    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats([tweet[0] for tweet in training_tweets],
        top_n=100, min_freq=12)
    sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_collocs_feats)

    training_set = sentim_analyzer.apply_features(training_tweets)
    test_set = sentim_analyzer.apply_features(testing_tweets)

    classifier = sentim_analyzer.train(trainer, training_set)
    # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='labeled_tweets', Classifier=type(classifier).__name__,
                        Tokenizer=tokenizer.__class__.__name__, Feats=extr,
                        Results=results, Instances=n_instances)
Example #9
def infile():
    with open(twitter_samples.abspath("tweets.20150430-223406.json")) as infile:
        return [next(infile) for x in range(100)]
Example #10
# -*- coding: utf-8 -*-
'''
Created on 2017-02-11 16:07:41
Extract entities from Twitter data
@author: zhoujiagen
'''
# Field documentation:
# https://dev.twitter.com/overview/api/tweets
# https://dev.twitter.com/overview/api/entities
# https://dev.twitter.com/overview/api/users

from nltk.corpus import twitter_samples
# from nltk.twitter.common import json2csv
import json

input_file = twitter_samples.abspath("tweets.20150430-223406.json")


# Prepare the data
#
# Graph data model
# Nodes: User, Word, URL, Hashtag
# Relationships: mentions_user, retweets_user, follows_user, mentions_hashtag, uses_word, mentions_url
# user.screen_name (user), user.name, user.location
# text
# entities.hashtags
# entities.user_mentions (mentions)
# entities.urls
# retweeted
def extract_tweet_info(tweet):
    result = {}
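
The listing is cut off right after `result = {}`. A plausible completion, based only on the fields enumerated in the comments above (a sketch, not the original author's code; the nested keys follow the standard Twitter API field names):

def extract_tweet_info(tweet):
    """Collect the fields needed for the graph data model (sketch)."""
    result = {}
    user = tweet.get('user', {})
    result['user'] = {
        'screen_name': user.get('screen_name'),
        'name': user.get('name'),
        'location': user.get('location'),
    }
    result['text'] = tweet.get('text')
    entities = tweet.get('entities', {})
    result['hashtags'] = [h['text'] for h in entities.get('hashtags', [])]
    result['mentions'] = [m['screen_name'] for m in entities.get('user_mentions', [])]
    result['urls'] = [u['expanded_url'] for u in entities.get('urls', [])]
    result['retweeted'] = tweet.get('retweeted', False)
    return result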
Example #11
neg_output_file = "neg_tweets_list.txt"


def clean_up_files(filename):
    data = list()
    with open(filename, 'r') as f:
        for line in f:
            if len(line) > 1:
                data.append(line)

    with open(filename, 'w') as f:
        for line in data:
            f.write(line)


# pos_output_file is defined earlier in the original file (not shown in this excerpt).
pos_tweets_file = twitter_samples.abspath(twitter_samples.fileids()[1])
pos_tweets_output = open(pos_output_file, 'w+')
with open(pos_tweets_file, 'r') as tf:
    for line in tf:
        x = json.loads(line)
        tweet = x['text']  # text is already a str; no explicit UTF-8 encoding needed in Python 3
        if '\n' not in tweet:
            tweet += '\n'
        if len(tweet) > 4:
            pos_tweets_output.write(tweet)

pos_tweets_output.close()

neg_tweets_file = twitter_samples.abspath(twitter_samples.fileids()[0])
neg_tweets_output = open(neg_output_file, 'w+')
with open(neg_tweets_file, 'r') as tf:
    # The negative branch is truncated in the original listing; it mirrors the positive one above.
    for line in tf:
        x = json.loads(line)
        tweet = x['text']
        if '\n' not in tweet:
            tweet += '\n'
        if len(tweet) > 4:
            neg_tweets_output.write(tweet)

neg_tweets_output.close()
Example #12
oauth = credsfromfile()
n = 10  # number of tweets to fetch
username = '******'

# Query
client = Query(**oauth)  # query historical tweets
client.register(TweetWriter())  # register a writer so the tweets are saved to file
client.user_tweets(username, n)  # fetch n tweets from the user's timeline

'''
Use json2csv to extract the tweets' 'text' field and save it to CSV.
The abspath given to input_file must be adjusted to the path where the Query above wrote its data.
'''

input_file = twitter_samples.abspath('/Users/youngmihuang/twitter-files/tweets.20180726-155316.json')
with open(input_file) as fp:
    json2csv(fp, 'tweets_text.csv', ['text'])

# Read the CSV back
data = pd.read_csv('tweets_text.csv')
for line in data.text:
    print('Trump tweets content: ')
    print(line)

# Tokenize
tokenized = twitter_samples.tokenized(input_file)
for tok in tokenized[:5]:
    print('tokenized: ')
    print(tok)
Example #13
    def setUp(self):
        with open(twitter_samples.abspath("tweets.20150430-223406.json")) as infile:
            self.infile = [next(infile) for x in range(100)]
        self.msg = "Test and reference files are not the same"
        self.subdir = os.path.join(os.path.dirname(__file__), 'files')
Example #14
from nltk.corpus import twitter_samples
from nltk.twitter.common import json2csv

# corpus: twitter_samples tweets (~20k)
jsonfile = twitter_samples.fileids()[-1]

# Absolute path for the file. os.path.abspath(jsonfile) would return a path inside the
# virtualenv; twitter_samples.abspath(jsonfile) returns the system path (e.g. under /usr/share/).
input_file = twitter_samples.abspath(jsonfile)

# with open(input_file) as fp:
#     json2csv(fp, 'tweets_text.csv', ['text'])  # json2csv(file pointer, output filename, [field1, field2, ...])

# TODO: decide which attributes to import, load them into a pandas DataFrame,
# apply stemming to the tweet texts, and save the result.
with open(input_file) as fp:
    json2csv(fp, 'tweets_dataframe.csv',
             ['id', 'text', 'user.favourites_count', 'user.id', 'lang',
              'user.followers_count', 'user.verified', 'truncated'])
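
The TODO above (select attributes, build a pandas DataFrame, stem the tweet texts, save) could be carried out roughly as follows; a sketch assuming the CSV written above as input:

import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

df = pd.read_csv('tweets_dataframe.csv')

stemmer = PorterStemmer()
tokenizer = TweetTokenizer(preserve_case=False)

# Tokenize each tweet text, stem every token, and keep the result as a new column.
df['stemmed_text'] = df['text'].apply(
    lambda t: ' '.join(stemmer.stem(tok) for tok in tokenizer.tokenize(str(t))))

df.to_csv('tweets_dataframe_stemmed.csv', index=False)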