import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import mark_negation, extract_unigram_feats, extract_bigram_feats
from sklearn.model_selection import StratifiedKFold

# `load_datasets` and `remove_stopwords` are project-local helpers (not shown).
def main():
    x, y = load_datasets(["../datasets/sentiment_uci/yelp_labelled.txt"])

    # Build the stopword set from a local file, one word per line.
    stopwords = set()
    with open('../stopwords.txt', 'r') as f:
        for w in f:
            stopwords.add(w.strip())

    tok = TweetTokenizer()
    x = [remove_stopwords(tok.tokenize(s.lower()), stopwords) for s in x]
    x = np.array(x, dtype=object)  # dtype=object: token lists have ragged lengths

    accumulate = dict()
    folds = 10
    # Note: the original used the pre-0.18 scikit-learn API
    # (StratifiedKFold(y=y, n_folds=folds, shuffle=True)); the split() call
    # below is the current equivalent.
    for train_idx, test_idx in StratifiedKFold(n_splits=folds, shuffle=True).split(x, y):
        train_x, train_y = x[train_idx], y[train_idx]
        test_x, test_y = x[test_idx], y[test_idx]
        # train_x = [remove_stopwords(tok.tokenize(s), stopwords) for s in train_x]
        # test_x = [remove_stopwords(tok.tokenize(s), stopwords) for s in test_x]
        train_docs = [(sent, label) for sent, label in zip(train_x, train_y)]
        test_docs = [(sent, label) for sent, label in zip(test_x, test_y)]

        cls = SentimentAnalyzer()

        # Train: unigram features with negation marking, plus bigram collocations.
        words_with_neg = cls.all_words([mark_negation(a) for a in train_x])
        unigram_feats = cls.unigram_word_feats(words_with_neg)
        bigram_feats = cls.bigram_collocation_feats(train_x)
        cls.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats, handle_negation=True)
        cls.add_feat_extractor(extract_bigram_feats, bigrams=bigram_feats)
        training_set = cls.apply_features(train_docs, labeled=True)
        cls.train(NaiveBayesClassifier.train, training_set)

        # Test & evaluate, accumulating each metric across folds.
        test_set = cls.apply_features(test_docs)
        for key, value in sorted(cls.evaluate(test_set).items()):
            print('\t{0}: {1}'.format(key, value))
            accumulate.setdefault(key, 0.0)
            accumulate[key] += value

    print("Averages")
    for key, value in sorted(accumulate.items()):
        print('\tAverage {0}: {1}'.format(key, value / folds))
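# The function above leans on a project-local `remove_stopwords` helper that is
# not shown. A minimal sketch of what the call site implies (hypothetical
# implementation, not the original): filter a token list against a stopword
# set and return a token list, so the downstream mark_negation() still
# receives tokens rather than a joined string.
def remove_stopwords(tokens, stopwords):
    return [t for t in tokens if t.lower() not in stopwords]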
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats, extract_bigram_feats

def run_sa_twitt(train, test):
    a = SentimentAnalyzer()
    tr = NaiveBayesClassifier.train
    all_words = [word for word in a.all_words(train)]

    # Add simple unigram word features
    unigram_feats = a.unigram_word_feats(all_words, top_n=1000)
    a.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Add bigram collocation features (fixed: the original referenced an
    # undefined `train_twitt` here instead of the `train` parameter)
    bigram_collocs_feats = a.bigram_collocation_feats(
        [tweet[0] for tweet in train], top_n=100, min_freq=12)
    a.add_feat_extractor(extract_bigram_feats, bigrams=bigram_collocs_feats)

    tr_set = a.apply_features(train)
    test_set = a.apply_features(test)

    # Training
    clf = a.train(tr, tr_set)
    res = a.evaluate(test_set)
    print(res)
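# A minimal way to build the (token_list, label) documents run_sa_twitt()
# expects, using NLTK's bundled twitter_samples corpus — a sketch; any labeled
# token lists would do. Run nltk.download('twitter_samples') first.
from nltk.corpus import twitter_samples
from nltk.sentiment.util import split_train_test

pos_docs = [(toks, 'pos') for toks in twitter_samples.tokenized('positive_tweets.json')]
neg_docs = [(toks, 'neg') for toks in twitter_samples.tokenized('negative_tweets.json')]
train_pos, test_pos = split_train_test(pos_docs)
train_neg, test_neg = split_train_test(neg_docs)
run_sa_twitt(train_pos + train_neg, test_pos + test_neg)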
def demo_tweets(trainer, n_instances=None, output=None):
    """
    Train and test Naive Bayes classifier on 10000 tweets, tokenized using
    TweetTokenizer.
    Features are composed of:

    - 1000 most frequent unigrams
    - 100 top bigrams (using BigramAssocMeasures.pmi)

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total tweets that have to be used for
        training and testing. Tweets will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.tokenize import TweetTokenizer
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import twitter_samples, stopwords
    from nltk.sentiment.util import (json2csv_preprocess, parse_tweets_set,
                                     split_train_test, extract_unigram_feats,
                                     extract_bigram_feats, output_markdown)

    # Different customizations for the TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False)
    # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
    # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)

    neg_docs = parse_tweets_set(negative_csv, label='neg', word_tokenizer=tokenizer)
    pos_docs = parse_tweets_set(positive_csv, label='pos', word_tokenizer=tokenizer)

    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    # stopwords = stopwords.words('english')
    # all_words = [word for word in sentim_analyzer.all_words(training_tweets)
    #              if word.lower() not in stopwords]
    all_words = [word for word in sentim_analyzer.all_words(training_tweets)]

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Add bigram collocation features
    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats(
        [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12)
    sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_collocs_feats)

    training_set = sentim_analyzer.apply_features(training_tweets)
    test_set = sentim_analyzer.apply_features(testing_tweets)

    classifier = sentim_analyzer.train(trainer, training_set)
    # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')

    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='labeled_tweets',
                        Classifier=type(classifier).__name__,
                        Tokenizer=tokenizer.__class__.__name__, Feats=extr,
                        Results=results, Instances=n_instances)
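# Example invocation (a sketch mirroring how NLTK's own sentiment demos are
# driven; run nltk.download('twitter_samples') beforehand, and the output
# filename is an arbitrary choice here):
if __name__ == '__main__':
    from nltk.classify import NaiveBayesClassifier
    demo_tweets(NaiveBayesClassifier.train, n_instances=10000, output='tweets_results.md')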
training_tweets, testing_tweets = split_train_test(result)
# x_train, x_test, y_train, y_test = train_test_split(result['tweet'], result['senti'],
#                                                     test_size=0.20, random_state=0)

sentim_analyzer = SentimentAnalyzer()
stopwords = stopwords.words('english')
all_words = [word for word in sentim_analyzer.all_words(training_tweets)
             if word.lower() not in stopwords]
print(all_words)

# Add simple unigram word features
unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

# Add bigram collocation features
bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats(
    [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12)
sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_collocs_feats)

training_set = sentim_analyzer.apply_features(training_tweets)
test_set = sentim_analyzer.apply_features(testing_tweets)

classifier = sentim_analyzer.train(trainer, training_set)
# classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
try:
    classifier.show_most_informative_features()
except AttributeError:
    print('Your classifier does not provide a show_most_informative_features() method.')

results = sentim_analyzer.evaluate(test_set)
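# The fragment above assumes `result` (a list of (token_list, label) documents)
# and `trainer` are defined earlier in the script. A plausible preamble,
# inferred from the call sites (hypothetical, not the original source):
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import split_train_test, extract_unigram_feats, extract_bigram_feats

trainer = NaiveBayesClassifier.train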
from random import shuffle

from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats, extract_bigram_feats

# `get_train_test`, `delete_stop_words`, `tokenize_set`, `count_tags`, and
# `get_y` are project-local helpers (not shown).
def train_classifier(classifier, num_of_tweets, gram, lang, lemmas):
    sentim_analyzer = SentimentAnalyzer()
    print("num_of_tweets, gram, lang, lemmas_bool:")
    print(num_of_tweets, gram, lang, lemmas)

    training = []
    testing = []
    if lang == "rus":
        training = get_train_test("train.csv")
        testing = get_train_test("test.csv")
    if lang == "ger":
        training = get_train_test("train_de.csv")
        testing = get_train_test("test_de.csv")
    data = training + testing

    def removeStopWords(item):
        item[0] = delete_stop_words(lang, item[0])
        return item

    data_neg = []
    data_pos = []
    for i in data:
        if i[1] == 'neg':
            data_neg.append(i)
        if i[1] == 'pos':
            data_pos.append(i)

    # Interleave negative and positive tweets so any prefix of the data
    # stays class-balanced.
    data_even = []
    for i in range(len(data_neg)):
        data_even.append(data_neg[i])
        data_even.append(data_pos[i])
    training_data = data_even[:num_of_tweets]

    # Accumulators for metrics averaged over the 5 folds.
    totals = {
        "Accuracy": 0,
        "Precision [pos]": 0,
        "Recall [pos]": 0,
        "F-measure [pos]": 0,
        "Precision [neg]": 0,
        "Recall [neg]": 0,
        "F-measure [neg]": 0,
    }
    vocab = 0
    unigram = 0
    bigram = 0

    # 5-fold cross-validation: fold i is the test set, the rest is training.
    fold_size = len(training_data) // 5
    for i in range(5):
        test = training_data[fold_size * i:fold_size * (i + 1)]
        train = training_data[:fold_size * i] + training_data[fold_size * (i + 1):]
        train = list(map(removeStopWords, train))
        test = list(map(removeStopWords, test))
        shuffle(train)
        shuffle(test)

        print("len(train+test):")
        print(len(train) + len(test))
        print("train: pos, neg:")
        print(count_tags(train))
        print("test: pos, neg:")
        print(count_tags(test))

        vocabulary = sentim_analyzer.all_words(tokenize_set(train, lemmas, lang))
        print("vocab len:")
        print(len(vocabulary))
        vocab += len(vocabulary)

        if gram == "unigram":
            unigram_features = sentim_analyzer.unigram_word_feats(vocabulary)
            print("unigram feats len:")
            print(len(unigram_features))
            unigram += len(unigram_features)
            sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_features)
        if gram == "bigram":
            bigram_features = sentim_analyzer.bigram_collocation_feats(tokenize_set(train, lemmas, lang))
            print("bigram feats len:")
            print(len(bigram_features))
            bigram += len(bigram_features)
            sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_features)

        _train_X = sentim_analyzer.apply_features(tokenize_set(train, lemmas, lang), labeled=False)
        _train_Y = get_y(train)
        _test_X = sentim_analyzer.apply_features(tokenize_set(test, lemmas, lang), labeled=False)
        _test_Y = get_y(test)

        sentim_analyzer.train(classifier.train, list(zip(_train_X, _train_Y)))
        # (renamed from `dict`, which shadowed the builtin)
        fold_results = sentim_analyzer.evaluate(list(zip(_test_X, _test_Y)))
        print(fold_results)
        for key in totals:
            totals[key] += fold_results.get(key)

    print("Accuracy:")
    print(totals["Accuracy"] / 5)
    print("Precision [pos]:")
    print(totals["Precision [pos]"] / 5)
    print("Precision [neg]:")
    print(totals["Precision [neg]"] / 5)
    print("F-measure [pos]:")
    print(totals["F-measure [pos]"] / 5)
    print("F-measure [neg]:")
    print(totals["F-measure [neg]"] / 5)
    print("Recall [pos]:")
    print(totals["Recall [pos]"] / 5)
    print("Recall [neg]:")
    print(totals["Recall [neg]"] / 5)
    print("vocab length: ")
    print(vocab / 5)
    if gram == "bigram":
        print("bigram features:")
        print(bigram / 5)
    if gram == "unigram":
        print("unigram features:")
        print(unigram / 5)