def generate_model(self):
    """Build, persist and evaluate a Naive Bayes classifier.

    Calls ``self.fetch_data`` once for every item of
    ``usermapping.data_tuples``, flattens the fetched results into one
    sample list, shuffles it, and uses the first 85% (rounded down) for
    training and the remainder for the accuracy report. The trained model
    is handed to ``utils.save_model`` before it is evaluated.

    Returns:
        The trained ``NaiveBayesClassifier`` instance.
    """
    print("Gathering and processing tweets...")

    # Fetch the data for each mapping item, then flatten the per-item
    # results into a single list of samples.
    fetched = []
    for mapping_item in usermapping.data_tuples.items():
        fetched.append(self.fetch_data(mapping_item))
    samples = utils.flatten(fetched)

    # TODO: Cross-validation generation
    # Index of the first held-out sample: 85% of the data goes to training.
    split_at = int(len(samples) * 0.85)
    shuffle(samples)
    print(len(samples))
    print(split_at)
    training_samples, held_out_samples = samples[:split_at], samples[split_at:]

    # Instantiate the classifier on the training slice and train it.
    print("Training...")
    classifier = NaiveBayesClassifier(training_samples)
    classifier.train()

    # Persist the model before evaluating it.
    print("Saving model...")
    utils.save_model(classifier)

    # Report accuracy on the held-out slice.
    print("Testing...")
    held_out_accuracy = classifier.accuracy(held_out_samples)
    print("Accuracy: {0}".format(held_out_accuracy))
    return classifier
示例#2
0
def train_NLTK_NB(labeled_list_tr):
    """Train a Naive Bayes classifier on bag-of-words presence features.

    Args:
        labeled_list_tr: Iterable of samples where ``sample[0]`` is the raw
            text and ``sample[1]`` is its label.

    Returns:
        The classifier returned by ``NaiveBayesClassifier.train``. Each
        training sample is represented as a dict mapping every lowercased
        vocabulary word to whether that word occurs in the sample.
    """
    # Tokenize every passage exactly once and lowercase the tokens, so that
    # membership tests use the same casing as the vocabulary. Comparing the
    # lowercased vocabulary against raw tokens would report any capitalised
    # word as absent. Using a set also makes each membership test O(1).
    tokenized = [
        ({token.lower() for token in word_tokenize(sample[0])}, sample[1])
        for sample in labeled_list_tr
    ]

    # The vocabulary is the union of all per-passage token sets.
    vocabulary = set().union(*(tokens for tokens, _ in tokenized))

    train_set = [
        ({word: word in tokens for word in vocabulary}, label)
        for tokens, label in tokenized
    ]
    return NaiveBayesClassifier.train(train_set)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

# Attach the class label to every feature dict so both classes can be merged
# into a single list of (features, label) training samples.
positive_dataset = [(tweet_dict, "Positive")
                    for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                    for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

# Shuffle in place so the fixed-index split below mixes both classes.
random.shuffle(dataset)

# Fixed split: the first 71601 shuffled samples train the model and the rest
# are held out for evaluation. If the dataset holds 71601 samples or fewer,
# test_data is empty.
train_data = dataset[:71601]
test_data = dataset[71601:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features() prints its own table and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
classifier.show_most_informative_features(10)

# Per-year tallies of positive/negative classifications, starting at zero.
positiveNB2019 = 0
negativeNB2019 = 0
positiveNB2017 = 0
negativeNB2017 = 0
positiveNB2018 = 0
negativeNB2018 = 0

for x in data2019['text']:
    custom_tokens = remove_noise(word_tokenize(str(x)))
    classification = classifier.classify(
示例#4
0
    total = accuracy = float(len(test_data))
    for data in test_data:
        if classify_dataset(data[0]) != data[1]:
            accuracy -= 1
        else:
          print data, classify_dataset(data[0]), data[1]

    print('Total accuracy: %f%% (%d/20).' % (accuracy / total * 100, accuracy))
    final = accuracy / total * 100
    return final

# Create training and testing data
# NOTE(review): `c` is defined outside this view; it is indexed as a sequence
# of parallel columns here — confirm its layout against the loader.
sen = c[3]
emo = c[0]
l = len(c[3])
# Split point: the first 90% of the rows (integer division) are used for
# training, the remaining rows for testing.
limit = (9*l)//10
sente = c[2]
# NOTE(review): training inputs come from c[3] while test inputs come from
# c[2]; both are paired with c[0] — confirm the different columns are intended.
Data = create_data(sen[:limit], emo[:limit])
test_data = create_test(sente[limit:], emo[limit:])

# extract the word features out from the training data
# NOTE(review): word_features is a module-level name that is not referenced
# again in this block — presumably read by extract_features; verify.
word_features = get_word_features(\
                    get_words_in_dataset(Data))

# get the training set and train the Naive Bayes Classifier
training_set = nltk.classify.util.apply_features(extract_features, Data)
classifier = NaiveBayesClassifier.train(training_set)

# Score the trained classifier on the held-out rows.
Naive_accu = get_accuracy(test_data, classifier)

# Python 2 print statement: this snippet does not run under Python 3 as written.
print "Accuracy using Naive Bayes Component  ", Naive_accu, "%"