def createClassifier(ignoreTweets=False):
    """Train a Naive Bayes polarity classifier and print its held-out accuracy.

    Labelled samples come from three sources: every ``movie_reviews`` file
    (featurized with ``extractWords``), the negative/positive sample lists
    returned by ``getSentPolarities()`` and those returned by
    ``getTweets(ignoreTweets)``.  Per class, the first 75% of the combined
    samples are used for training and the remaining 25% for the accuracy
    figure that is printed.

    :param ignoreTweets: passed straight through to ``getTweets``; presumably
        a true value skips tweet processing -- confirm against ``getTweets``.
    :return: the trained ``NaiveBayesClassifier``.
    """
    neg_ids = movie_reviews.fileids('neg')
    pos_ids = movie_reviews.fileids('pos')
    neg_sents = [(extractWords(movie_reviews.words(fileids=[f])), 'neg')
                 for f in neg_ids]
    pos_sents = [(extractWords(movie_reviews.words(fileids=[f])), 'pos')
                 for f in pos_ids]

    (neg_pols, pos_pols) = getSentPolarities()
    (neg_tweets, pos_tweets) = getTweets(ignoreTweets)
    neg_sents = neg_sents + neg_tweets + neg_pols
    pos_sents = pos_sents + pos_tweets + pos_pols

    # Split each class separately so both labels appear in train and test.
    trainsizeneg = int(0.75 * len(neg_sents))
    trainsizepos = int(0.75 * len(pos_sents))
    all_train = neg_sents[:trainsizeneg] + pos_sents[:trainsizepos]
    all_test = neg_sents[trainsizeneg:] + pos_sents[trainsizepos:]

    classifier = NaiveBayesClassifier.train(all_train)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(accuracy(classifier, all_test))
    return classifier
def classify(inputdir):
    """Train NB, decision-tree, maxent and max-vote classifiers and print
    the accuracy of each.

    Every file in ``inputdir`` supplies the samples of one class.  The label
    ("sense") is the file name up to its first dot, and each line with more
    than two whitespace-separated tokens becomes one sample featurized by
    ``bag_of_words``.  The train/test split is delegated to the external
    helpers ``train_feats`` and ``test_feats``.  Nothing is returned.

    :param inputdir: directory holding one UTF-8 text file per sense.
    """
    feat_set = []
    for name in os.listdir(inputdir):
        path = os.path.join(inputdir, name)
        sense = name.split('\\')[-1].split('.')[0]
        print('training %s' % (sense,))
        # ``with`` closes the file even if featurization raises.
        with codecs.open(path, 'r', 'utf-8') as handle:
            for line in handle:
                # Lines with two tokens or fewer are skipped.
                if len(line.split()) > 2:
                    # NOTE(review): bag_of_words receives the stripped line
                    # (a string), not a token list -- confirm this is intended.
                    # NOTE(review): one sample per qualifying line is assumed;
                    # the original indentation was lost.
                    feat_set.append((bag_of_words(line.strip()), sense))

    # A single shuffle already yields a uniformly random order.
    random.shuffle(feat_set)
    train_data = train_feats(feat_set)
    test_data = test_feats(feat_set)

    nb_classifier = NaiveBayesClassifier.train(train_data)
    dt_classifier = DecisionTreeClassifier.train(
        train_data, entropy_cutoff=0.8, depth_cutoff=5, support_cutoff=30)
    entropy_classifier = MaxentClassifier.train(
        train_data, algorithm='iis', trace=0, max_iter=1, min_lldelta=0.5)

    print("nb accuracy " + str(accuracy(nb_classifier, test_data) * 100))
    print("dt accuracy " + str(accuracy(dt_classifier, test_data) * 100))
    print("entropy accuracy " + str(accuracy(entropy_classifier, test_data) * 100))

    mv_classifier = MaxVoteClassifier(nb_classifier, dt_classifier, entropy_classifier)
    print("max vote accuracy " + str(accuracy(mv_classifier, test_data) * 100))
def train(cleanedDataCollection, tagPool, test_size=10):
    """Train a multinomial Naive Bayes classifier and print its scores.

    Each ``(document, label)`` pair is featurized with ``extractFeatures``.
    Samples labelled ``"trash"`` form the negative class and all others the
    positive class.  The first ``test_size`` samples of each class are held
    out for evaluation and the rest are used for training.

    :param cleanedDataCollection: iterable of ``(document, label)`` pairs.
    :param tagPool: forwarded unchanged to ``extractFeatures``.
    :param test_size: number of samples per class held out for testing
        (defaults to the previously hard-coded 10).
    :return: the trained ``SklearnClassifier``.
    """
    featuresets = [(extractFeatures(d, tagPool), c)
                   for (d, c) in cleanedDataCollection]
    negSamples = [sample for sample in featuresets if sample[1] == "trash"]
    posSamples = [sample for sample in featuresets if sample[1] != "trash"]

    train_set = negSamples[test_size:] + posSamples[test_size:]
    test_set = negSamples[:test_size] + posSamples[:test_size]

    sk_classifier = SklearnClassifier(MultinomialNB())
    sk_classifier.train(train_set)

    # One-element tuples keep the %-formatting safe for any value type;
    # single-argument print(...) works on Python 2 and Python 3.
    print("accuracy is: %s" % (accuracy(sk_classifier, test_set),))
    precision, recall, fMeasure = precision_recall_fmeasure(
        sk_classifier, test_set, "useful")
    print("precision is: %s" % (precision,))
    print("recall is: %s" % (recall,))
    print("F-measure is: %s" % (fMeasure,))
    return sk_classifier
def main():
    """Train a Naive Bayes topic classifier and print per-topic scores.

    The train/test sets come from ``get_train_test_sets('data/content')``.
    For every entry of the module-level ``topics`` the precision, recall and
    F-measure (as percentages rounded to one decimal) are collected into a
    dict of parallel lists, which is printed at the end.
    """
    def as_percent(score):
        # If these are NLTK's scoring functions they return None when a
        # reference or test set is empty (e.g. a topic that is never
        # predicted); keep None instead of crashing on ``None * 100``.
        if score is None:
            return None
        return round(score * 100, 1)

    results = {'Topic': [], 'Precision': [], 'Recall': [], 'F-measure': []}

    print('\nPreparing data...')
    (train_set, test_set) = get_train_test_sets('data/content')

    print('\nNB classifier training...')
    classifier = NaiveBayesClassifier.train(train_set)
    print('NB classifier is trained with {}% accuracy'.format(
        round(accuracy(classifier, test_set) * 100, 1)))

    # refsets: sample indices per true label; testsets: per predicted label.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    for topic in topics:
        results['Topic'].append(topic)
        results['Precision'].append(
            as_percent(precision(refsets[topic], testsets[topic])))
        results['Recall'].append(
            as_percent(recall(refsets[topic], testsets[topic])))
        results['F-measure'].append(
            as_percent(f_measure(refsets[topic], testsets[topic])))

    # Release the large objects before the final report.
    del classifier, train_set, test_set, refsets, testsets
    gc.collect()

    print(results)
def rte_classifier(trainer, features=rte_features):
    """
    Classify RTEPairs

    Trains on the RTE 1-3 development files, evaluates on the RTE 1-3 test
    files and prints the accuracy plus the ``metrics`` accuracy score and
    classification report.

    :param trainer: callable that receives a list of ``(featureset, label)``
        pairs and returns a classifier with a ``classify`` method.
    :param features: callable mapping an RTE pair to its featureset.
    :return: the trained classifier.
    """
    train = [(pair, pair.value) for pair in nltk.corpus.rte.pairs(
        ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])]
    test = [(pair, pair.value) for pair in nltk.corpus.rte.pairs(
        ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])]

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer([(features(pair), label) for (pair, label) in train])

    # Run the classifier on the test data.  The test pairs are featurized
    # once and reused for both the accuracy and the report below.
    print('Testing classifier...')
    test_featuresets = [(features(pair), label) for (pair, label) in test]
    acc = accuracy(classifier, test_featuresets)
    print('Accuracy: %6.4f' % acc)

    test_label = [label for (_, label) in test_featuresets]
    result = [classifier.classify(feats) for (feats, _) in test_featuresets]
    print(metrics.accuracy_score(test_label, result))
    print(metrics.classification_report(test_label, result))

    # Return the classifier
    return classifier
def train(self):
    """Train ``self.classifier`` and print evaluation metrics.

    All of ``self.positiveFeatures`` and ``self.negativeFeatures`` are used
    for training; the evaluation samples come from ``self.test()``.
    Accuracy and per-label ('pos', 'neg') precision, recall and F-measure
    are printed.  Nothing is returned.
    """
    print('Classifier Training in progress....')
    poscutoff = len(self.positiveFeatures)
    negcutoff = len(self.negativeFeatures)
    print("Train Pos Cutoff: " + str(poscutoff)
          + " Train Neg Cutoff: " + str(negcutoff))

    # The cutoffs equal the full lengths, so everything is used for training.
    trainfeats = self.positiveFeatures + self.negativeFeatures
    testfeats = self.test()
    print('Train on %d instances, test on %d instances'
          % (len(trainfeats), len(testfeats)))

    self.classifier = NaiveBayesClassifier.train(trainfeats)
    print('accuracy: %s' % (accuracy(self.classifier, testfeats),))

    # refsets: sample indices per true label; testsets: per predicted label.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = self.classifier.classify(feats)
        testsets[observed].add(i)

    for label in ('pos', 'neg'):
        reference, predicted = refsets[label], testsets[label]
        print('%s precision: %s'
              % (label, nltk.metrics.precision(reference, predicted)))
        print('%s recall: %s'
              % (label, nltk.metrics.recall(reference, predicted)))
        print('%s F-measure: %s'
              % (label, nltk.metrics.f_measure(reference, predicted)))
def model_test(classifier, test_features):
    """Print the accuracy and the detailed scores of ``classifier``.

    The detailed scores (precisions, recalls, F-measure and confusion
    matrix) are whatever ``get_precision_recall_fmeasure_conf_matrix``
    returns for the given test features.
    """
    overall = accuracy(classifier, test_features)
    print('Model Accuracy: {0}'.format(overall))

    scores = get_precision_recall_fmeasure_conf_matrix(classifier, test_features)
    precisions, recalls, f_measure, conf_matrix = scores

    report = (
        ('Precisions: {0}', precisions),
        ('Recalls: {0}', recalls),
        ('F-Measure: {0}', f_measure),
        ('Confusion Matrix: {0}', conf_matrix),
    )
    for template, value in report:
        print(template.format(value))
def train_with_movie_db(self):
    """
    Training possible with movie reviews
    - this does not yield particularly good results

    Trains ``self.classifier`` on the first three quarters of each polarity
    class of ``movie_reviews`` and logs the accuracy on the remaining
    quarter through ``DLOG``.
    """
    self.use_movie_reviews = True
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(self.feature_extraction_movie_reviews(
        movie_reviews.words(fileids=[f])), "negative") for f in negids]
    posfeats = [(self.feature_extraction_movie_reviews(
        movie_reviews.words(fileids=[f])), "positive") for f in posids]

    # Floor division keeps the cutoffs integral on Python 3, where ``/``
    # would yield a float and make the slices below raise TypeError.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    DLOG("train on %d instances, test on %d instances"
         % (len(trainfeats), len(testfeats)))

    self.classifier = NaiveBayesClassifier.train(trainfeats)
    DLOG("accuracy: " + str(util.accuracy(self.classifier, testfeats)))

    # NLTK's show_most_informative_features() prints its table itself and
    # returns None, so wrapping it in DLOG() only logged ``None``.
    self.classifier.show_most_informative_features()
def rte_classifier():
    """Train a linear SVM on the RTE dev sets and print its test accuracy.

    Training uses the RTE 1-3 development files and evaluation uses
    ``rte1_test.xml`` only.  Featurization is delegated to ``rte_featurize``;
    its extra arguments (``True`` / ``False, test_id=0``) are passed as in
    the original call sites -- see that helper for their meaning.
    Nothing is returned.
    """
    train_set = rte_corpus.pairs(
        ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
    test_set = rte_corpus.pairs(['rte1_test.xml'])

    featurized_train_set = rte_featurize(train_set, True)
    featurized_test_set = rte_featurize(test_set, False, test_id=0)

    # Earlier experiments with Naive Bayes, maxent (GIS/IIS), random forest
    # and decision-tree classifiers were commented out; only the SVM is run.
    print('Training classifier...')
    svm = SklearnClassifier(LinearSVC())
    clf_svm = svm.train(featurized_train_set)

    print('Testing classifier...')
    acc_svm = accuracy(clf_svm, featurized_test_set)
    print('svm Accuracy: %8.4f' % acc_svm)
    # Single-argument print(...) works on both Python 2 and Python 3.
    print('===================================')
def run_classifier_tests(classifier):
    """Print the accuracy of ``classifier`` on the labelled test files.

    Every line of each file is converted to test samples with
    ``create_training_dict(line, sense)``, where ``sense`` is the label the
    file is registered under.

    :param classifier: trained classifier accepted by ``accuracy``.
    """
    testfiles = [
        {'F': 'C:/research/HIT_QIWEI/data/clean/label/F_r.txt'},
        {'H': 'C:/research/HIT_QIWEI/data/clean/label/H_r.txt'},
        {'O': 'C:/research/HIT_QIWEI/data/clean/label/O_r.txt'},
        {'P': 'C:/research/HIT_QIWEI/data/clean/label/P_r.txt'},
        {'S': 'C:/research/HIT_QIWEI/data/clean/label/S_r.txt'},
    ]

    testfeats = []
    for entry in testfiles:
        for sense, loc in entry.items():
            # Iterate over lines: the previous ``open(loc).read()`` yielded
            # single characters and never closed the file.
            with open(loc) as handle:
                for line in handle:
                    testfeats.extend(create_training_dict(line, sense))

    acc = accuracy(classifier, testfeats) * 100
    print('accuracy: %.2f%%' % acc)
def create_and_evaluate_classifier_10_fold(features):
    """
    Uses 10 fold cross validation to create a classifier with naive bayes
    for the given features. Evaluates the created classifier afterwards.
    Results are printed to the console

    :param features: List of features
    :return: the mean accuracy over the ten folds
    """
    k_fold_validator = KFold(n_splits=10, shuffle=False)
    accuracies = []

    for train_index, test_index in k_fold_validator.split(features):
        training_features = [features[i] for i in train_index]
        test_features = [features[i] for i in test_index]

        classifier = NaiveBayesClassifier.train(training_features)
        accuracies.append(util.accuracy(classifier, test_features))

    mean_accuracy = sum(accuracies) / float(len(accuracies))
    # Same console output as before ("Accuracy: <mean> " plus a blank line),
    # written so it runs on both Python 2 and Python 3.
    print('Accuracy: %s \n' % (mean_accuracy,))
    return mean_accuracy
def rte_classifier(algorithm, sample_N=None):
    """Train a maxent RTE classifier and print its test accuracy.

    :param algorithm: one of ``"megam"``, ``"GIS"`` or ``"IIS"``; it is
        forwarded to ``MaxentClassifier.train``.
    :param sample_N: when not None, only the first ``sample_N`` training
        pairs and the first ``sample_N`` test pairs are used.
    :return: the trained classifier.
    :raises Exception: if ``algorithm`` is not one of the supported names.
    """
    from nltk.corpus import rte as rte_corpus

    dev_pairs = rte_corpus.pairs(
        ["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"])
    eval_pairs = rte_corpus.pairs(
        ["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"])

    if sample_N is not None:
        dev_pairs = dev_pairs[:sample_N]
        eval_pairs = eval_pairs[:sample_N]

    dev_featuresets = rte_featurize(dev_pairs)
    eval_featuresets = rte_featurize(eval_pairs)

    # Train the classifier
    print("Training classifier...")
    # MEGAM and the built-in GIS/IIS trainers share the same entry point.
    if algorithm not in ("megam", "GIS", "IIS"):
        raise Exception("RTEClassifier only supports these algorithms:\n "
                        "'megam', 'GIS', 'IIS'.\n")
    model = MaxentClassifier.train(dev_featuresets, algorithm)

    print("Testing classifier...")
    score = accuracy(model, eval_featuresets)
    print("Accuracy: %6.4f" % score)
    return model
def train(cleanedDataCollection, tagPool, test_size=10):
    """Train a multinomial Naive Bayes classifier and print its scores.

    Each ``(document, label)`` pair is featurized with ``extractFeatures``.
    Samples labelled ``"trash"`` form the negative class and all others the
    positive class.  The first ``test_size`` samples of each class are held
    out for evaluation and the rest are used for training.

    :param cleanedDataCollection: iterable of ``(document, label)`` pairs.
    :param tagPool: forwarded unchanged to ``extractFeatures``.
    :param test_size: number of samples per class held out for testing
        (defaults to the previously hard-coded 10).
    :return: the trained ``SklearnClassifier``.
    """
    featuresets = [(extractFeatures(d, tagPool), c)
                   for (d, c) in cleanedDataCollection]
    negSamples = [sample for sample in featuresets if sample[1] == "trash"]
    posSamples = [sample for sample in featuresets if sample[1] != "trash"]

    train_set = negSamples[test_size:] + posSamples[test_size:]
    test_set = negSamples[:test_size] + posSamples[:test_size]

    sk_classifier = SklearnClassifier(MultinomialNB())
    sk_classifier.train(train_set)

    # One-element tuples keep the %-formatting safe for any value type;
    # single-argument print(...) works on Python 2 and Python 3.
    print("accuracy is: %s" % (accuracy(sk_classifier, test_set),))
    precision, recall, fMeasure = precision_recall_fmeasure(
        sk_classifier, test_set, "useful")
    print("precision is: %s" % (precision,))
    print("recall is: %s" % (recall,))
    print("F-measure is: %s" % (fMeasure,))
    return sk_classifier
def rte_classifier(algorithm):
    """Train a maxent RTE classifier and print its test accuracy.

    Trains on the RTE 1-3 development files and evaluates on the RTE 1-3
    test files, both featurized with ``rte_featurize``.

    :param algorithm: ``'megam'``/``'BFGS'`` (``check_megam_config()`` is
        called first) or ``'GIS'``/``'IIS'``; forwarded to
        ``MaxentClassifier.train``.
    :return: the trained classifier.
    :raises Exception: if ``algorithm`` is not one of the supported names.
    """
    from nltk.corpus import rte as rte_corpus

    train_set = rte_corpus.pairs(
        ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
    test_set = rte_corpus.pairs(
        ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)

    # Train the classifier
    print('Training classifier...')
    if algorithm in ['megam', 'BFGS']:  # MEGAM based algorithms.
        # Ensure that MEGAM is configured first.
        check_megam_config()
        # Train right away.  This branch used to bind a lambda, so the
        # accuracy() call below received a function, not a classifier.
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    elif algorithm in ['GIS', 'IIS']:  # Use default GIS/IIS MaxEnt algorithm
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    else:
        err_msg = str(
            "RTEClassifier only supports these algorithms:\n "
            "'megam', 'BFGS', 'GIS', 'IIS'.\n"
        )
        raise Exception(err_msg)

    print('Testing classifier...')
    acc = accuracy(clf, featurized_test_set)
    print('Accuracy: %6.4f' % acc)
    return clf
def main():
    '''
    Main function of the boilerplate code is the entry point of the
    'chitragoopt' executable script (defined in setup.py).

    Loads the pickled classifier from ``my_classifier.pickle``, prints its
    accuracy on the held-out 25% of the ``movie_reviews`` features, and
    prints the label it assigns to the whitespace-split first command-line
    argument (once, then 50 more times).
    '''
    lfeats = label_feats_from_corpus(movie_reviews)
    # Only the test part is needed; the classifier is loaded, not trained.
    _, test_feats = split_label_feats(lfeats, split=0.75)

    words = sys.argv[1].split()
    print(words)
    negfeat = bag_of_words(words)

    # Pickles are binary data: open in 'rb' (text mode fails on Python 3)
    # and let ``with`` close the file even if loading raises.
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open('my_classifier.pickle', 'rb') as f:
        nb_classifier = pickle.load(f)

    print(accuracy(nb_classifier, test_feats))
    print(nb_classifier.classify(negfeat))
    for x in range(0, 50):
        print(nb_classifier.classify(negfeat))
def rte_classifier(algorithm):
    """Train a maxent RTE classifier and print its test accuracy.

    Trains on the RTE 1-3 development files and evaluates on the RTE 1-3
    test files, both featurized with ``rte_featurize``.

    :param algorithm: ``'megam'``/``'BFGS'`` (``check_megam_config()`` is
        called first) or ``'GIS'``/``'IIS'``; forwarded to
        ``MaxentClassifier.train``.
    :return: the trained classifier.
    :raises Exception: if ``algorithm`` is not one of the supported names.
    """
    from nltk.corpus import rte as rte_corpus

    train_set = rte_corpus.pairs(
        ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
    test_set = rte_corpus.pairs(
        ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)

    # Train the classifier
    print('Training classifier...')
    if algorithm in ['megam', 'BFGS']:  # MEGAM based algorithms.
        # Ensure that MEGAM is configured first.
        check_megam_config()
        # Train right away.  This branch used to bind a lambda, so the
        # accuracy() call below received a function, not a classifier.
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    elif algorithm in ['GIS', 'IIS']:  # Use default GIS/IIS MaxEnt algorithm
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    else:
        err_msg = str("RTEClassifier only supports these algorithms:\n "
                      "'megam', 'BFGS', 'GIS', 'IIS'.\n")
        raise Exception(err_msg)

    print('Testing classifier...')
    acc = accuracy(clf, featurized_test_set)
    print('Accuracy: %6.4f' % acc)
    return clf
def train(self):
    """Trains new sentimental analyzer model.

    Tweets from ``self._get_train_data()`` are featurized per polarity; the
    first 90% of each polarity is used for training and the rest for the
    printed accuracy.  The model is stored in ``self._classifier``.

    :return:
    """
    train_share = 0.9
    tweets = self._get_train_data()

    # Each tweet is indexed as (label, text, ...).
    negatives = []
    for tweet in tweets:
        if tweet[0] == self.NEGATIVE:
            negatives.append(
                (word_feats(tokenize_clean_text(tweet[1])), self.NEGATIVE))

    positives = []
    for tweet in tweets:
        if tweet[0] == self.POSITIVE:
            positives.append(
                (word_feats(tokenize_clean_text(tweet[1])), self.POSITIVE))

    # The raw data is no longer needed once both classes are featurized.
    del tweets

    neg_split = round(len(negatives) * train_share)
    pos_split = round(len(positives) * train_share)

    training = negatives[:neg_split] + positives[:pos_split]
    held_out = negatives[neg_split:] + positives[pos_split:]

    # Train Classifier.
    print('train on %d instances, test on %d instances'
          % (len(training), len(held_out)))
    self._classifier = NaiveBayesClassifier.train(training)
    print('accuracy: ', accuracy(self._classifier, held_out))
    self._classifier.show_most_informative_features()
def test(self):
    """Return the accuracy of every per-tag classifier.

    ``self._split_features()`` and ``self.train()`` are called first when
    ``self.test_sets`` / ``self.classifiers`` do not exist yet.

    :return: dict mapping each tag of ``self.classifiers`` to the accuracy
        of its classifier on ``self.test_sets[tag]``.
    """
    if not hasattr(self, 'test_sets'):
        self._split_features()
    if not hasattr(self, 'classifiers'):
        self.train()

    # ``items()`` exists on both Python 2 and Python 3 (``iteritems`` is
    # Python 2 only).
    return {
        tag: accuracy(classifier, self.test_sets[tag])
        for tag, classifier in self.classifiers.items()
    }
def results(train, query_data, query_no_label, query_labels):
    """Train a MEGAM maxent classifier and evaluate it on the query data.

    :param train: labelled featuresets used for training.
    :param query_data: labelled featuresets used for the accuracy.
    :param query_no_label: featuresets (without labels) to predict.
    :param query_labels: true labels in the order of ``query_no_label``.
    :return: ``(accuracy, confusion_matrix)``.
    """
    # Single-argument print(...) works on both Python 2 and Python 3.
    print('\nCalculating final results...')

    # build and train the maxent classifier
    megam_classifier = MaxentClassifier.train(train, 'megam')
    # calculate the classification accuracy
    accu = accuracy(megam_classifier, query_data)
    # get a list of predicted labels
    predicted = megam_classifier.classify_many(query_no_label)
    # build confusion matrix
    cm = confusion_matrix(query_labels, predicted)
    return accu, cm
def implementMethods(self, sents, labelsData, clsent):
    """Train a Naive Bayes classifier on the sentences and print its accuracy.

    :param sents: sentences forwarded to ``self.featureList``.
    :param labelsData: labels forwarded to ``self.featureList``.
    :param clsent: unused; kept for interface compatibility.
    :return: the trained ``NaiveBayesClassifier``.
    """
    # The unused locals ``labelwords`` and ``k`` were removed.
    labelled_features = self.featureList(sents, labelsData)
    train_part, test_part = self.setSplit(labelled_features)

    nb_classifier = NaiveBayesClassifier.train(train_part)
    print('Accuracy = ' + str(accuracy(nb_classifier, test_part) * 100) + '%')
    return nb_classifier
def evaluate_accuracy(self):
    '''Return the mean accuracy over train numbers 1 to 9.

    For each train number the private helper supplies the test features
    and a classifier; the nine accuracies are averaged.
    '''
    labelled = self.label_feats_from_corpus()

    scores = []
    for train_number in range(1, 10):
        parts = self.__get_elements_for_classification(
            labelled, train_number=train_number, classifying=False)
        # The helper returns (train_feats, test_feats, classifier).
        _, held_out, model = parts
        scores.append(accuracy(model, held_out))

    return sum(scores) / len(scores)
def RunBayesNetwork(self, type_of_Feature_extractor):
    """Bayes Network classifier, return accuracy.

    The positive/negative training and testing data of the instance are
    formatted with ``BNFormat.format_data``, a Naive Bayes classifier is
    trained on the training part, its most informative features are shown
    and its accuracy on the testing part is returned.

    :param type_of_Feature_extractor: 1-4 selects
        ``BNFormat.Feature_extractor1`` .. ``Feature_extractor4``; any other
        value calls ``format_data`` without an explicit extractor.
    :return: accuracy on the formatted testing data.
    """
    # Resolve the optional third argument of format_data once.
    for number in (1, 2, 3, 4):
        if type_of_Feature_extractor == number:
            extractor_args = (
                getattr(BNFormat, 'Feature_extractor%d' % number),)
            break
    else:
        extractor_args = ()

    def formatted(samples, label):
        return BNFormat.format_data(samples, label, *extractor_args)

    # Format the positive and negative separately
    training_pos = formatted(self.pos_training_data, "pos")
    training_neg = formatted(self.neg_training_data, "neg")
    # Same again but for the testing data
    testing_pos = formatted(self.pos_testing_data, "pos")
    testing_neg = formatted(self.neg_testing_data, "neg")

    # Combine them
    training_samples = training_pos + training_neg
    testing_samples = testing_pos + testing_neg

    # Train on a list of reviews
    nb_classifier = NaiveBayesClassifier.train(training_samples)
    # Print the features that the NB classifier found to be most important
    # in making classifications
    nb_classifier.show_most_informative_features()
    # Test on another list of reviews
    return accuracy(nb_classifier, testing_samples)
def evaluate_features(feature_extractor, N, only_acc=False):
    """Evaluate a feature extractor with Naive Bayes on ``movie_reviews``.

    Each review (as a list of sentences) is featurized with
    ``feature_extractor``; ``stratifiedSamples(feats, N)`` provides the
    train/test split of each polarity class.

    :param feature_extractor: callable mapping the sentences of one review
        to a featureset.
    :param N: forwarded unchanged to ``stratifiedSamples``.
    :param only_acc: when true, return the accuracy and print nothing.
    :return: the accuracy if ``only_acc`` is true, otherwise the trained
        classifier (after printing accuracy and per-label scores).
    """
    from collections import defaultdict
    from sys import stdout

    from nltk.classify import NaiveBayesClassifier as naive
    from nltk.classify.util import accuracy
    from nltk.corpus import movie_reviews
    from nltk.metrics import precision, recall, f_measure

    negative = movie_reviews.fileids('neg')
    positive = movie_reviews.fileids('pos')

    negfeats = [(feature_extractor(movie_reviews.sents(fileids=[f])), 'neg')
                for f in negative]
    posfeats = [(feature_extractor(movie_reviews.sents(fileids=[f])), 'pos')
                for f in positive]

    negtrain, negtest = stratifiedSamples(negfeats, N)
    postrain, postest = stratifiedSamples(posfeats, N)

    trainfeats = negtrain + postrain
    testfeats = negtest + postest
    classifier = naive.train(trainfeats)

    if only_acc:
        return accuracy(classifier, testfeats)

    # Single-argument print(...) works on both Python 2 and Python 3.
    print('accuracy: {}'.format(accuracy(classifier, testfeats)))

    # Precision, Recall, F-measure
    # refsets: sample indices per true label; testsets: per predicted label.
    refsets = defaultdict(set)
    testsets = defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    for label in ('pos', 'neg'):
        reference, predicted = refsets[label], testsets[label]
        print('%s precision: %s' % (label, precision(reference, predicted)))
        print('%s recall: %s' % (label, recall(reference, predicted)))
        print('%s F-measure: %s' % (label, f_measure(reference, predicted)))

    stdout.flush()
    classifier.show_most_informative_features()
    return classifier
def dt_classify(filename):
    """Train a decision-tree classifier on the samples in ``filename``.

    Samples are read with ``get_samples_stream`` and featurized with
    ``binary_bow_feature``; ``split_samples`` holds out 25% for testing.
    Training and test accuracy are printed.

    :param filename: forwarded to ``get_samples_stream``.
    :return: the trained ``DecisionTreeClassifier``.
    """
    raw_sample_stream = get_samples_stream(filename)
    all_samples = list(binary_bow_feature(raw_sample_stream))

    # Unlike the Naive Bayes variant, the two sparse categories
    # (common.AspectNothing / common.AspectBusiness) are NOT filtered out
    # here; that filter was deliberately left disabled.

    test_sample_ratio = 0.25
    train_samples, test_samples = split_samples(all_samples, test_sample_ratio)
    # Single-argument print(...) works on both Python 2 and Python 3.
    print("training set has {} samples, test set has {} samples".format(
        len(train_samples), len(test_samples)))

    classifier = DecisionTreeClassifier.train(
        train_samples, binary=True, depth_cutoff=15, verbose=True)
    print("training completes")

    print("training accuracy: {}".format(accuracy(classifier, train_samples)))
    print("test accuracy: {}".format(accuracy(classifier, test_samples)))

    return classifier
def find_scores():
    """Train a three-class Naive Bayes classifier and return its scores.

    ``./pos.txt``, ``./neg.txt`` and ``./neu.txt`` provide one sample per
    line for the labels 'positive', 'negative' and 'neutre'.  The first 80%
    of each file is used for training and the rest for testing.

    :return: 7-tuple ``(accuracy, f1, ..., f6)`` where the ``f`` values are
        ``f_measure`` results between sets of test indices (see the return
        statement for the exact pairs).
    """
    # Text formatting to classify
    def format_text(text):
        return ({word: True for word in nltk.word_tokenize(text)})

    def load_labelled(path, label):
        # One [featureset, label] entry per line of the file.
        rows = []
        with open(path, encoding='ISO-8859-1') as f:
            for i in f:
                rows.append([
                    format_text(i.encode("utf-8").decode("unicode-escape")),
                    label
                ])
        return rows

    def head(rows):
        return rows[:int(0.80 * len(rows))]

    def tail(rows):
        return rows[int(0.80 * len(rows)):]

    # Load positive, negative and neutral categorized text
    pos = load_labelled("./pos.txt", 'positive')
    neg = load_labelled("./neg.txt", 'negative')
    neu = load_labelled("./neu.txt", 'neutre')

    # Split data into training(80%) and testing(20%) sets
    training_set = head(pos) + head(neg) + head(neu)
    test_set = tail(pos) + tail(neg) + tail(neu)

    # Training classifier
    classifier = NaiveBayesClassifier.train(training_set)

    # trueset: test indices per true label; testset: per predicted label.
    trueset = collections.defaultdict(set)
    testset = collections.defaultdict(set)

    # Test all test-set items using defined classifier
    for i, (text, label) in enumerate(test_set):
        trueset[label].add(i)
        result = classifier.classify(text)
        testset[result].add(i)

    # NOTE(review): every f_measure below pairs the set of one label with
    # the set of a *different* label, and the first two calls use the same
    # two sets in swapped order -- confirm these pairings are intended.
    return (
        accuracy(classifier, test_set),
        f_measure(trueset['positive'], testset['negative']),
        f_measure(testset['negative'], trueset['positive']),
        f_measure(testset['neutre'], trueset['positive']),
        f_measure(testset['positive'], trueset['neutre']),
        f_measure(testset['negative'], trueset['neutre']),
        f_measure(testset['neutre'], trueset['negative']),
    )
def cross_validate():
    """Run 10-fold cross-validation of a Naive Bayes classifier.

    The data from ``load_training_set()`` is shuffled once, then each fold
    trains on the fold's training indices and evaluates on its evaluation
    indices.  Per-fold range/accuracy and the average accuracy are printed.
    """
    fold_count = 10
    training_set = load_training_set()
    random.shuffle(training_set)

    cv = KFold(len(training_set), n_folds=fold_count, indices=True,
               shuffle=False, random_state=None)

    total = 0
    for traincv, evalcv in cv:
        # Select by index.  Slicing from the first to the last training
        # index pulled the evaluation fold into the training data (the
        # training indices surround it) and dropped the final sample.
        train_fold = [training_set[i] for i in traincv]
        eval_fold = [training_set[i] for i in evalcv]

        classifier = NaiveBayesClassifier.train(train_fold)
        acc = accuracy(classifier, eval_fold)

        # Same console output as before, valid on Python 2 and Python 3.
        print('Range:  %s to %s' % (evalcv[0], evalcv[-1]))
        print('Accuracy: %4.2f' % acc)
        total += acc

    print('Average accuracy: %4.2f' % (total / fold_count))
def classifyReviews():
    '''
    Perform sentiment classification on movie reviews

    Reviews rated 4 are treated as positive and reviews rated 0 as
    negative.  80% of each group trains a Naive Bayes classifier; overall,
    positive-only and negative-only accuracies are printed.
    '''
    # Read the data from the file
    reviews = pd.read_csv("data/movieReviews.csv")

    # For now only the very positive (4) and very negative (0) reviews
    # are used.
    positive = getReviews(reviews, 4)
    negative = getReviews(reviews, 0)

    # Split each data set into training and testing sets.
    pos_split = splitTrainTest(positive, 0.8)
    neg_split = splitTrainTest(negative, 0.8)
    (posTrainText, posTestText) = pos_split
    (negTrainText, negTestText) = neg_split

    # Format the training data and build the training set.
    posTrain = formatForClassifier(posTrainText, 'pos')
    negTrain = formatForClassifier(negTrainText, 'neg')
    training = posTrain + negTrain

    # Format the testing data and build the test set.
    posTest = formatForClassifier(posTestText, 'pos')
    negTest = formatForClassifier(negTestText, 'neg')
    test = posTest + negTest

    # Train a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(training)

    overall = accuracy(classifier, test)
    print("Accuracy of the classifier is: " + str(overall))
    pos_only = accuracy(classifier, posTest)
    print("Accuracy of the positive is: " + str(pos_only))
    neg_only = accuracy(classifier, negTest)
    print("Accuracy of the negative is: " + str(neg_only))

    classifier.show_most_informative_features()
def naivebayes_classify(filename, filter_small_category):
    """Train a Naive Bayes classifier on the samples in ``filename``.

    Samples are read with ``get_samples_stream`` and featurized with
    ``binary_bow_feature``; ``split_samples`` holds out 25% for testing.
    Training and test accuracy and the 10 most informative features are
    printed.

    :param filename: forwarded to ``get_samples_stream``.
    :param filter_small_category: when true, samples whose aspect is
        ``common.AspectNothing`` or ``common.AspectBusiness`` are dropped.
    :return: the trained ``NaiveBayesClassifier``.
    """
    raw_sample_stream = get_samples_stream(filename)
    all_samples = list(binary_bow_feature(raw_sample_stream))

    if filter_small_category:
        # filter out two classes of outliers
        # these two categories contain too few examples, so the word
        # frequency in these two categories cannot reflect the true
        # probability
        all_samples = [
            (features, aspect) for features, aspect in all_samples
            if aspect != common.AspectNothing
            and aspect != common.AspectBusiness
        ]

    test_sample_ratio = 0.25
    train_samples, test_samples = split_samples(all_samples, test_sample_ratio)
    # Single-argument print(...) works on both Python 2 and Python 3.
    print("training set has {} samples, test set has {} samples".format(
        len(train_samples), len(test_samples)))

    classifier = NaiveBayesClassifier.train(train_samples)
    print("training completes")

    print("########## training accuracy: {}".format(
        accuracy(classifier, train_samples)))
    print("########## test accuracy: {}".format(
        accuracy(classifier, test_samples)))
    classifier.show_most_informative_features(n=10)

    return classifier
def run_classifier_tests(classifier):
    """Print the accuracy of ``classifier`` on the labelled files, then exit.

    Every line of each file is converted to test samples with
    ``create_training_dict(line, sense)``.  The function ends by calling
    ``sys.exit()``, so it does not return to its caller.

    :param classifier: trained classifier accepted by ``accuracy``.
    """
    testfiles = [{'traffic': 'traffic-training.txt'},
                 {'useless': 'useless-training.txt'}]

    testfeats = []
    for entry in testfiles:
        # ``items()`` works on Python 2 and 3 (``iteritems`` is 2-only).
        for sense, loc in entry.items():
            # ``with`` closes the file; extend() avoids re-copying the
            # whole list for every line.
            with open(loc, 'r') as handle:
                for line in handle:
                    testfeats.extend(create_training_dict(line, sense))

    acc = accuracy(classifier, testfeats) * 100
    print('accuracy: %.2f%%' % acc)
    sys.exit()
def run_classifier_tests(classifier):
    """Print the accuracy of ``classifier`` on remote files, then exit.

    Every line downloaded from each URL is converted to test samples with
    ``create_training_dict(line, sense)``.  The function ends by calling
    ``sys.exit()``, so it does not return to its caller.

    :param classifier: trained classifier accepted by ``accuracy``.
    """
    testfiles = [
        {'fruit': 'http://litfuel.net/plush/files/disambiguation/apple-fruit-training.txt'},
        {'company': 'http://litfuel.net/plush/files/disambiguation/apple-company-training.txt'},
    ]

    testfeats = []
    for entry in testfiles:
        # ``items()`` works on Python 2 and 3 (``iteritems`` is 2-only).
        for sense, loc in entry.items():
            response = urllib2.urlopen(loc)
            try:
                for line in response:
                    testfeats.extend(create_training_dict(line, sense))
            finally:
                # Release the connection even if featurization fails.
                response.close()

    acc = accuracy(classifier, testfeats) * 100
    print('accuracy: %.2f%%' % acc)
    sys.exit()
def implementMethods(sents, labelsData, clsent):
    """Train a Naive Bayes classifier restricted to high-information words.

    The sentences are tokenized and paired with their labels to compute
    the high-information word set; features are then built with
    ``featureList`` using only words from that set.  The accuracy on the
    test part returned by ``setSplit`` is printed.

    :param sents: sentences; ``labelsData[i]`` is the label of ``sents[i]``.
    :param labelsData: labels, indexed in parallel with ``sents``.
    :param clsent: not used by this function.
    :return: the trained ``NaiveBayesClassifier``.
    """
    labelled_tokens = [
        (labelsData[position], nltk.tokenize.word_tokenize(sentence))
        for position, sentence in enumerate(sents)
    ]
    informative = set(high_information_words(labelled_tokens))

    def detector(words):
        return bag_of_words_in_set(words, informative)

    labelled_features = featureList(sents, labelsData,
                                    feature_detector=detector)
    train_part, test_part = setSplit(labelled_features)

    nb_classifier = NaiveBayesClassifier.train(train_part)
    score = accuracy(nb_classifier, test_part)
    print('Accuracy = ' + str(score * 100) + '%')
    return nb_classifier
def run_classifier_tests(classifier):
    """Print the accuracy of ``classifier`` on the labelled files, then exit.

    Every line of each file is converted to test samples with
    ``create_training_dict(line, sense)``.  The function ends by calling
    ``sys.exit()``, so it does not return to its caller.

    :param classifier: trained classifier accepted by ``accuracy``.
    """
    testfiles = [{
        'traffic': 'traffic-training.txt'
    }, {
        'useless': 'useless-training.txt'
    }]

    testfeats = []
    for entry in testfiles:
        # ``items()`` works on Python 2 and 3 (``iteritems`` is 2-only).
        for sense, loc in entry.items():
            # ``with`` closes the file; extend() avoids re-copying the
            # whole list for every line.
            with open(loc, 'r') as handle:
                for line in handle:
                    testfeats.extend(create_training_dict(line, sense))

    acc = accuracy(classifier, testfeats) * 100
    print('accuracy: %.2f%%' % acc)
    sys.exit()
def random_test(iterations, haiku_labeled, short_labeled, corpus_length):
    """Score Naive Bayes on corpora whose labels were half swapped.

    The first half of ``haiku_labeled`` is relabelled 'not-haiku' and the
    first half of ``short_labeled`` is relabelled 'haiku'.  For every
    iteration, ``corpus_length`` entries are sampled from each mixed
    corpus; the first three quarters train a classifier and the remainder
    tests it.

    :param iterations: number of sample/train/test rounds.
    :param haiku_labeled: entries whose item ``[1]`` is a mutable
        ``[featureset, label]`` pair (item ``[1]`` is what the classifier
        receives).
    :param short_labeled: entries of the same shape for non-haiku poetry.
    :param corpus_length: number of entries sampled from each mixed corpus
        per iteration.
    :return: list with one accuracy per iteration.
    """
    import random

    from nltk.classify import NaiveBayesClassifier
    from nltk.classify.util import accuracy

    def relabelled(entry, label):
        # Work on copies so the caller's corpora keep their true labels;
        # the labels used to be overwritten in place.
        clone = list(entry)
        clone[1] = list(entry[1])
        clone[1][1] = label
        return clone

    # switch labels for half of each corpora
    haiku_half = int(len(haiku_labeled) / 2)
    haiku_a = [relabelled(e, 'not-haiku') for e in haiku_labeled[0:haiku_half]]
    haiku_b = haiku_labeled[haiku_half:]

    poetry_half = int(len(short_labeled) / 2)
    poetry_a = [relabelled(e, 'haiku') for e in short_labeled[0:poetry_half]]
    poetry_b = short_labeled[poetry_half:]

    # create new corpora based on these false labels
    haiku_sample = haiku_b + poetry_a  # real haiku and poetry labeled as haiku
    poetry_sample = haiku_a + poetry_b

    random_scores = []
    # now run the classification
    for _ in range(iterations):
        haiku_random = random.sample(haiku_sample, corpus_length)
        poetry_random = random.sample(poetry_sample, corpus_length)

        # Folds 1-3 of each sample train the model, fold 4 tests it.
        train_end = int(corpus_length / 4) * 3
        train_set = haiku_random[:train_end] + poetry_random[:train_end]
        test_set = haiku_random[train_end:] + poetry_random[train_end:]

        # train the classifier
        nb_classifier = NaiveBayesClassifier.train([e[1] for e in train_set])

        # check accuracy of classifier and store accuracy measure
        random_scores.append(
            accuracy(nb_classifier, [el[1] for el in test_set]))

    return random_scores
def trainDanger():
    """Train a danger/calm Naive Bayes classifier from two text files.

    Each line of ``./anger.txt`` becomes a 'danger' sample and each line of
    ``./calm.txt`` a 'calm' sample (featurized with ``format_sentence``).
    The first 80% of each file trains the classifier; the accuracy on the
    rest is printed along with the most informative features.

    :return: the trained ``NaiveBayesClassifier``.
    """
    def load_labelled(path, label):
        # One [featureset, label] entry per line of the file.
        rows = []
        with open(path) as f:
            for i in f:
                rows.append([format_sentence(i), label])
        return rows

    danger = load_labelled("./anger.txt", 'danger')
    calm = load_labelled("./calm.txt", 'calm')

    danger_cut = int((.8) * len(danger))
    calm_cut = int((.8) * len(calm))
    training = danger[:danger_cut] + calm[:calm_cut]
    test = danger[danger_cut:] + calm[calm_cut:]

    from nltk.classify import NaiveBayesClassifier
    classifier = NaiveBayesClassifier.train(training)

    from nltk.classify.util import accuracy
    print("Test data accuracy" + str(accuracy(classifier, test)))

    classifier.show_most_informative_features()
    return classifier
def cross_validate(self):
    """
    Performs cross validation by training the model on 90% of the corpus
    then checking the accuracy on the remaining 10%.

    Stores the measured accuracy in ``self.accuracy`` and the elapsed
    wall-clock time (seconds) in ``self.validtime``.
    """
    start = time.time()

    feats = self.featureset()
    random.shuffle(feats)

    # Integer division: the offset is used as a slice index.
    offset = len(feats) // 10

    # The first tenth is held out for testing; the remaining nine tenths
    # train the model, as the docstring promises.
    test = feats[:offset]
    train = feats[offset:]

    classifier, _ = self.train(train)
    self.accuracy = accuracy(classifier, test)

    self.validtime = time.time() - start
def rte_classifier(trainer, features=rte_features):
    """
    Train a classifier on the RTE dev pairs and report accuracy on the test pairs.

    :param trainer: callable that receives a list of ``(featureset, label)``
        tuples and returns a classifier.
    :param features: callable mapping an RTE pair to its featureset.
    :return: the trained classifier.
    """
    dev_files = ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml']
    test_files = ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml']

    # Both corpora views are requested up front, before any progress output.
    dev_pairs = nltk.corpus.rte.pairs(dev_files)
    test_pairs = nltk.corpus.rte.pairs(test_files)

    # Train up a classifier.
    print('Training classifier...')
    labelled_train = []
    for pair in dev_pairs:
        label = pair.value
        labelled_train.append((features(pair), label))
    classifier = trainer(labelled_train)

    # Run the classifier on the test data.
    print('Testing classifier...')
    labelled_test = []
    for pair in test_pairs:
        label = pair.value
        labelled_test.append((features(pair), label))
    acc = accuracy(classifier, labelled_test)
    print('Accuracy: %6.4f' % acc)

    return classifier
def cross_validate(): training_set = load_training_set() random.shuffle(training_set) average = 0 cv = KFold(len(training_set), n_folds=10, indices=True, shuffle=False, random_state=None) for traincv, evalcv in cv: classifier = NaiveBayesClassifier.train( training_set[traincv[0]:traincv[len(traincv) - 1]]) acc = accuracy(classifier, training_set[evalcv[0]:evalcv[len(evalcv) - 1]]) print 'Range: ', evalcv[0], 'to', evalcv[len(evalcv) - 1] print 'Accuracy: %4.2f' % acc average += acc print 'Average accuracy: %4.2f' % (average / 10)
def train(cleanedDataCollection, word_features, tagToBeTrained, high_info_wordSet): random.shuffle(cleanedDataCollection) featuresets = [(extractTfIdfFeatureOfADocument(d,word_features, high_info_wordSet), c) for (d,c) in cleanedDataCollection] train_set, test_set = featuresets[800:], featuresets[:800] # classifier = nltk.NaiveBayesClassifier.train(train_set) # print(nltk.classify.accuracy(classifier, test_set)) # classifier.show_most_informative_features(5) # return classifier sk_classifier = SklearnClassifier(MultinomialNB()) sk_classifier.train(train_set) print "accuracy is: %s" % (accuracy(sk_classifier, test_set)) precision, recall, fMeasure = precision_recall_fmeasure(sk_classifier, test_set, tagToBeTrained) print "precision is: %s" % (precision) print "recall is: %s" % (recall) print "F-measure is: %s" % (fMeasure) return sk_classifier
def trainAndTest(trFeatureSets, trLabels, teFeatureSets, teLabels, types, toPrint): classifier = NaiveBayesClassifier.train(mapFeaturesToLabels(trFeatureSets, trLabels)) nuLabels = classifier.classify_many(teFeatureSets) if toPrint: print 'Test labels:', teLabels print 'New labels:', nuLabels print 'Accuracy: %.2f' % accuracy(classifier, mapFeaturesToLabels(teFeatureSets, teLabels)) # classifier.show_most_informative_features() teIndices = getLabelIndices(teLabels, types) nuIndices = getLabelIndices(nuLabels, types) teCounts = Counter(teLabels) nuCounts = Counter(nuLabels) metrices = [] for type in types: matches = len(teIndices[type] & nuIndices[type]) precision = 1.0 * matches / nuCounts[type] if nuCounts[type] > 0 else '-' recall = 1.0 * matches / teCounts[type] if teCounts[type] > 0 else '-' metrices += [[type, precision, recall]] print tabulate(metrices, ['LABEL', 'PRECISION', 'RECALL'], tablefmt='fancy_grid', floatfmt='.2f')
def training():
    """Train and pickle the sentiment classifier, then print a test accuracy.

    Reads ``positive.txt``, ``negative.txt`` and ``emoji.txt`` (latin-1).
    Each line of the sentence files becomes one labelled sample via
    ``features``.  Each non-empty emoji line is stored as a single-key
    featureset: when the second-to-last character is '-', the last two
    characters are stripped and the sample is 'negative'; otherwise the last
    character is stripped and the sample is 'positive'.

    The classifier written to ``classifier.pickle`` is trained on the
    sentence samples only.  A second classifier is trained on 3/4 of the
    sentences plus all emoji samples and scored on the remaining 1/4.
    """
    def read_text(path):
        # Close the file as soon as its content has been read.
        with open(path, 'r', encoding='latin-1') as handle:
            return handle.read()

    pos_sen = read_text("positive.txt")
    neg_sen = read_text("negative.txt")
    emoji = read_text("emoji.txt")

    pos_emoji = []
    neg_emoji = []
    for line in emoji.split('\n'):
        if not line:
            # A blank line (e.g. after a trailing newline) has no marker
            # character to inspect; indexing it used to raise IndexError.
            continue
        if line[-2:].startswith('-'):
            neg_emoji.append(({line[:-2]: True}, 'negative'))
        else:
            pos_emoji.append(({line[:-1]: True}, 'positive'))

    prev = [(features(words), 'positive') for words in pos_sen.split('\n')]
    nrev = [(features(words), 'negative') for words in neg_sen.split('\n')]

    # NOTE(review): the persisted classifier does not see the emoji samples,
    # only the evaluation classifier below does -- confirm this is intended.
    real_classifier = NaiveBayesClassifier.train(prev + nrev)

    # Save to file to avoid training on the data again.
    with open("classifier.pickle", 'wb') as save_doc:
        pickle.dump(real_classifier, save_doc)

    # Accuracy check on a 3/4 - 1/4 split of the sentence samples.
    ncutoff = int(len(nrev) * 3 / 4)
    pcutoff = int(len(prev) * 3 / 4)
    train_set = nrev[:ncutoff] + prev[:pcutoff] + pos_emoji + neg_emoji
    test_set = nrev[ncutoff:] + prev[pcutoff:]
    test_classifier = NaiveBayesClassifier.train(train_set)

    print("Accuracy is : ", util.accuracy(test_classifier, test_set) * 100)
def main():
    """Train a Naive Bayes sentiment classifier on movie_reviews and print its accuracy.

    The first 3/4 of each class trains the classifier; the remaining 1/4 is
    used for the accuracy figure (printed as a percentage).
    """
    # The id lists used to be swapped ('neg' ids were labelled 'positive'
    # and vice versa), which trained a classifier with inverted labels.
    pid = movie_reviews.fileids('pos')
    nid = movie_reviews.fileids('neg')

    prev = [(features(movie_reviews.words(fileids=fileid)), 'positive')
            for fileid in pid]
    nrev = [(features(movie_reviews.words(fileids=fileid)), 'negative')
            for fileid in nid]

    ncutoff = int(len(nrev) * 3 / 4)
    pcutoff = int(len(prev) * 3 / 4)

    train_set = nrev[:ncutoff] + prev[:pcutoff]
    test_set = nrev[ncutoff:] + prev[pcutoff:]

    # NaiveBayesClassifier
    classifier = NaiveBayesClassifier.train(train_set)

    # Accuracy
    print("Accuracy is : ", util.accuracy(classifier, test_set) * 100)
def makePrediction():
    """Train a classifier on high-information words of movie_reviews and print its accuracy.

    Along the way the category labels, a preview of the labelled word lists
    and a preview of the selected high-information words are printed.
    """
    labels = movie_reviews.categories()
    print("Labels for reviews are: {}\n".format(labels) )

    labeled_words = []
    for label in labels:
        labeled_words.append((label, movie_reviews.words(categories=[label])))
    print("Labeled words:{}\n".format(labeled_words[:10]))

    high_info_words = set(Toolbox.high_information_words(labeled_words))
    preview = list(high_info_words)[:10]
    print("High information words:{}\n".format(preview))

    def feat_det(words):
        # Keep only the words that were selected as high-information.
        return Toolbox.bag_of_words_in_set(words, high_info_words)

    lfeats = Toolbox.label_feats_from_corpus(movie_reviews, feature_detector=feat_det)
    train_feats, test_feats = Toolbox.split_label_feats(lfeats)

    mv_classifier = ClassifierTrainer.trainClassifier(train_feats)

    accuracyScore = accuracy(mv_classifier, test_feats)
    print("Accuracy is {}".format(accuracyScore))
def train(self): print 'Classifier Training in progress....' poscutoff = len(self.positiveFeatures) negcutoff = len(self.negativeFeatures) print "Train Pos Cutoff: " + str( poscutoff) + " Train Neg Cutoff: " + str(negcutoff) trainfeats = self.positiveFeatures[: poscutoff] + self.negativeFeatures[: negcutoff] testfeats = self.test() print 'Train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)) self.classifier = NaiveBayesClassifier.train(trainfeats) print 'accuracy:', accuracy(self.classifier, testfeats) refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) for i, (feats, label) in enumerate(testfeats): refsets[label].add(i) observed = self.classifier.classify(feats) #print label, observed testsets[observed].add(i) print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos']) print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos']) print 'pos F-measure:', nltk.metrics.f_measure(refsets['pos'], testsets['pos']) print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg']) print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg']) print 'neg F-measure:', nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
def trainLove():
    """Train a 'pos' vs 'neg' Naive Bayes classifier from two text files.

    Every line of ``./pos_rom.txt`` / ``./neg_rom.txt`` is lower-cased,
    stripped of common punctuation, passed through ``format_sentence`` and
    labelled 'pos' / 'neg'.  The first 80% of each class trains the
    classifier; the rest is used to print a test accuracy.

    :return: the trained classifier.
    """
    # Delete common chars like comma.  Raw string: the pattern relies on
    # backslash escapes that are not valid Python string escapes.
    regex = re.compile(r'[\[,\.!?—\]]')

    pos = []
    with open("./pos_rom.txt") as f:
        for i in f:
            pos.append([format_sentence(regex.sub('', i.lower())), 'pos'])

    neg = []
    with open("./neg_rom.txt") as f:
        for i in f:
            neg.append([format_sentence(regex.sub('', i.lower())), 'neg'])

    # Next, split labeled data into the training and test data.
    pos_cut = int((.8) * len(pos))
    neg_cut = int((.8) * len(neg))
    training = pos[:pos_cut] + neg[:neg_cut]
    test = pos[pos_cut:] + neg[neg_cut:]

    from nltk.classify import NaiveBayesClassifier
    classifier = NaiveBayesClassifier.train(training)

    from nltk.classify.util import accuracy
    print("Test data accuracy" + str(accuracy(classifier, test)))
    classifier.show_most_informative_features()

    return classifier
def main():
    """Train a Naive Bayes sentiment classifier on movie_reviews and print its accuracy.

    The first 3/4 of each class trains the classifier; the remaining 1/4 is
    used for the accuracy figure (printed as a percentage).
    """
    # Import the movie_reviews file ids per category.  The two lists used to
    # be swapped ('neg' ids ended up labelled 'positive' and vice versa),
    # which trained a classifier with inverted labels.
    pid = movie_reviews.fileids('pos')
    nid = movie_reviews.fileids('neg')

    prev = [(features(movie_reviews.words(fileids=fileid)), 'positive')
            for fileid in pid]
    nrev = [(features(movie_reviews.words(fileids=fileid)), 'negative')
            for fileid in nid]

    ncutoff = int(len(nrev) * 3 / 4)
    pcutoff = int(len(prev) * 3 / 4)

    # Training and testing sets
    train_set = nrev[:ncutoff] + prev[:pcutoff]
    test_set = nrev[ncutoff:] + prev[pcutoff:]

    # Train the text classification model and evaluate it
    classifier = NaiveBayesClassifier.train(train_set)

    # Accuracy
    print ("Accuracy is : ", util.accuracy(classifier, test_set) * 100)
def split_label_feats(lfeats, split=0.90):
    """Split per-label feature lists into labelled train and test sets.

    Each label's list is shuffled IN PLACE, then the first ``split`` fraction
    goes to the training set and the remainder to the test set.

    :param lfeats: mapping of label -> list of featuresets.
    :param split: fraction of each label's featuresets used for training.
    :return: ``(train_feats, test_feats)``, two lists of
        ``(featureset, label)`` tuples.
    """
    train_feats = []
    test_feats = []
    # items() / one-argument shuffle behave the same as the former
    # iteritems() / shuffle(feats, random.random) and also exist on Python 3.
    for label, feats in lfeats.items():
        random.shuffle(feats)
        cutoff = int(len(feats) * split)
        train_feats.extend([(feat, label) for feat in feats[:cutoff]])
        test_feats.extend([(feat, label) for feat in feats[cutoff:]])
    return train_feats, test_feats


# Interactive-session style walkthrough: the bare expressions below only
# produce visible output when run in a REPL.
reader.categories()
lfeats = label_feats_from_corpus(reader)
lfeats.keys()
train_feats, test_feats = split_label_feats(lfeats)
len(train_feats)
len(test_feats)

from nltk.classify import NaiveBayesClassifier
nb_classifier = NaiveBayesClassifier.train(train_feats)
nb_classifier.labels()

from nltk.classify.util import accuracy
accuracy(nb_classifier, test_feats)
from nltk.classify.util import accuracy def bag_of_words(words): return dict([(word, True) for word in words]) train_feats =[]; test_feats=[]; #print(movie_reviews.categories()); #list categories #There are 1000 positive files and 1000 negative files for label in movie_reviews.categories(): #print [label]; #display neg,pos i=0; for fileid in movie_reviews.fileids([label]): i=i+1; words=movie_reviews.words(fileid); bag=bag_of_words(words); features=[bag,label]; if i < 750: train_feats.append(features); else: test_feats.append(features); #print len(train_feats); classifier = NaiveBayesClassifier.train(train_feats); #Naive bayes classifier #test the data posi=bag_of_words(['kate','winslet','is','accessible']); nega=bag_of_words(['the','plot','was','ludicrous']); print classifier.classify(nega); print classifier.classify(posi); #display accuracy print(accuracy(classifier,test_feats));
# Train a Naive Bayes classifier on the prepared training set.
classifier = nltk.NaiveBayesClassifier.train(training_set)


def train(labeled_featuresets, estimator=ELEProbDist):
    # NOTE(review): looks like an illustrative outline rather than working
    # code -- `label_freqdist` is not defined inside this function and
    # `labeled_featuresets` is never used.  Nothing in this block calls it.
    # Create the P(label) distribution
    label_probdist = estimator(label_freqdist)
    # Create the P(fval|label, fname) distribution
    feature_probdist = {}
    return NaiveBayesClassifier(label_probdist, feature_probdist)


#print "Most informative features"
#print classifier.show_most_informative_features(32)
print "-"*50
# Classify one sample tweet and print the predicted label.
tweet = 'Multiple users on one machine never seemed to behave like youd expect'
print tweet
print "___/^"
#print extract_features(tweet.split())
print classifier.classify(extract_features(tweet.split()))
print "-"*50
print listare[0]
# The classification results of this loop are discarded.
for texte in listare:
    classifier.classify(extract_features(texte[0]))
# Accuracy is measured on the same set the classifier was trained on.
acc = accuracy(classifier, training_set) * 100
print 'accuracy: %.2f%%' % acc
feature[word] = True pdata.append((feature, 'POSITIVE')) ndata = [] fileids = nc.movie_reviews.fileids('neg') for fileid in fileids: feature = {} words = nc.movie_reviews.words(fileid) for word in words: feature[word] = True ndata.append((feature, 'NEGATIVE')) pnumb, nnumb = \ int(0.8 * len(pdata)), int(0.8 * len(ndata)) train_data = pdata[:pnumb] + ndata[:nnumb] test_data = pdata[pnumb:] + ndata[nnumb:] model = cf.NaiveBayesClassifier.train(train_data) ac = cu.accuracy(model, test_data) print('%.2f%%' % round(ac * 100, 2)) reviews = [ 'It is an amazing movie.', 'This is a dull movie. I would never recommand it to anyone.', 'The cinemagraphy is pretty great in this movie', 'The direction was terrible and the story was all over the place' ] sents, probs = [], [] for review in reviews: feature = {} words = review.split(' ') for word in words: feature[word] = True pcls = model.prob_classify(feature) sent = pcls.max()
print 'training a multi-binary %s classifier' % args.algorithm classifier = MultiBinaryClassifier.train(labels, train_feats, trainf, **train_kwargs) else: if args.trace: print 'training a %s classifier' % args.algorithm classifier = trainf(train_feats, **train_kwargs) ################ ## evaluation ## ################ if not args.no_eval: if not args.no_accuracy: print 'accuracy: %f' % accuracy(classifier, test_feats) if args.multi and args.binary and not args.no_masi_distance: print 'average masi distance: %f' % (scoring.avg_masi_distance(classifier, test_feats)) if not args.no_precision or not args.no_recall or not args.no_fmeasure: if args.multi and args.binary: refsets, testsets = scoring.multi_ref_test_sets(classifier, test_feats) else: refsets, testsets = scoring.ref_test_sets(classifier, test_feats) for label in labels: ref = refsets[label] test = testsets[label] if not args.no_precision:
def wordsInCorpus(corpus): words = list() for line in corpus: for word in line.split(): if word not in stopWords: words.append(normalize(word)) return words posCorpus = open("finalPositiveCorpus.txt", "r") poswords = wordsInCorpus(posCorpus) negCorpus = open("finalNegativeCorpus.txt", "r") negwords = wordsInCorpus(negCorpus) #list of tuples (word, label) for every non-stop word that occurs in each corpus labeled_features = ([(word, 'pos') for word in poswords] + [(word, 'neg') for word in negwords]) import random random.shuffle(labeled_features) cutOff = len(labeled_features) * 3/4 train_set, test_set = labeled_features[cutOff:], labeled_features[:cutOff] #this is where the script crashes #This WOULD work if NaiveBayesClassifier.train() worked on lists #But it only works on dictionaries, contrary to what documentation says classifier = NaiveBayesClassifier.train(train_set) print 'accuracy:', accuracy(classifier, test_set)
arquivoClassificador.close()

# Load the already-classified answers; the file is closed even if parsing fails.
with open('classificados.json') as arquivoClassificados:
    classificados = ujson.load(arquivoClassificados)

sentimentos = {}
featuresClassificados = []

# Sequential analysis: build a bag-of-words featureset (stop words removed)
# for every answer, classify it and remember the reference sentiment.
comeco = datetime.now()
for resposta in classificados:
    texto = resposta['corpo']
    frases = tokenizerFrases.tokenize(texto)
    feature = {}
    for frase in frases:
        palavras = tokenizerPalavras.tokenize(frase)
        palavras = [palavra for palavra in palavras if palavra not in stopWords]
        for palavra in palavras:
            feature[palavra] = True
    sentimentos[texto] = (resposta, classificador.classify(feature))
    featuresClassificados.append((feature, resposta['sentimento']))
tempo = datetime.now() - comeco

# Write the elapsed time and the classifier accuracy against the reference
# sentiments.
with open('medicoes_analise_sequencial.txt', 'w') as arquivoMedicoes:
    arquivoMedicoes.write('Tempo de Execução = ' + str(tempo) + '\nPrecisão = {0:.2f}%'.format(accuracy(classificador, featuresClassificados) * 100))

# Write one CSV row per distinct answer text: text, points, predicted
# sentiment and reference sentiment.
with open('resultados_sem_stopwords.csv', 'w', newline='') as arquivoResultados:
    w = writer(arquivoResultados, delimiter=',')
    linhas = [['Resposta', 'Pontos', 'Sentimento - Naive Bayes', 'Sentimento - AlchemyAPI']]
    for texto, tupla in sentimentos.items():
        resposta = tupla[0]
        linhas.append([texto, resposta['pontos'], tupla[1], resposta['sentimento']])
    w.writerows(linhas)
def word_features(words): return dict([(word, True) for word in words]) # Get all the movie reviews with positive data set and negative data set posRev = movie_reviews.fileids('pos') negRev = movie_reviews.fileids('neg') # Mark the words in data set as positive and negative: posWords = [(word_features(movie_reviews.words(fileids=[f])), 'pos') for f in posRev] negWords = [(word_features(movie_reviews.words(fileids=[f])), 'neg') for f in negRev] # Set cut off for separating the training data and the testing data: posCutoff = len(posWords) * 50 / 100 negCutoff = len(negWords) * 50 / 100 # Fill the training data and the testing data with positive and negative data set: TestRev = posWords[posCutoff:] + negWords[negCutoff:] Test_set = array(TestRev) TrainRev = posWords[:posCutoff] + negWords[:negCutoff] Train_set = array(TrainRev) print 'train on %d instances, test on %d instances' % (len(Train_set), len(Test_set)) # Call Maximum Entropy classifier to classify the training data: algo = MaxentClassifier.ALGORITHMS[0] classifier = MaxentClassifier.train(Train_set, algorithm=algo, max_iter=3) classifier.show_most_informative_features(10) # Print the algorithm accuracy print 'Accuracy is', util.accuracy(classifier, Test_set)
elif sentimento == 'negativo': negativos.append(resposta) else: neutros.append(resposta) threads = [] comeco = datetime.now() for resposta in classificados: thread = ThreadSentimento(resposta) threads.append(thread) thread.start() for thread in threads: thread.join() tempo = datetime.now() - comeco iteracao = iteracao + 1 arquivoMedicoes = open('medicoes_analise_threads_' + str(iteracao) + '.txt', 'w') precisao = accuracy(classificador, featuresClassificados) * 100 arquivoMedicoes.write('Tempo de Execução = ' + str(tempo) + '\nPrecisão = {0:.2f}%'.format(precisao)) arquivoMedicoes.close() features = resultadoPositivos.get() + resultadoNegativos.get() + resultadosNeutros.get() pool1.terminate() pool1.close() pool2.terminate() pool2.close() pool3.terminate() pool3.close() if precisao > 50: features.extend(featuresClassificados) shuffle(features) classificador = NaiveBayesClassifier.train(features) arquivoClassificador = open('classificador.pickle', 'wb') dump(classificador, arquivoClassificador, protocol=HIGHEST_PROTOCOL)