def test_reviews_bag(self):
    """Classify sentiment using bag-of-words"""
    train, dev, test = self.split_review_corpus(BagOfWords)
    classifier = MaxEnt()
    classifier.train(train, dev)
    acc = accuracy(classifier, test)
    self.assertGreater(acc, 0.55)
def test_names_nltk(self):
    """Classify names using NLTK features"""
    train, dev, test = self.split_names_corpus()
    classifier = MaxEnt()
    classifier.train(train, dev)
    acc = accuracy(classifier, test)
    self.assertGreater(acc, 0.70)
def test_names_nltk(self):
    """Classify names using NLTK features"""
    train, dev, test = self.split_names_corpus()
    classifier = MaxEnt()
    classifier.train(train, dev, 0.1)
    acc = accuracy(classifier, test)
    self.assertGreater(acc, 0.70)
def test_sparse_vector(self):
    maxent = MaxEnt()
    _ = maxent.compute_observed_counts(self.training_set)
    woof_index = maxent.feature_alphabet.get_index('woof')
    meow_index = maxent.feature_alphabet.get_index('meow')
    sparse_vector = self.training_set[1].data
    self.assertEqual(meow_index, sparse_vector[0])
    self.assertEqual(woof_index, sparse_vector[1])
def test_model():
    train, dev, test, labels = get_data()
    classifier = MaxEnt(train, labels)
    classifier.train(train, dev, 0.001, 100)
    print("Test result", accuracy(classifier, test))
def test_reviews(self):
    """Classify sentiment using bag-of-words"""
    reviews = ReviewCorpus('yelp_reviews.json', document_class=Review, numLines=15000)
    train, dev, test = self.split_review_corpus(reviews)
    print 'train length = ', len(train), ' dev length = ', len(dev), ' test length = ', len(test)
    print 'number of features = ', len(train[0].features())
    classifier = MaxEnt()
    classifier.train(train, dev)
    self.assertGreater(accuracy(classifier, test), 0.55)
def test_label_loglikelihood_score(self):
    maxent = MaxEnt()
    _ = maxent.compute_observed_counts(self.training_set)
    maxent.parameters = numpy.array([0.1, 0.5, 1, 0, 100, -0.1, -0.5, -1, 0, -100])
    score_vector = maxent.compute_label_unnormalized_loglikelihood_vector(self.training_set[0].data)
    self.assertEqual(score_vector[0], 1.6)
    self.assertEqual(score_vector[1], -1.6)
    posterior_distribution = maxent.compute_posterior_distribution(self.training_set[0])
    self.assertAlmostEqual(posterior_distribution[0], 0.96, 2)
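# Sanity check for the expected posterior in the test above (a worked step, not
# part of the original test): normalizing the unnormalized log-likelihood scores
# [1.6, -1.6] with a softmax gives exp(1.6) / (exp(1.6) + exp(-1.6)) ≈ 0.9608,
# which rounds to 0.96 at two decimal places, as asserted.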
def test_posterior(self):
    classifier = MaxEnt()
    classifier.labelsToWeights = {'cat': [2, -3], 'dog': [-2, 3]}
    self.assertAlmostEqual(classifier.posterior('cat', np.array([0, 1])), .00247, 4)
    self.assertAlmostEqual(classifier.posterior('cat', np.array([1, 0])), .98201, 4)
    self.assertAlmostEqual(classifier.posterior('dog', np.array([0, 1])), .99753, 4)
    self.assertAlmostEqual(classifier.posterior('dog', np.array([1, 0])), .01799, 4)
    self.assertAlmostEqual(classifier.posterior('cat', np.array([1, 1])), .11920, 4)
    self.assertAlmostEqual(classifier.posterior('dog', np.array([1, 1])), .88080, 4)
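# The expected values in test_posterior follow from a softmax over the per-label
# scores w_label . x. A minimal self-contained sketch that reproduces them
# (`softmax_posterior` is a hypothetical helper, not the classifier's actual code):
import numpy as np

def softmax_posterior(label, x, labels_to_weights):
    """P(label | x) = exp(w_label . x) / sum_y exp(w_y . x)."""
    scores = {y: np.dot(w, x) for y, w in labels_to_weights.items()}
    z = sum(np.exp(s) for s in scores.values())
    return np.exp(scores[label]) / z

weights = {'cat': np.array([2., -3.]), 'dog': np.array([-2., 3.])}
print(softmax_posterior('cat', np.array([1, 0]), weights))  # ~0.98201
print(softmax_posterior('cat', np.array([1, 1]), weights))  # ~0.11920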
def test_gradient(self):
    classifier = MaxEnt()
    classifier.labelsToWeights = {'cat': [2, -3], 'dog': [-2, 3]}
    dataset = [Animal(np.array([1, 0]), 'cat'),
               Animal(np.array([0, 1]), 'cat'),
               Animal(np.array([0, 1]), 'dog'),
               Animal(np.array([0, 1]), 'dog'),
               Animal(np.array([1, 1]), 'cat')]
    gradient = classifier.gradient(dataset)
    self.assertAlmostEqual(gradient['cat'][0], .89879, 4)
    self.assertAlmostEqual(gradient['cat'][1], 1.87339, 4)
    self.assertAlmostEqual(gradient['dog'][0], -.89879, 4)
    self.assertAlmostEqual(gradient['dog'][1], -1.87339, 4)
def test_observed_counts(self):
    classifier = MaxEnt()
    classifier.labelsToWeights = {'cat': [2, -3], 'dog': [-2, 3]}
    dataset = [Animal(np.array([1, 0]), 'cat'),
               Animal(np.array([0, 1]), 'cat'),
               Animal(np.array([0, 1]), 'dog'),
               Animal(np.array([0, 1]), 'dog'),
               Animal(np.array([1, 1]), 'cat')]
    observedCounts = classifier.observedCounts(dataset)
    self.assertEqual(observedCounts['cat'][0], 2)
    self.assertEqual(observedCounts['cat'][1], 2)
    self.assertEqual(observedCounts['dog'][0], 0)
    self.assertEqual(observedCounts['dog'][1], 2)
def test_prediction(self):
    maxent = MaxEnt()
    maxent.gaussian_prior_variance = numpy.Infinity
    maxent.train(self.training_set)
    predictions = [maxent.classify_instance(x) for x in self.training_set]
    self.assertEqual(predictions[0], 'cat')
    self.assertEqual(predictions[1], 'dog')
    self.assertEqual(predictions[2], 'cat')
    self.assertEqual(predictions[3], 'dog')
    self.assertEqual(predictions[4], 'dog')
def test_expected_counts(self):
    classifier = MaxEnt()
    classifier.labelsToWeights = {'cat': [2, -3], 'dog': [-2, 3]}
    dataset = [Animal(np.array([1, 0]), 'cat'),
               Animal(np.array([0, 1]), 'cat'),
               Animal(np.array([0, 1]), 'dog'),
               Animal(np.array([0, 1]), 'dog'),
               Animal(np.array([1, 1]), 'cat')]
    expectedCounts = classifier.expectedModelCounts(dataset)
    self.assertAlmostEqual(expectedCounts['cat'][0], 1.10121, 4)
    self.assertAlmostEqual(expectedCounts['cat'][1], .12661, 4)
    self.assertAlmostEqual(expectedCounts['dog'][0], .89879, 4)
    self.assertAlmostEqual(expectedCounts['dog'][1], 3.87339, 4)
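# The gradient, observed-count, and expected-count tests above fit together:
# for a maximum entropy model the gradient of the log-likelihood with respect
# to each label's weights is (observed feature counts) minus (expected counts
# under the model). The asserted numbers are consistent with that identity,
# e.g. for 'cat': [2, 2] - [1.10121, 0.12661] = [0.89879, 1.87339], exactly
# what test_gradient checks. A hedged sketch of the computation, using plain
# (features, label) pairs instead of the Animal class:
import numpy as np

def counts_and_gradient(data, labels_to_weights):
    """Observed counts, expected model counts, and their difference per label."""
    labels = list(labels_to_weights)
    dim = len(labels_to_weights[labels[0]])
    observed = {y: np.zeros(dim) for y in labels}
    expected = {y: np.zeros(dim) for y in labels}
    for x, label in data:
        observed[label] += x
        scores = np.array([np.dot(labels_to_weights[y], x) for y in labels])
        probs = np.exp(scores) / np.exp(scores).sum()
        for y, p in zip(labels, probs):
            expected[y] += p * x
    gradient = {y: observed[y] - expected[y] for y in labels}
    return observed, expected, gradient

data = [(np.array([1, 0]), 'cat'), (np.array([0, 1]), 'cat'),
        (np.array([0, 1]), 'dog'), (np.array([0, 1]), 'dog'),
        (np.array([1, 1]), 'cat')]
obs, expct, grad = counts_and_gradient(data, {'cat': np.array([2., -3.]), 'dog': np.array([-2., 3.])})
print(grad['cat'])  # ~[0.89879, 1.87339], matching test_gradient above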
def test_save_load(self):
    reviews = ReviewCorpus('yelp_reviews.json', document_class=Review, numLines=5000)
    train, dev, test = self.split_review_corpus(reviews)
    classifier1 = MaxEnt()
    classifier1.train(train, dev)
    classifier1.save('trained.model')
    classifier2 = MaxEnt()
    classifier2.load('trained.model')
    for doc in reviews[:500]:
        self.assertEqual(classifier1.classify(doc), classifier2.classify(doc))
def test_reviews_bag(self):
    """Classify sentiment using bag-of-words"""
    train, dev, test, labels, features = self.split_review_corpus(Review)
    classifier = MaxEnt()
    classifier.train(train, labels, features, dev)
    classifier.save("mikes_model")
    class2 = MaxEnt()
    class2.load("mikes_model")
    self.assertGreater(accuracy(classifier, test), 0.55)
def test_observed_counts(self):
    maxent = MaxEnt()
    observed_counts = maxent.compute_observed_counts(self.training_set)
    cat_index = maxent.label_alphabet.get_index('cat')
    purr_index = maxent.feature_alphabet.get_index('purr')
    meow_index = maxent.feature_alphabet.get_index('meow')
    bark_index = maxent.feature_alphabet.get_index('bark')
    woof_index = maxent.feature_alphabet.get_index('woof')
    cat_related_indices = maxent.get_parameter_indices(
        [purr_index, meow_index, bark_index, woof_index], cat_index)
    cat_count, cat_purr_count, cat_meow_count, cat_bark_count, cat_woof_count = \
        observed_counts[cat_related_indices]
    self.assertEqual(cat_count, 3)
    self.assertEqual(cat_purr_count, 2)
    self.assertEqual(cat_meow_count, 2)
    self.assertEqual(cat_bark_count, 0)
    self.assertEqual(cat_woof_count, 1)
def test_names_nltk(self):
    """Classify names using NLTK features"""
    train, dev, test = self.split_names_corpus()
    classifier = MaxEnt()
    classifier.labels = ['female', 'male']
    classifier.vocab = self.get_names_vocab(train)
    classifier.feature_vectors(train + dev + test)
    classifier.train(train, dev)
    acc = accuracy(classifier, test)
    self.assertGreater(acc, 0.70)
def test_reviews_bag(self):
    """Classify sentiment using bag-of-words"""
    train, dev, test = self.split_review_corpus(BagOfWords)
    classifier = MaxEnt()
    classifier.labels = ['positive', 'negative', 'neutral']
    classifier.vocab = self.get_review_vocab(train)
    print('...creating sparse feature vectors...')
    classifier.feature_vectors(train + dev + test)
    print('...training...')
    classifier.train(train, dev)
    self.assertGreater(accuracy(classifier, test), 0.55)
# train_set, test_set = featuresets[:], testsets[:]
# classifier = nltk.NaiveBayesClassifier.train(train_set)
# print nltk.classify.accuracy(classifier, test_set)
# #print classifier.labels()
# CM = test_classifier(classifier, test_set)
# CM.print_out()

# ----------------------------------
# MaxEnt training
# ME = MaxEnt()
# ME.train(instance_list)
# ME.save("dependency_parsing_classifier.json")
# finish training
# ----------------------------------

# Testing: load the trained MaxEnt classifier, use it to decode each test
# sentence, and write the parsing result to parser.conll
ME = MaxEnt.load("dependency_parsing_classifier.json")
CM = test_classifier(ME, test_instance_list)
CM.print_out()
tranSys = TranSys(transition_codebook)
wfile = open('parser.conll', 'w')
for test_sentence in test_sentence_instances:
    new_sentence = tranSys.decode_parser(ME, test_sentence)
    for element in new_sentence:
        if element[0] != 0:
            #wfile.write('{0:<10}{1:<15}{2:<10}{3:<10}{4:<10}{5:<10}{6:<10}{7:<10}{8:<10}{9:<10}'.format(element[0], element[1], '_', element[2], element[2], '_', element[3], '_', '_', '_'))
            wfile.write(str(element[0]) + '\t' + str(element[1]) + '\t' + '_' + '\t' + str(element[2]) + '\t' + str(element[2]) + '\t' + '_' + '\t' + str(element[3]) + '\t' + str(element[4]) + '\t' + '_' + '\t' + '_')
            wfile.write("\n")
    wfile.write("\r\n")
wfile.close()
class Gender:
    vowels = ['a', 'e', 'i', 'o', 'u', 'y']

    def char_is(self, n, char):
        return lambda word: abs(n) < len(word) and word[n] == char

    def length(self, word):
        return len(word)

    def char_in(self, char):
        return lambda word: char in word

    def letter_vowel(self, char):
        return char in self.vowels

    def num_vowels(self, word):
        return len([c for c in word if self.letter_vowel(c)])

    def num_consonants(self, word):
        return len(word) - self.num_vowels(word)

    def vowel_ratio(self, word):
        return self.num_vowels(word) / float(len(word))

    def is_vowel(self, n):
        return lambda word: abs(n) < len(word) and word[n] in self.vowels

    def contains(self, c):
        return lambda word: c in word.lower()

    def is_capital(self, n):
        return lambda word: abs(n) < len(word) and word[n].isupper()

    def __init__(self):
        features = [self.length, self.num_vowels, self.num_consonants,
                    self.vowel_ratio, self.is_capital(0)]
        for c1 in string.ascii_lowercase:
            features.append(self.contains(c1))
            for c2 in string.ascii_lowercase:
                features.append(self.contains(c1 + c2))
        for i in range(0, 15):
            for c in string.ascii_lowercase:
                features.append(self.char_is(i, c))
            features.append(self.is_vowel(i))
        for i in range(1, 5):
            for c in string.ascii_lowercase:
                features.append(self.char_is(-1 * i, c))
            features.append(self.is_vowel(-1 * i))
        self.classifier = MaxEnt(classes=["male", "female", "other"], features=features)

    def train(self, names):
        return self.classifier.train(names)

    def guess(self, word):
        if word in ['she', 'her']:
            return 'female'
        elif word in ['he', 'him']:
            return 'male'
        else:
            return self.classifier.predict(word)
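# The Gender class builds its feature set out of callables: fixed ones such as
# length and num_vowels, plus closures returned by contains(), char_is(),
# is_vowel(), and is_capital() that bake a parameter into a word -> value
# function. A minimal self-contained sketch of the same pattern (hypothetical
# helper names, independent of the MaxEnt constructor used above):
import string

def contains(c):
    # closure: fixes the character, returns a word -> bool feature
    return lambda word: c in word.lower()

def char_is(n, char):
    # closure: fixes a position and a character
    return lambda word: abs(n) < len(word) and word[n] == char

features = [len] + [contains(c) for c in string.ascii_lowercase] + [char_is(-1, 'a')]
print([f('Anna') for f in features[:3]])  # [4, True, False]: length, contains 'a', contains 'b'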
def test_names_nltk(self):
    train, dev, test = self.split_names_corpus()
    classifier = MaxEnt()
    classifier.train(train, dev)
    acc = accuracy(classifier, test)
    self.assertGreater(acc, 0.70)
def test_reviews_bag(self):
    """Classify sentiment using bag-of-words"""
    train, dev, test = self.split_review_corpus(BagOfWords)
    classifier = MaxEnt()
    classifier.train(train, dev)
    self.assertGreater(accuracy(classifier, test), 0.55)
class BagOfWords(Document):
    def features(self):
        """Trivially tokenized words."""
        # trivial whitespace tokenization of the raw document text
        return self.data.split()

class Name(Document):
    def features(self):
        name = self.data
        return ['First=%s' % name[0], 'Last=%s' % name[-1]]

class Bigram(Document):
    def features(self):
        # different feature-generating mode: adjacent word pairs
        words = self.data.split()
        return [' '.join(pair) for pair in zip(words, words[1:])]

classifier = MaxEnt()
instances = ReviewCorpus('yelp_reviews.json', document_class=BagOfWords)

# ## experiment 1
# print('experiment 1')
# y1 = []
# x = []
# lengths = [1000, 10000, 50000, 100000, len(instances.documents)]
# for length in lengths:
#     score = classifier.train(instances, maxlength=length, batch_size=30, l2_value=0.1, dev_instances=None)
#     print("score:", score)
#     y1.append(score)
#     x.append(str(length))
#
# plt.plot(x, y1)
# for xy in zip(x, y1):
        self.assertGreater(acc, 0.70)

    def split_review_corpus(self, document_class):
        """Split the yelp review corpus into training, dev, and test sets"""
        reviews = ReviewCorpus('yelp_reviews.json', document_class=document_class)
        seed(hash("reviews"))
        shuffle(reviews)
        return (reviews[:10000], reviews[10000:11000], reviews[11000:14000])

    def test_reviews_bag(self):
        """Classify sentiment using bag-of-words"""
        train, dev, test = self.split_review_corpus(BagOfWords)
        classifier = MaxEnt()
        classifier.train(train, dev)
        self.assertGreater(accuracy(classifier, test), 0.55)

'''
if __name__ == '__main__':
    docs = Corpus('./gold_standard_all.txt', True)
    seed(time.time())
    shuffle(docs)
    first_80 = round(0.8 * len(docs))
    second_10 = first_80 + round(0.1 * len(docs))
    train, dev, test = (docs[:first_80], docs[first_80:second_10], docs[second_10:])
    classifier = MaxEnt()
    classifier.train(train, dev)
    print(accuracy(classifier, test))
def main():
    parser = AP.ArgumentParser(description="A command-line interface for "
                                           "the maximum entropy classifier.")
    parser.add_argument("-d", "--datafile", action="store", default="blog-gender-dataset.txt",
                        help="specify the input data file (default: blog-gender-dataset.txt)")
    parser.add_argument("-g", "--gaussian_prior", dest="gpv", action="store",
                        help="specify the Gaussian prior variance")
    parser.add_argument("-m", "--mode", dest="mode", action="store", default="train",
                        help="run as train, train/test, exp(eriment)1, exp(eriment)2, exp(eriment)3")
    parser.add_argument("-s", "--save", dest="outfile", action="store", default=None,
                        help="specify output file to serialize trained classifier")
    parser.add_argument("-l", "--load", dest="infile", action="store", default=None,
                        help="specify input file to load trained classifier")
    parser.add_argument("-i", "--instances", dest="instances", action="store", default=None,
                        help="load preprocessed instances instead of data")
    parser.add_argument("-f", "--featurefile", dest="featfile", action="store", default=None,
                        help="serialize preprocessed instances")
    args = parser.parse_args()  # parse argument structure

    # begin running classifier
    try:
        print "Importing data ... "
        if args.instances:  # get serialized features
            instance_list = cPickle.load(open(args.instances, 'rb'))
            print "Done."
        else:  # create features from data
            data_list = import_data(args.datafile)
            print "Done.\nExtracting features ... "
            instance_list = []
            l = len(data_list)
            for i, (label, post) in enumerate(data_list):
                print "Featurizing string %d of %d ... " % (i, l)
                instance_list.append(Instance(label=label, data=featurize(post)))
            print "Done."
            if args.featfile:  # serialize instance_list
                with open(args.featfile, 'wb') as outf:
                    cPickle.dump(instance_list, outf)

        piv1 = int(.7 * len(instance_list))  # split training from test
        piv2 = int(.9 * len(instance_list))  # split test from dev
        training, test, dev = instance_list[:piv1], instance_list[piv1:piv2], instance_list[piv2:]

        if args.infile:  # load a previously trained classifier
            with open(args.infile, 'rb') as inf:
                me_classifier = MaxEnt.from_dict(cPickle.load(inf))
        else:  # create a new classifier
            exec('me_classifier = MaxEnt(%s)' % args.gpv)

        # experiment one
        if re.search(r'exp.*1', args.mode):
            if not args.infile:
                print "Training classifier ... "
                me_classifier.train(training)
                print "Done.\nTesting classification ... "
            if args.outfile:
                with open(args.outfile, 'wb') as outf:
                    cPickle.dump(me_classifier.to_dict(), outf)
            for data in [training, test]:
                test_classifier(me_classifier, data).print_out()

        # experiment two; run in batch as for i in {.05,...,numpy.Infinity} ...
        # run with -s $i.classifier
        elif re.search(r'exp.*2', args.mode):
            # for value in [.05, 0.1, .5, 1, 3, 5, 10, numpy.Infinity]:
            # for value in [10, numpy.Infinity]:
            #     me_classifier = MaxEnt(value)
            print "Training classifier with Gaussian prior variance %s ..." \
                % str(me_classifier.gaussian_prior_variance)
            me_classifier.train(training)
            print "Done. Testing classifier over dev set ..."
            test_classifier(me_classifier, dev).print_out()
            print "Done. Testing classifier over test set ..."
            test_classifier(me_classifier, test).print_out()
            print "Done.\n\n\n"

        # experiment three; run with -l 1.classifier
        elif re.search(r'exp.*3', args.mode):
            if not args.infile:
                print "Training Maximum Entropy classifier ... "
                me_classifier.train(training)
                print "Done."
            nb_classifier = NaiveBayes()
            print "Training Naive Bayes classifier ... "
            nb_classifier.train(training)
            print "Done.\nTesting Maximum Entropy over test set ... "
            test_classifier(me_classifier, test).print_out()
            print "Done.\nTesting Naive Bayes over test set ... "
            test_classifier(nb_classifier, test).print_out()

        if args.outfile:  # serialize trained classifier
            with open(args.outfile, 'wb') as outf:
                cPickle.dump(me_classifier.to_dict(), outf)

    except:  # something is wrong; show usage and re-raise
        parser.print_help()
        raise
for i in range(reps):
    print(">>>>iteration", i)
    reward, states_visited, steps = run_episode(env, valueFunction, n, False, EPSILON)
    # compute feature counts
    fcounts = compute_feature_counts(fMap, states_visited, discount, env)
    print("steps = ", steps)
    # print("feature count = ", fcounts)
    features.append(fcounts)

features = np.array(features)
# compute expected feature counts for demos
emp_feature_cnts = np.mean(features[skip_time:], axis=0)

ment = MaxEnt(solve_mdp, fMap, env, num_fcount_rollouts, discount)
maxent_value_fn = ment.get_opt_policy(emp_feature_cnts, learning_rate, num_steps)

# pickle the controller (value function)
# with open('mcar_maxent_policy_ss.pickle', 'wb') as f:
#     pickle.dump(maxent_value_fn, f, pickle.HIGHEST_PROTOCOL)
# with open('mcar_maxent_policy_ss.pickle', 'rb') as f:
#     vFunc = pickle.load(f)

# evaluate maxent learned policy
returns = evaluate_softmax_policy(env, eval_rollouts, maxent_value_fn)
print("average return", np.mean(returns))
for r in returns:
def test_reviews_words(self):
    """Classify sentiment using sentiment words"""
    train, dev, test = self.split_review_corpus(SentimentWords)
    classifier = MaxEnt()
    classifier.train(train, dev)
    self.assertGreater(accuracy(classifier, test), 0.55)
def test_reviews_bag(self):
    train, dev, test = self.split_review_corpus(BagOfWords)
    classifier = MaxEnt()
    classifier.train(train, dev)
    self.assertGreater(accuracy(classifier, test), 0.55)