def test_reviews_bag(self):
     """Classify sentiment using bag-of-words"""
     train, dev, test = self.split_review_corpus(BagOfWords)
     classifier = MaxEnt()
     classifier.train(train, dev)
     acc = accuracy(classifier, test)
     self.assertGreater(acc, 0.55)
 def test_names_nltk(self):
     """Classify names using NLTK features"""
     train, dev, test = self.split_names_corpus()
     classifier = MaxEnt()
     classifier.train(train, dev)
     acc = accuracy(classifier, test)
     self.assertGreater(acc, 0.70)
Example #3
 def test_names_nltk(self):
     """Classify names using NLTK features"""
     train, dev, test = self.split_names_corpus()
     classifier = MaxEnt()
     classifier.train(train, dev, 0.1)
     acc = accuracy(classifier, test)
     self.assertGreater(acc, 0.70)
	def test_sparse_vector(self):
		maxent = MaxEnt()
		_ = maxent.compute_observed_counts(self.training_set)
		woof_index = maxent.feature_alphabet.get_index('woof') 
		meow_index = maxent.feature_alphabet.get_index('meow') 
		sparse_vector = self.training_set[1].data
		self.assertEqual(meow_index, sparse_vector[0])
		self.assertEqual(woof_index, sparse_vector[1])
Example #5
def test_model():

    train, dev, test, labels = get_data()

    classifier = MaxEnt(train, labels)
    classifier.train(train, dev, 0.001, 100)

    print("Test result", accuracy(classifier, test))
 def test_reviews(self):
     """Classify sentiment using bag-of-words"""
     reviews = ReviewCorpus('yelp_reviews.json', document_class=Review, numLines=15000)
     train, dev, test = self.split_review_corpus(reviews)
     print 'train length = ', len(train), ' dev length = ', len(dev), ' test length = ', len(test)
     print 'number of features = ', len(train[0].features())
     classifier = MaxEnt()
     classifier.train(train, dev)
     self.assertGreater(accuracy(classifier, test), 0.55)
	def test_label_loglikelihood_score(self):
		maxent = MaxEnt()
		_ = maxent.compute_observed_counts(self.training_set)
		maxent.parameters = numpy.array([0.1, 0.5, 1, 0, 100, -0.1, -0.5, -1, 0, -100])
		score_vector = maxent.compute_label_unnormalized_loglikelihood_vector(self.training_set[0].data)
		self.assertEqual(score_vector[0], 1.6)
		self.assertEqual(score_vector[1], -1.6)
		posterior_distribution = maxent.compute_posterior_distribution(self.training_set[0])
		self.assertAlmostEqual(posterior_distribution[0], 0.96, 2)
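The expected values here follow from a flat parameter vector holding, for each label, a bias weight followed by one weight per feature: 10 parameters = 2 labels x (1 bias + 4 features). A minimal sketch of the score computation, assuming training_set[0] has the first two features (e.g. 'purr' and 'meow') active:

import numpy as np

params = np.array([0.1, 0.5, 1, 0, 100, -0.1, -0.5, -1, 0, -100])
per_label = params.reshape(2, 5)   # row 0 -> 'cat', row 1 -> 'dog'; column 0 is the bias
active = np.array([0, 1])          # assumed active feature indices for training_set[0]
scores = per_label[:, 0] + per_label[:, 1 + active].sum(axis=1)
print(scores)                      # [ 1.6 -1.6]
posterior = np.exp(scores) / np.exp(scores).sum()
print(round(posterior[0], 2))      # 0.96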
 def test_posterior(self):
     classifier = MaxEnt()
     classifier.labelsToWeights = {'cat': [2, -3], 'dog': [-2, 3]}
     self.assertAlmostEqual(classifier.posterior('cat', np.array([0, 1])), .00247, 4)
     self.assertAlmostEqual(classifier.posterior('cat', np.array([1, 0])), .98201, 4)
     self.assertAlmostEqual(classifier.posterior('dog', np.array([0, 1])), .99753, 4)
     self.assertAlmostEqual(classifier.posterior('dog', np.array([1, 0])), .01799, 4)
     self.assertAlmostEqual(classifier.posterior('cat', np.array([1, 1])), .11920, 4)
     self.assertAlmostEqual(classifier.posterior('dog', np.array([1, 1])), .88080, 4)
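The posteriors asserted above are a softmax over the per-label scores w_y·x. A self-contained sketch (the dict-of-weights shape mirrors labelsToWeights; the standalone helper is mine):

import numpy as np

def posterior(label, x, labels_to_weights):
    """Softmax posterior P(label | x) under a log-linear model."""
    scores = {y: float(np.dot(w, x)) for y, w in labels_to_weights.items()}
    z = sum(np.exp(s) for s in scores.values())
    return np.exp(scores[label]) / z

weights = {'cat': np.array([2, -3]), 'dog': np.array([-2, 3])}
print(posterior('cat', np.array([0, 1]), weights))  # ~0.00247
print(posterior('dog', np.array([1, 1]), weights))  # ~0.88080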
 def test_gradient(self):
     classifier = MaxEnt()
     classifier.labelsToWeights = {'cat': [2, -3], 'dog': [-2, 3]}
     dataset = ([Animal(np.array([1, 0]), 'cat'), Animal(np.array([0, 1]), 'cat'), Animal(np.array([0, 1]), 'dog'),
                 Animal(np.array([0, 1]), 'dog'), Animal(np.array([1, 1]), 'cat')])
     gradient = classifier.gradient(dataset)
     self.assertAlmostEqual(gradient['cat'][0], .89879, 4)
     self.assertAlmostEqual(gradient['cat'][1], 1.87339, 4)
     self.assertAlmostEqual(gradient['dog'][0], -.89879, 4)
     self.assertAlmostEqual(gradient['dog'][1], -1.87339, 4)
 def test_observed_counts(self):
     classifier = MaxEnt()
     classifier.labelsToWeights = {'cat': [2, -3], 'dog': [-2, 3]}
     dataset = ([Animal(np.array([1, 0]), 'cat'), Animal(np.array([0, 1]), 'cat'), Animal(np.array([0, 1]), 'dog'),
                 Animal(np.array([0, 1]), 'dog'), Animal(np.array([1, 1]), 'cat')])
     observedCounts = classifier.observedCounts(dataset)
     self.assertEqual(observedCounts['cat'][0], 2)
     self.assertEqual(observedCounts['cat'][1], 2)
     self.assertEqual(observedCounts['dog'][0], 0)
     self.assertEqual(observedCounts['dog'][1], 2)
	def test_prediction(self):
		maxent = MaxEnt()
		maxent.gaussian_prior_variance = numpy.Infinity
		maxent.train(self.training_set)
		predictions = [maxent.classify_instance(x) for x in self.training_set]
		self.assertEqual(predictions[0], 'cat')
		self.assertEqual(predictions[1], 'dog')
		self.assertEqual(predictions[2], 'cat')
		self.assertEqual(predictions[3], 'dog')
		self.assertEqual(predictions[4], 'dog')
 def test_expected_counts(self):
     classifier = MaxEnt()
     classifier.labelsToWeights = {'cat': [2, -3], 'dog': [-2, 3]}
     dataset = ([Animal(np.array([1, 0]), 'cat'), Animal(np.array([0, 1]), 'cat'), Animal(np.array([0, 1]), 'dog'),
                 Animal(np.array([0, 1]), 'dog'), Animal(np.array([1, 1]), 'cat')])
     expectedCounts = classifier.expectedModelCounts(dataset)
     self.assertAlmostEqual(expectedCounts['cat'][0], 1.10121, 4)
     self.assertAlmostEqual(expectedCounts['cat'][1], .12661, 4)
     self.assertAlmostEqual(expectedCounts['dog'][0], .89879, 4)
     self.assertAlmostEqual(expectedCounts['dog'][1], 3.87339, 4)
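test_observed_counts, test_expected_counts, and test_gradient fit together: for each label, the log-likelihood gradient is observed feature counts minus the counts the model expects, so gradient['cat'] above is exactly [2, 2] - [1.10121, 0.12661]. A sketch of both count computations (the standalone functions and the .data/.label attribute names are assumptions based on how the tests build Animal):

import numpy as np

def observed_counts(dataset, labels):
    # Add each instance's feature vector to its gold label's running total.
    counts = {y: np.zeros(2) for y in labels}
    for animal in dataset:
        counts[animal.label] += animal.data
    return counts

def expected_counts(dataset, weights):
    # Accumulate P(y | x) * x for every label: the model's expectation.
    counts = {y: np.zeros(2) for y in weights}
    for animal in dataset:
        scores = np.array([np.dot(weights[y], animal.data) for y in weights])
        probs = np.exp(scores) / np.exp(scores).sum()
        for p, y in zip(probs, weights):
            counts[y] += p * animal.data
    return counts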
 def test_save_load(self):
     reviews = ReviewCorpus('yelp_reviews.json', document_class=Review, numLines=5000)
     train, dev, test = self.split_review_corpus(reviews)
     classifier1 = MaxEnt()
     classifier1.train(train, dev)
     classifier1.save('trained.model')
     classifier2 = MaxEnt()
     classifier2.load('trained.model')
     for doc in reviews[:500]:
         self.assertEqual(classifier1.classify(doc), classifier2.classify(doc))
Example #14
    def test_reviews_bag(self):
        """Classify sentiment using bag-of-words"""
        train, dev, test, labels, features = self.split_review_corpus(Review)
        classifier = MaxEnt()
        classifier.train(train, labels, features, dev)
        classifier.save("mikes_model")

        class2 = MaxEnt()
        class2.load("mikes_model")
        self.assertGreater(accuracy(classifier, test), 0.55)
	def test_observed_counts(self):
		maxent = MaxEnt()
		observed_counts = maxent.compute_observed_counts(self.training_set)
		
		cat_index = maxent.label_alphabet.get_index('cat')
		purr_index = maxent.feature_alphabet.get_index('purr') 
		meow_index = maxent.feature_alphabet.get_index('meow') 
		bark_index = maxent.feature_alphabet.get_index('bark') 
		woof_index = maxent.feature_alphabet.get_index('woof') 
		
		cat_related_indices = maxent.get_parameter_indices([purr_index, meow_index, bark_index, woof_index], cat_index)
		cat_count, cat_purr_count, cat_meow_count, cat_bark_count, cat_woof_count = observed_counts[cat_related_indices]
		self.assertEqual(cat_count, 3)
		self.assertEqual(cat_purr_count, 2)
		self.assertEqual(cat_meow_count, 2)
		self.assertEqual(cat_bark_count, 0)
		self.assertEqual(cat_woof_count, 1)
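The indexing above implies one flat vector with a contiguous block per label: a bias slot first (cat_count == 3 is simply the number of 'cat' instances), then one slot per feature. Under that assumed layout, get_parameter_indices reduces to:

def get_parameter_indices(feature_indices, label_index, num_features=4):
    # Assumed flat layout: [bias, f0, f1, ..., f_{n-1}] per label block.
    base = label_index * (num_features + 1)
    return [base] + [base + 1 + f for f in feature_indices]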
Example #17
    def test_names_nltk(self):
        """Classify names using NLTK features"""
        train, dev, test = self.split_names_corpus()
        classifier = MaxEnt()
        classifier.labels = ['female', 'male']
        classifier.vocab = self.get_names_vocab(train)
        classifier.feature_vectors(train + dev + test)
        classifier.train(train, dev)

        acc = accuracy(classifier, test)
        self.assertGreater(acc, 0.70)
Example #18
    def test_reviews_bag(self):
        """Classify sentiment using bag-of-words"""
        train, dev, test = self.split_review_corpus(BagOfWords)
        classifier = MaxEnt()
        classifier.labels = ['positive', 'negative', 'neutral']
        classifier.vocab = self.get_review_vocab(train)
        print('...creating sparse feature vectors...')
        classifier.feature_vectors(train + dev + test)

        print('...training...')
        classifier.train(train, dev)
        self.assertGreater(accuracy(classifier, test), 0.55)
Example #20
# train_set,test_set = featuresets[:],testsets[:] 
# classifier = nltk.NaiveBayesClassifier.train(train_set) 
# print nltk.classify.accuracy(classifier, test_set)
# #print classifier.labels()
# CM = test_classifier(classifier,test_set)
# CM.print_out()

#----------------------------------  
#MaxEnt training
#ME = MaxEnt()
#ME.train(instance_list)
#ME.save("dependency_parsing_classifier.json")
#finish training
#----------------------------------
#testing: use the loaded ME classifier to decode each sentence and write the parses to parser.conll
ME = MaxEnt.load("dependency_parsing_classifier.json")
CM = test_classifier(ME,test_instance_list)
CM.print_out()
tranSys = TranSys(transition_codebook)
wfile = open('parser.conll','w')
for test_sentence in test_sentence_instances:
	new_sentence = tranSys.decode_parser(ME,test_sentence)
	for element in new_sentence:
		if element[0] != 0:
			#wfile.write('{0:<10}{1:<15}{2:<10}{3:<10}{4:<10}{5:<10}{6:<10}{7:<10}{8:<10}{9:<10}'.format(element[0],element[1],'_',element[2],element[2],'_',element[3],'_','_','_'))
			# 10-column CoNLL row: ID FORM LEMMA CPOSTAG POSTAG FEATS HEAD DEPREL PHEAD PDEPREL
			fields = [element[0], element[1], '_', element[2], element[2], '_', element[3], element[4], '_', '_']
			wfile.write('\t'.join(str(f) for f in fields))
			wfile.write("\n")
	wfile.write("\r\n")

wfile.close()
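Each row written above follows the 10-column CoNLL-X token format (ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, HEAD, DEPREL, PHEAD, PDEPREL); '_' fills the unused columns, HEAD and DEPREL come from the decoded parse, and a blank line separates sentences.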
Example #21
class Gender:
    vowels = ['a', 'e', 'i', 'o', 'u', 'y']

    def char_is(self, n, char):
        return lambda word: abs(n) < len(word) and word[n] == char

    def length(self, word):
        return len(word)

    def char_in(self, char):
        return lambda word: char in word

    def letter_vowel(self, char):
        return char in self.vowels

    def num_vowels(self, word):
        return len([c for c in word if self.letter_vowel(c)])

    def num_consonants(self, word):
        return len(word) - self.num_vowels(word)

    def vowel_ratio(self, word):
        return self.num_vowels(word)/float(len(word))

    def is_vowel(self, n):
        return lambda word: abs(n) < len(word) and word[n] in self.vowels

    def contains(self, c):
        return lambda word: c in word.lower()

    def is_capital(self, n):
        return lambda word: abs(n) < len(word) and word[n].isupper()

    def __init__(self):
        features = [self.length,
                    self.num_vowels,
                    self.num_consonants,
                    self.vowel_ratio,
                    self.is_capital(0)]

        for c1 in string.ascii_lowercase:
            features.append(self.contains(c1))
            for c2 in string.ascii_lowercase:
                features.append(self.contains(c1 + c2))

        for i in range(0, 15):
            for c in string.ascii_lowercase:
                features.append(self.char_is(i, c))
            features.append(self.is_vowel(i))

        for i in range(1, 5):
            for c in string.ascii_lowercase:
                features.append(self.char_is(-1*i, c))
            features.append(self.is_vowel(-1*i))

        self.classifier = MaxEnt(classes=["male", "female", "other"],
                                 features=features)
    def train(self, names):
        return self.classifier.train(names)

    def guess(self, word):
        if word in ['she', 'her']:
            return 'female'
        elif word in ['he', 'him']:
            return 'male'
        else:
            return self.classifier.predict(word)
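A minimal usage sketch for the class above (the labeled training list is hypothetical):

gender = Gender()
gender.train(labeled_names)   # hypothetical list of labeled name instances
print(gender.guess('her'))    # 'female', short-circuited by the pronoun check
print(gender.guess('Alex'))   # falls through to the MaxEnt classifier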
Example #22
 def test_names_nltk(self):
     train, dev, test = self.split_names_corpus()
     classifier = MaxEnt()
     classifier.train(train, dev)
     acc = accuracy(classifier, test)
     self.assertGreater(acc, 0.70)
 def test_reviews_bag(self):
     """Classify sentiment using bag-of-words"""
     train, dev, test = self.split_review_corpus(BagOfWords)
     classifier = MaxEnt()
     classifier.train(train, dev)
     self.assertGreater(accuracy(classifier, test), 0.55)
class BagOfWords(Document):
    def features(self):
        """Trivially tokenized words."""
        return "bagofwords"

class Name(Document):
    def features(self):
        name = self.data
        return ['First=%s' % name[0], 'Last=%s' % name[-1]]

class Bigram(Document):
    def features(self):
        # A different feature-generation mode, again signaled by a flag string.
        return "bigram"

classifier = MaxEnt()
instances = ReviewCorpus('yelp_reviews.json', document_class=BagOfWords)

# ##experiment 1
# print('experiment 1')
# y1 = []
# x = []
# lengths = [1000,10000,50000,100000,len(instances.documents)]
# for length in lengths:
#     score = classifier.train(instances, maxlength=length, batch_size=30, l2_value=0.1, dev_instances=None)
#     print("score:",score)
#     y1.append(score)
#     x.append(str(length))
#
# plt.plot(x,y1)
# for xy in zip(x, y1):
Example #25
        self.assertGreater(acc, 0.70)
    
    def split_review_corpus(self, document_class):
        """Split the yelp review corpus into training, dev, and test sets"""
        reviews = ReviewCorpus('yelp_reviews.json', document_class=document_class)
        seed(hash("reviews"))
        shuffle(reviews)
        return (reviews[:10000], reviews[10000:11000], reviews[11000:14000])
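One caveat with seed(hash("reviews")): Python 3 salts str hashes per process, so this "fixed" seed only reproduces the same split if PYTHONHASHSEED is pinned. Passing the string straight to seed avoids that, since random.seed hashes str arguments deterministically:

        seed("reviews")  # stable across runs, unlike hash("reviews")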

    def test_reviews_bag(self):
        """Classify sentiment using bag-of-words"""
        train, dev, test = self.split_review_corpus(BagOfWords)
        classifier = MaxEnt()
        classifier.train(train, dev)
        self.assertGreater(accuracy(classifier, test), 0.55)
if __name__ == '__main__':
    docs = Corpus('./gold_standard_all.txt', True)
    seed(time.time())
    shuffle(docs)

    first_80 = round(0.8 * len(docs))
    second_10 = first_80 + round(0.1 * len(docs))

    train, dev, test = (docs[:first_80], docs[first_80:second_10],
                        docs[second_10:])

    classifier = MaxEnt()
    classifier.train(train, dev)
    print(accuracy(classifier, test))
def main():
	parser = AP.ArgumentParser(description = "A command-line interface for " \
		"the maximum entropy classifier.")
	parser.add_argument("-d", "--datafile", action = "store", default = "blog-gender-dataset.txt", \
		help = "specify the input data file (default: ")
	parser.add_argument("-g", "--gaussian_prior", dest = "gpv", action = "store", \
		help = "specify the Gaussian prior variance")
	parser.add_argument("-m", "--mode", dest = "mode", action = "store", default = "train", \
		help = "run as train, train/ test, exp(eriment)1, exp(eriment)2, exp(eriment)3")
	parser.add_argument("-s", "--save", dest = "outfile", action = "store", default = None, \
		help = "specify output file to serialize trained classifier")
	parser.add_argument("-l", "--load", dest = "infile", action = "store", default = None, \
		help = "specify input file to load trained classifier")
	parser.add_argument("-i", "--instances", dest = "instances", action = "store", default = None, \
		help = "load preprocessed instances instead of data")
	parser.add_argument("-f", "--featurefile", dest = "featfile", action = "store", default = None, \
		help = "serialize preprocessed instances")	
	
	args = parser.parse_args() #parse argument structure
	
	#begin running classifier
	try:
		print "Importing data ... "
		if args.instances: #get serialized features
			instance_list = cPickle.load(open(args.instances, 'rb'))
			print "Done."
		else: #create features from data
			data_list = import_data(args.datafile)
			print "Done.\nExtracting features ... "
			instance_list = []
			l = len(data_list)
			for i, (label, post) in enumerate(data_list):
				print "Featurizing string %d of %d ... " % (i, l)
				instance_list.append(Instance(label = label, data = featurize(post)))
			print "Done."
		if args.featfile: #serialize instance_list
			with open(args.featfile, 'wb') as outf:
				cPickle.dump(instance_list, outf)
		piv1 = int(.7 * len(instance_list)) #split training from test
		piv2 = int(.9 * len(instance_list)) #split test from dev
		training, test, dev = instance_list[:piv1], instance_list[piv1:piv2], \
			instance_list[piv2:]
			
		if args.infile: #load a previously trained classifier
			with open(args.infile, 'rb') as inf:
				me_classifier = MaxEnt.from_dict(cPickle.load(inf))
		else: #create a new classifier
			gpv = eval(args.gpv) if args.gpv else None #may be an expression, e.g. numpy.Infinity
			me_classifier = MaxEnt(gpv)

		#experiment one
		if re.search(r'exp.*1', args.mode):
		
			if not args.infile:
				print "Training classifier ... "
				me_classifier.train(training)
				print "Done.\nTesting classification ... "
			if args.outfile:
				with open(args.outfile, 'wb') as outf:
					cPickle.dump(me_classifier.to_dict(), outf)
		
			for data in [training, test]:
				test_classifier(me_classifier, data).print_out()
				
		#experiment two; run in batch as for i in {.05,...,numpy.Infinity} ...
		#run with -s $i.classifier
		elif re.search(r'exp.*2', args.mode):
			#for value in [.05, 0.1, .5, 1, 3, 5, 10, numpy.Infinity]:
			#for value in [10, numpy.Infinity]:
			#me_classifier = MaxEnt(value)
			print "Training classifier with Gaussian prior variance %s ..." \
				% str(me_classifier.gaussian_prior_variance)
			me_classifier.train(training)
			print "Done. Testing classifier over dev set ..."
			test_classifier(me_classifier, dev).print_out()
			print "Done. Testing classifier over test set ..."
			test_classifier(me_classifier, test).print_out()
			print "Done.\n\n\n"
			
		#experiment three; run with -l 1.classifier
		elif re.search(r'exp.*3', args.mode):
			if not args.infile:
				print "Training Maximum Entropy classifier ... "
				me_classifier.train(training)
				print "Done."
			nb_classifier = NaiveBayes()
			print "Training Naive Bayes classifier ... "
			nb_classifier.train(training)
			print "Done.\nTesting Maximum Entropy over test set ... "
			test_classifier(me_classifier, test).print_out()
			print "Done.\nTesting Naive Bayes over test set ... "
			test_classifier(nb_classifier, test).print_out()
			
		if args.outfile: #serialize trained classifier
			with open(args.outfile, 'wb') as outf:
				cPickle.dump(me_classifier.to_dict(), outf)

	except: #something is WROOOONG
		parser.print_help()
		raise
    for i in range(reps):
        print(">>>>iteration", i)

        reward, states_visited, steps = run_episode(env, valueFunction, n,
                                                    False, EPSILON)
        #compute feature counts
        fcounts = compute_feature_counts(fMap, states_visited, discount, env)
        print("steps = ", steps)
        #print("feature count = ", fcounts)
        features.append(fcounts)

    features = np.array(features)

    #compute expected feature counts for demos
    emp_feature_cnts = np.mean(features[skip_time:], axis=0)
    ment = MaxEnt(solve_mdp, fMap, env, num_fcount_rollouts, discount)
    maxent_value_fn = ment.get_opt_policy(emp_feature_cnts, learning_rate,
                                          num_steps)

    #pickle the controller (value function)
    #with open('mcar_maxent_policy_ss.pickle', 'wb') as f:
    #    pickle.dump(maxent_value_fn, f, pickle.HIGHEST_PROTOCOL)

    #with open('mcar_maxent_policy_ss.pickle', 'rb') as f:
    #    vFunc = pickle.load(f)

    #evaluate maxent learned policy
    returns = evaluate_softmax_policy(env, eval_rollouts, maxent_value_fn)
    print("average return", np.mean(returns))

    for r in returns:
        print(r)  # assumed: the truncated source printed each episode's return
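(Unlike the classifier examples, this fragment is MaxEnt inverse RL: emp_feature_cnts are the demonstrations' empirical expected feature counts, and get_opt_policy gradient-steps reward weights until the learned policy's expected counts match them, the standard MaxEnt IRL objective.)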
Example #28
 def test_reviews_words(self):
     """Classify sentiment using sentiment words"""
     train, dev, test = self.split_review_corpus(SentimentWords)
     classifier = MaxEnt()
     classifier.train(train, dev)
     self.assertGreater(accuracy(classifier, test), 0.55)
Example #29
 def test_reviews_bag(self):
     train, dev, test = self.split_review_corpus(BagOfWords)
     classifier = MaxEnt()
     classifier.train(train, dev)
     self.assertGreater(accuracy(classifier, test), 0.55)