Exemplo n.º 1
0
#   =================================================================
#
#   Last update of any of the components of this module:
#
#   March 25, 2008.
#
#   Users are encouraged to download periodically updated versions of
#   this code at the TANGO home page:
#
#   www.ime.usp.br/~egbirgin/tango/
#
#   *****************************************************************
#   *****************************************************************

#   Load the problem definition file

from toyprob import *

#   Import the Python module

import algencan

#   Optional solver settings.  'iprint' is the only one changed here
#   (presumably the solver's print level -- confirm in the ALGENCAN docs).

algencan.param['iprint'] = 10

#   Run the solver, handing it the twelve problem callbacks in the
#   positional order used throughout this file (presumably supplied by the
#   toyprob star-import above).

algencan.solvers(
    evalf,
    evalg,
    evalh,
    evalc,
    evaljac,
    evalhc,
    evalfc,
    evalgjac,
    evalhl,
    evalhlp,
    inip,
    endp
)
def main() :
	global fractional_counts_language ,fractional_counts_channel,probabilities_channel,probabilities_language,sigma,alpha,current_fractional_counts,current_optimization_params,init_option,initial_parameter_args,constraint_tags_dict,parameter_counter,current_optimization_tag

	num_iterations = int(sys.argv[1])
	alpha = float(sys.argv[2])
	sigma = float(sys.argv[3])

	dictionary = emMethods.createDictionary('complete.dict.new-formatted')#.small')
	word_lines= emMethods.readWordLines('test.words.new-formatted')

	#word_list_five = emMethods.readWordList('TEXT.3.linear')
	gold_tag_sequence = emMethods.readWordList('test.tags.new-formatted.linear')

	free_parameters_channel = {}
	free_parameters_language = {}
	print 'starting to create parameters'
	total_language_parameters = 0
	total_channel_parameters = 0
	for line in word_lines :
		#print 'created parameters for a line'
		(language_parameters,channel_parameters) = emMethods.getFreeParametersBigram(line,dictionary,free_parameters_language,free_parameters_channel)
		#print language_parameters
		#print channel_parameters

		total_language_parameters += language_parameters
		total_channel_parameters += channel_parameters
	print 'total language parameters is %d' %(total_language_parameters)
	print 'total channel parameters is %d' %(total_channel_parameters)
	#now, we will build all the lattices, and create a special start node and end node for every sentence
	start_node_end_node_list = []
	print 'constraint_lengths are being printed'
	for tag in free_parameters_language.keys() :
		print len(free_parameters_language[tag].keys())
	#raw_input()
	sys.exit()
#	print len(word_list)
#	num_taggings = emMethods.getNumTaggings(word_list,dictionary)
#	print 'num_taggings '
#	print type(num_taggings)
	#print num_taggings
	fractional_counts_language = copy.deepcopy(free_parameters_language)
	fractional_counts_channel = copy.deepcopy(free_parameters_channel)
	probabilities_channel = copy.deepcopy(free_parameters_channel)
	probabilities_language = copy.deepcopy(free_parameters_language)
	emMethods.initUniformProbs(probabilities_channel,probabilities_language)
#	emMethods.initUniformProbs(probabilities_language,probabilities_channel)
	emMethods.writeFsa('tagging.fsa',probabilities_language)
	emMethods.writeFst('tagging.fst',probabilities_channel)


	run_training = r'./carmel.static --train-cascade -M 0 -m -HJ test.words.new-formatted.training tagging.fsa tagging.fst'
#		skel_size += len(col)
	#running the EM iterations
	#we are creating the indexes for algencan . Notice that here, the probabilities language is already uniform and therefore none of them will be zero
	createParametersForScaling(probabilities_language)
	for i in range (0,num_iterations) :
		'''
		print 'checking the initial zeros inlanguage'
		checkZeros(probabilities_language)
	
		print 'checking the initial zeros in channel'
		checkZeros(probabilities_channel)
		'''
		#best_tag_sequence = emMethods.viterbiSearch(start,end,probabilities_channel,probabilities_language,lattice_skeleton)
		#emMethods.calcAccuracy(gold_tag_sequence,best_tag_sequence)
		#raw_input()
		#this will create the parameters
		total_corpus_probability = 0.0
		(status,output) = commands.getstatusoutput(run_training)	
		print 'we just ran the training'
		prob_match = probability_re.search(output)
		if prob_match == None :
			print'we should have found a probability'
		else :
			print 'the probability is %s'%prob_match.group(1)
		total_corpus_probability = float(prob_match.group(1)[2:len(prob_match.group(1))])
		print 'reading language fractional counts'
		emMethods.readCarmelFractionalCounts('tagging.fsa.trained',fractional_counts_language,'bigram')
		print 'read the fsa'
		print 'reading channel fractional counts'
		emMethods.readCarmelFractionalCounts('tagging.fst.trained',fractional_counts_channel,'channel')
		print 'read the fst'
			
		print' the probability of the corpus was %f' %total_corpus_probability
		print 'we are now checking the accuracies'
		noe_command = 'cat tagging.fsa | sed \'s/*e*//g\' > tagging.fsa.noe'
		(status,output) = commands.getstatusoutput(noe_command)
		print 'we wrote the noe fsa'
		viterbi_command = r'cat test.words.new-formatted.quotes | ./carmel.static -srbk -QEWI 1 tagging.fsa.noe tagging.fst > tagging_output'
		(status,output) = commands.getstatusoutput(viterbi_command)
		tagged_sequence = emMethods.readTaggingOutput('tagging_output')	
		accuracy = emMethods.calcAccuracy(gold_tag_sequence,tagged_sequence)
		print 'The accuracy was %s and the objective function value was %s'%(str(accuracy),str(evaluateObjectiveFuncValue(total_corpus_probability,probabilities_language,probabilities_channel,alpha,sigma)))

		#emMethods.reEstimateProbabilities(probabilities_channel,probabilities_language,fractional_counts_channel,fractional_counts_language)
		#fractional_counts_language = copy.deepcopy(free_parameters_language)
		#fractional_counts_channel = copy.deepcopy(free_parameters_channel)

		#first optimizing the tag bigrams and doing it per parameter
		for tag in initial_parameter_args.keys() :
			if len(initial_parameter_args[tag].keys()) == 1 :
				continue
			#current_initial_parameter_args = initial_parameter_args[tag]
			current_optimization_tag = tag
			parameter_counter = len(initial_parameter_args[tag].keys())			
			current_fractional_counts = fractional_counts_language
			constraint_tags_dict[1] = tag
			temp_language_probs = dict(probabilities_language)
			#optimizing per constraint
			init_option = 'current_prob'
			current_optimization_params = 'tag_bigrams'
			algencan.solvers(evalf,evalg,evalh,evalc,evaljac,evalhc,evalfc,evalgjac,evalhl,evalhlp,inip,endp)
		
			
			
			#I should check if the objective function is increasing 
			language_probs_after_init_current_prob = copy.deepcopy(probabilities_language)
			#obj_val1 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_language,probabilities_channel,alpha,sigma)
			total_obj_val1 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_language,probabilities_channel,alpha,sigma)
	
			obj_val1 = evaluateOptimizationFunction(initial_parameter_args,probabilities_language,fractional_counts_language,alpha,sigma) 
			print 'the function value was obj 1 %f'%obj_val1
   			#emMethods.clearAlphaBeta(lattice_skeleton)
	
	
			init_option = 'zeros'
			current_optimization_params = 'tag_bigrams'
			algencan.solvers(evalf,evalg,evalh,evalc,evaljac,evalhc,evalfc,evalgjac,evalhl,evalhlp,inip,endp)
		
			
	
			language_probs_after_init_zeros = copy.deepcopy(probabilities_language)
			#obj_val2 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_language,probabilities_channel,alpha,sigma)a
			
			total_obj_val2 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_language,probabilities_channel,alpha,sigma)
			obj_val2 = evaluateOptimizationFunction(initial_parameter_args,probabilities_language,fractional_counts_language,alpha,sigma) 
			print 'the function value was obj 2 %f'%obj_val2
   			#emMethods.clearAlphaBeta(lattice_skeleton)
	
			if (total_obj_val1 >= total_obj_val2) :
				#init_option = 'current_prob'
				#current_optimization_params = 'tag_bigrams'
				if (obj_val1 < obj_val2) :
					print 'the final objective function value was opposite'
	
				probabilities_language = copy.deepcopy(language_probs_after_init_current_prob)
				#algencan.solvers(evalf,evalg,evalh,evalc,evaljac,evalhc,evalfc,evalgjac,evalhl,evalhlp,inip,endp)
		
				print 'the final objective function value was obj 1 %f'%total_obj_val1
			
			else :
				#init_option = 'zeros'
				#current_optimization_params = 'tag_bigrams'
				probabilities_language = copy.deepcopy(language_probs_after_init_zeros)
				#algencan.solvers(evalf,evalg,evalh,evalc,evaljac,evalhc,evalfc,evalgjac,evalhl,evalhlp,inip,endp)
		
				if (obj_val2 < obj_val1) :
					print 'the final objective function value was opposite'
	
				print 'the final objective function value was obj 2 %f'%total_obj_val2
				#raw_input()

#		emMethods.reEstimateProbabilities(probabilities_channel,probabilities_language,fractional_counts_channel,fractional_counts_language)
		emMethods.reEstimateProbabilities(probabilities_channel,probabilities_language,fractional_counts_channel,fractional_counts_language)
		print 'writing the fsa'
		#now writing the fsa back again
		emMethods.writeFsa('tagging.fsa',probabilities_language)
		emMethods.writeFst('tagging.fst',probabilities_channel)
		'''
		noe_command = 'cat tagging.fsa | sed \'s/*e*//g\' > tagging.fsa.noe'
		(status,output) = commands.getstatusoutput(noe_command)
		print 'we wrote the noe fsa'
		viterbi_command = r'cat test.words.new-formatted.quotes | ./carmel.static -srbk -QEWI 1 tagging.fsa.noe tagging.fst > tagging_output'
		(status,output) = commands.getstatusoutput(viterbi_command)
		tagged_sequence = emMethods.readTaggingOutput('tagging_output')	
		emMethods.calcAccuracy(gold_tag_sequence,tagged_sequence)
		'''
		print 'checking the zeros in tag bigram model'
		checkZeros(probabilities_language)
	
		print 'checking the initial zeros in channel model'
		checkZeros(probabilities_channel)

		#fractional_counts_language.clear()
		#fractional_counts_channel.clear()
		fractional_counts_language = copy.deepcopy(free_parameters_language)
		fractional_counts_channel = copy.deepcopy(free_parameters_channel)
		#probabilities_language = copy.deepcopy(free_parameters_language)
		probabilities_channel = copy.deepcopy(free_parameters_channel)
Exemplo n.º 3
0
#   Module: Main program
#   =================================================================
#
#   Last update of any of the components of this module:
#
#   March 25, 2008.
#
#   Users are encouraged to download periodically updated versions of
#   this code at the TANGO home page:
#
#   www.ime.usp.br/~egbirgin/tango/
#
#   *****************************************************************
#   *****************************************************************

#   Load the problem definition file

from toyprob import *

#   Import the Python module

import algencan

#   Optional solver settings.  "iprint" is the only one changed here
#   (presumably the solver's print level -- confirm in the ALGENCAN docs).

algencan.param["iprint"] = 10

#   Run the solver on the twelve problem callbacks, in the same positional
#   order as every other algencan.solvers call in this file.

algencan.solvers(
    evalf, evalg, evalh, evalc,
    evaljac, evalhc, evalfc, evalgjac,
    evalhl, evalhlp, inip, endp
)
def main() :
	global fractional_counts_language ,fractional_counts_channel,probabilities_channel,sigma,alpha,current_fractional_counts,current_optimization_params,init_option

	num_iterations = int(sys.argv[1])
	alpha = float(sys.argv[2])
	sigma = float(sys.argv[3])

#	dictionary = emMethods.createDictionary('DICT2')#.small')
#	word_list = emMethods.readWordList('TEXT.linear')
	#word_list_five = emMethods.readWordList('TEXT.5.linear')
#	gold_tag_sequence = emMethods.readWordList('GOLD.linear')
	gold_cipher = emMethods.readCipherFile('cipher.gold.noq')
	print gold_cipher
	#dictionary = emMethods.createDictionary('complete.dict.new-formatted')#.small')
	#word_lines= emMethods.readWordLines('test.words.new-formatted')
	cipher_letter_dict = emMethods.getUniqCipherLetters('cipher.data.noq')
	#word_list_five = emMethods.readWordList('TEXT.3.linear')
	#plaintext = map(chr, range(97, 123))
	plaintext = []
	for k in range(65, 91):
		plaintext.append(chr(k))
	print plaintext
	print 'the number of unique cipher letter is %d'%len(cipher_letter_dict.keys())
	print cipher_letter_dict
	#gold_tag_sequence = emMethods.readWordList('test.tags.new-formatted.linear')
 		
	free_parameters_channel = {}
	free_parameters_language = {}
	print 'starting to create parameters'
	total_language_parameters = 0
	total_channel_parameters = 0
	#for line in cipher_lines :
		#print 'created parameters for a line'
		#(language_parameters,channel_parameters) = emMethods.getFreeParametersBigram(line,dictionary,free_parameters_language,free_parameters_channel)
	emMethods.getFreeCipherParametersChannel(cipher_letter_dict,plaintext,free_parameters_channel)
	temp = {'_':0.0}
	free_parameters_channel['_'] = temp
	#print free_parameters_channel
	#sys.exit()
		#print language_parameters
		#print channel_parameters

		#total_language_parameters += language_parameters
		#total_channel_parameters += channel_parameters
	#print 'total language parameters is %d' %(total_language_parameters)
	#print 'total channel parameters is %d' %(total_channel_parameters)
	#now, we will build all the lattices, and create a special start node and end node for every sentence
	start_node_end_node_list = []

#	print len(word_list)
#	num_taggings = emMethods.getNumTaggings(word_list,dictionary)
#	print 'num_taggings '
#	print type(num_taggings)
	#print num_taggings
	fractional_counts_channel = copy.deepcopy(free_parameters_channel)
	probabilities_channel = copy.deepcopy(free_parameters_channel)
	emMethods.initUniformProbs(probabilities_channel)
#	emMethods.initUniformProbs(probabilities_language,probabilities_channel)
	emMethods.writeFst('cipher.fst',probabilities_channel)


	run_training = r'./carmel.static --train-cascade -M 0 -m -HJ cipher.data cipher.wfsa cipher.fst'
#		skel_size += len(col)
	#running the EM iterations
	for i in range (0,num_iterations) :
		'''
		print 'checking the initial zeros inlanguage'
		checkZeros(probabilities_language)
	
		print 'checking the initial zeros in channel'
		checkZeros(probabilities_channel)
		'''
		#best_tag_sequence = emMethods.viterbiSearch(start,end,probabilities_channel,probabilities_language,lattice_skeleton)
		#emMethods.calcAccuracy(gold_tag_sequence,best_tag_sequence)
		#raw_input()
		#this will create the parameters
		total_corpus_probability = 0.0
		(status,output) = commands.getstatusoutput(run_training)	
		print 'we just ran the training'
		prob_match = probability_re.search(output)
		if prob_match == None :
			print'we should have found a probability'
		else :
			print 'the probability is %s'%prob_match.group(1)
		total_corpus_probability = float(prob_match.group(1)[2:len(prob_match.group(1))])
		print 'reading channel fractional counts'
		emMethods.readCarmelFractionalCounts('cipher.fst.trained',fractional_counts_channel,'channel')
		print 'read the fst'
			
		print' the probability of the corpus was %f' %total_corpus_probability
		print 'we are now checking the accuracies'
		noe_command = 'cat tagging.fsa | sed \'s/*e*//g\' > tagging.fsa.noe'
		(status,output) = commands.getstatusoutput(noe_command)
		print 'we wrote the noe fsa'
		viterbi_command = r'cat cipher.data | ./carmel.static -srbk -QEWI 1 cipher.wfsa.noe cipher.fst > decipherment_output'
		(status,output) = commands.getstatusoutput(viterbi_command)
		#tagged_sequence = emMethods.readTaggingOutput('tagging_output')	
		deciphered_sequence = emMethods.readCipherFile('decipherment_output')
		accuracy = emMethods.calcAccuracy(gold_cipher,deciphered_sequence)

		print 'The accuracy was %s and the objective function value was %s'%(str(accuracy),str(evaluateObjectiveFuncValue(total_corpus_probability,probabilities_channel,alpha,sigma)))


		#first optimizing the channel
		current_fractional_counts = fractional_counts_channel
		createParameters(probabilities_channel,current_fractional_counts,free_parameters_channel,alpha,sigma)
		temp_channel_probs = dict(probabilities_channel)	
		init_option = 'current_prob'
		current_optimization_params = 'channel'
		algencan.solvers(evalf,evalg,evalh,evalc,evaljac,evalhc,evalfc,evalgjac,evalhl,evalhlp,inip,endp)
	
		
		
		#I should check if the objective function is increasing 
		channel_probs_after_init_current_prob = copy.deepcopy(probabilities_channel)
		#obj_val1 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_language,probabilities_channel,alpha,sigma)
		total_obj_val1 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_channel,alpha,sigma)

		obj_val1 = evaluateOptimizationFunction(initial_parameter_args,probabilities_channel,fractional_counts_channel,alpha,sigma) 
		print 'the function value was obj 1 %f'%obj_val1
   		#emMethods.clearAlphaBeta(lattice_skeleton)


		init_option = 'zeros'
		current_optimization_params = 'channel'
		algencan.solvers(evalf,evalg,evalh,evalc,evaljac,evalhc,evalfc,evalgjac,evalhl,evalhlp,inip,endp)
	
		channel_probs_after_init_zeros = copy.deepcopy(probabilities_channel)
		#obj_val2 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_language,probabilities_channel,alpha,sigma)a
		
		total_obj_val2 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_channel,alpha,sigma)
		obj_val2 = evaluateOptimizationFunction(initial_parameter_args,probabilities_channel,fractional_counts_channel,alpha,sigma) 
		print 'the function value was obj 2 %f'%obj_val2
   		#emMethods.clearAlphaBeta(lattice_skeleton)

		if (total_obj_val1 >= total_obj_val2) :
			#init_option = 'current_prob'
			#current_optimization_params = 'tag_bigrams'
			if (obj_val1 < obj_val2) :
				print 'the final objective function value was opposite'

			probabilities_channel = copy.deepcopy(channel_probs_after_init_current_prob)
			#algencan.solvers(evalf,evalg,evalh,evalc,evaljac,evalhc,evalfc,evalgjac,evalhl,evalhlp,inip,endp)
	

			print 'the final objective function value was obj 1 %f'%total_obj_val1
		
		else :
			#init_option = 'zeros'
			#current_optimization_params = 'tag_bigrams'
			probabilities_channel = copy.deepcopy(channel_probs_after_init_zeros)
	
	
			if (obj_val2 < obj_val1) :
				print 'the final objective function value was opposite'

			print 'the final objective function value was obj 2 %f'%total_obj_val2
			#raw_input()

#		emMethods.reEstimateProbabilities(probabilities_channel,probabilities_language,fractional_counts_channel,fractional_counts_language)
		#emMethods.reEstimateProbabilities(probabilities_channel,probabilities_language,fractional_counts_channel,fractional_counts_language)
		#print 'writing the fsa'
		#now writing the fsa back again
		#emMethods.writeFsa('tagging.fsa',probabilities_language)
		emMethods.writeFst('cipher.fst',probabilities_channel)
		#print 'checking the zeros in tag bigram model'
		#checkZeros(probabilities_language)
	
		print 'checking the zeros in channel model'
		checkZeros(probabilities_channel)

		#fractional_counts_language.clear()
		#fractional_counts_channel.clear()
		#fractional_counts_language = copy.deepcopy(free_parameters_language)
		fractional_counts_channel = copy.deepcopy(free_parameters_channel)