# ================================================================= # # Last update of any of the components of this module: # # March 25, 2008. # # Users are encouraged to download periodically updated versions of # this code at the TANGO home page: # # www.ime.usp.br/~egbirgin/tango/ # # ***************************************************************** # ***************************************************************** # Load the problem definition file from toyprob import * # Import the Python module import algencan # Set some optional params algencan.param['iprint'] = 10 # Call the solver algencan.solvers(evalf,evalg,evalh,evalc,evaljac,evalhc,evalfc,evalgjac,evalhl, evalhlp,inip,endp)
def main() :
    """Driver: EM + ALGENCAN-penalized training of a bigram HMM tagger.

    Command line: <num_iterations> <alpha> <sigma>

    Builds the free language (tag-bigram) and channel (tag->word)
    parameter dictionaries from the dictionary/corpus files, then
    alternates carmel EM steps with per-tag ALGENCAN re-optimization of
    the bigram probabilities, reporting tagging accuracy against the
    gold tag sequence each iteration.

    Communicates with the toyprob/algencan callbacks through the module
    globals declared below (alpha/sigma penalty weights, current counts,
    optimization mode flags, etc.).
    """
    global fractional_counts_language ,fractional_counts_channel,probabilities_channel,probabilities_language,sigma,alpha,current_fractional_counts,current_optimization_params,init_option,initial_parameter_args,constraint_tags_dict,parameter_counter,current_optimization_tag
    # Hyper-parameters come straight from the command line.
    num_iterations = int(sys.argv[1])
    alpha = float(sys.argv[2])
    sigma = float(sys.argv[3])
    dictionary = emMethods.createDictionary('complete.dict.new-formatted')#.small')
    word_lines= emMethods.readWordLines('test.words.new-formatted')
    #word_list_five = emMethods.readWordList('TEXT.3.linear')
    gold_tag_sequence = emMethods.readWordList('test.tags.new-formatted.linear')
    free_parameters_channel = {}
    free_parameters_language = {}
    print 'starting to create parameters'
    total_language_parameters = 0
    total_channel_parameters = 0
    # Accumulate the free-parameter structure (and per-line counts) over
    # the whole corpus; the dicts are filled in-place by the helper.
    for line in word_lines :
        #print 'created parameters for a line'
        (language_parameters,channel_parameters) = emMethods.getFreeParametersBigram(line,dictionary,free_parameters_language,free_parameters_channel)
        #print language_parameters
        #print channel_parameters
        total_language_parameters += language_parameters
        total_channel_parameters += channel_parameters
    print 'total language parameters is %d' %(total_language_parameters)
    print 'total channel parameters is %d' %(total_channel_parameters)
    #now, we will build all the lattices, and create a special start node and end node for every sentence
    start_node_end_node_list = []
    print 'constraint_lengths are being printed'
    for tag in free_parameters_language.keys() :
        print len(free_parameters_language[tag].keys())
    #raw_input()
    # NOTE(review): this unconditional exit makes everything below in this
    # function unreachable -- it looks like a debugging stop (print the
    # per-tag constraint sizes and quit). Confirm intent before removing.
    sys.exit()
    # print len(word_list)
    # num_taggings = emMethods.getNumTaggings(word_list,dictionary)
    # print 'num_taggings '
    # print type(num_taggings)
    #print num_taggings
    # Fractional counts and probabilities share the shape of the
    # free-parameter dicts; deep copies keep them independent.
    fractional_counts_language = copy.deepcopy(free_parameters_language)
    fractional_counts_channel = copy.deepcopy(free_parameters_channel)
    probabilities_channel = copy.deepcopy(free_parameters_channel)
    probabilities_language = copy.deepcopy(free_parameters_language)
    emMethods.initUniformProbs(probabilities_channel,probabilities_language)
    # emMethods.initUniformProbs(probabilities_language,probabilities_channel)
    # Serialize the models for carmel: FSA = tag bigrams, FST = channel.
    emMethods.writeFsa('tagging.fsa',probabilities_language)
    emMethods.writeFst('tagging.fst',probabilities_channel)
    run_training = r'./carmel.static --train-cascade -M 0 -m -HJ test.words.new-formatted.training tagging.fsa tagging.fst'
    # skel_size += len(col)
    #running the EM iterations
    #we are creating the indexes for algencan . Notice that here, the probabilities language is already uniform and therefore none of them will be zero
    createParametersForScaling(probabilities_language)
    for i in range (0,num_iterations) :
        '''
        print 'checking the initial zeros inlanguage'
        checkZeros(probabilities_language)
        print 'checking the initial zeros in channel'
        checkZeros(probabilities_channel)
        '''
        #best_tag_sequence = emMethods.viterbiSearch(start,end,probabilities_channel,probabilities_language,lattice_skeleton)
        #emMethods.calcAccuracy(gold_tag_sequence,best_tag_sequence)
        #raw_input()
        #this will create the parameters
        total_corpus_probability = 0.0
        # One EM step via the external carmel binary.
        (status,output) = commands.getstatusoutput(run_training)
        print 'we just ran the training'
        # Scrape the corpus probability out of carmel's stdout.
        prob_match = probability_re.search(output)
        if prob_match == None :
            print 'we should have found a probability'
        else :
            print 'the probability is %s'%prob_match.group(1)
            # Drops the first two characters of the match -- presumably a
            # prefix in carmel's probability format; confirm against output.
            total_corpus_probability = float(prob_match.group(1)[2:len(prob_match.group(1))])
        print 'reading language fractional counts'
        emMethods.readCarmelFractionalCounts('tagging.fsa.trained',fractional_counts_language,'bigram')
        print 'read the fsa'
        print 'reading channel fractional counts'
        emMethods.readCarmelFractionalCounts('tagging.fst.trained',fractional_counts_channel,'channel')
        print 'read the fst'
        print ' the probability of the corpus was %f' %total_corpus_probability
        print 'we are now checking the accuracies'
        # Strip epsilon symbols so the FSA can be used for viterbi decoding.
        noe_command = 'cat tagging.fsa | sed \'s/*e*//g\' > tagging.fsa.noe'
        (status,output) = commands.getstatusoutput(noe_command)
        print 'we wrote the noe fsa'
        viterbi_command = r'cat test.words.new-formatted.quotes | ./carmel.static -srbk -QEWI 1 tagging.fsa.noe tagging.fst > tagging_output'
        (status,output) = commands.getstatusoutput(viterbi_command)
        tagged_sequence = emMethods.readTaggingOutput('tagging_output')
        accuracy = emMethods.calcAccuracy(gold_tag_sequence,tagged_sequence)
        print 'The accuracy was %s and the objective function value was %s'%(str(accuracy),str(evaluateObjectiveFuncValue(total_corpus_probability,probabilities_language,probabilities_channel,alpha,sigma)))
        #emMethods.reEstimateProbabilities(probabilities_channel,probabilities_language,fractional_counts_channel,fractional_counts_language)
        #fractional_counts_language = copy.deepcopy(free_parameters_language)
        #fractional_counts_channel = copy.deepcopy(free_parameters_channel)
        #first optimizing the tag bigrams and doing it per parameter
        for tag in initial_parameter_args.keys() :
            # A tag with a single successor has nothing to optimize.
            if len(initial_parameter_args[tag].keys()) == 1 :
                continue
            #current_initial_parameter_args = initial_parameter_args[tag]
            # The ALGENCAN callbacks read these globals to know which
            # block of parameters they are optimizing.
            current_optimization_tag = tag
            parameter_counter = len(initial_parameter_args[tag].keys())
            current_fractional_counts = fractional_counts_language
            constraint_tags_dict[1] = tag
            temp_language_probs = dict(probabilities_language)
            #optimizing per constraint
            # First solve: start from the current probabilities.
            init_option = 'current_prob'
            current_optimization_params = 'tag_bigrams'
            algencan.solvers(evalf,evalg,evalh,evalc,evaljac,evalhc,evalfc,evalgjac,evalhl,evalhlp,inip,endp)
            #I should check if the objective function is increasing
            language_probs_after_init_current_prob = copy.deepcopy(probabilities_language)
            #obj_val1 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_language,probabilities_channel,alpha,sigma)
            total_obj_val1 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_language,probabilities_channel,alpha,sigma)
            obj_val1 = evaluateOptimizationFunction(initial_parameter_args,probabilities_language,fractional_counts_language,alpha,sigma)
            print 'the function value was obj 1 %f'%obj_val1
            #emMethods.clearAlphaBeta(lattice_skeleton)
            # Second solve: start from zeros.
            init_option = 'zeros'
            current_optimization_params = 'tag_bigrams'
            algencan.solvers(evalf,evalg,evalh,evalc,evaljac,evalhc,evalfc,evalgjac,evalhl,evalhlp,inip,endp)
            language_probs_after_init_zeros = copy.deepcopy(probabilities_language)
            #obj_val2 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_language,probabilities_channel,alpha,sigma)a
            total_obj_val2 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_language,probabilities_channel,alpha,sigma)
            obj_val2 = evaluateOptimizationFunction(initial_parameter_args,probabilities_language,fractional_counts_language,alpha,sigma)
            print 'the function value was obj 2 %f'%obj_val2
            #emMethods.clearAlphaBeta(lattice_skeleton)
            # Keep whichever initialization achieved the better TOTAL
            # objective; the per-block objective only triggers a warning
            # when it disagrees ("opposite").
            if (total_obj_val1 >= total_obj_val2) :
                #init_option = 'current_prob'
                #current_optimization_params = 'tag_bigrams'
                if (obj_val1 < obj_val2) :
                    print 'the final objective function value was opposite'
                probabilities_language = copy.deepcopy(language_probs_after_init_current_prob)
                #algencan.solvers(evalf,evalg,evalh,evalc,evaljac,evalhc,evalfc,evalgjac,evalhl,evalhlp,inip,endp)
                print 'the final objective function value was obj 1 %f'%total_obj_val1
            else :
                #init_option = 'zeros'
                #current_optimization_params = 'tag_bigrams'
                probabilities_language = copy.deepcopy(language_probs_after_init_zeros)
                #algencan.solvers(evalf,evalg,evalh,evalc,evaljac,evalhc,evalfc,evalgjac,evalhl,evalhlp,inip,endp)
                if (obj_val2 < obj_val1) :
                    print 'the final objective function value was opposite'
                print 'the final objective function value was obj 2 %f'%total_obj_val2
            #raw_input()
            # emMethods.reEstimateProbabilities(probabilities_channel,probabilities_language,fractional_counts_channel,fractional_counts_language)
        # Channel probabilities are re-estimated from counts; the language
        # probabilities were just set by the optimizer above.
        emMethods.reEstimateProbabilities(probabilities_channel,probabilities_language,fractional_counts_channel,fractional_counts_language)
        print 'writing the fsa'
        #now writing the fsa back again
        emMethods.writeFsa('tagging.fsa',probabilities_language)
        emMethods.writeFst('tagging.fst',probabilities_channel)
        '''
        noe_command = 'cat tagging.fsa | sed \'s/*e*//g\' > tagging.fsa.noe'
        (status,output) = commands.getstatusoutput(noe_command)
        print 'we wrote the noe fsa'
        viterbi_command = r'cat test.words.new-formatted.quotes | ./carmel.static -srbk -QEWI 1 tagging.fsa.noe tagging.fst > tagging_output'
        (status,output) = commands.getstatusoutput(viterbi_command)
        tagged_sequence = emMethods.readTaggingOutput('tagging_output')
        emMethods.calcAccuracy(gold_tag_sequence,tagged_sequence)
        '''
        print 'checking the zeros in tag bigram model'
        checkZeros(probabilities_language)
        print 'checking the initial zeros in channel model'
        checkZeros(probabilities_channel)
        #fractional_counts_language.clear()
        #fractional_counts_channel.clear()
        # Reset counts (and the channel probabilities) for the next
        # iteration; the language probabilities carry over.
        fractional_counts_language = copy.deepcopy(free_parameters_language)
        fractional_counts_channel = copy.deepcopy(free_parameters_channel)
        #probabilities_language = copy.deepcopy(free_parameters_language)
        probabilities_channel = copy.deepcopy(free_parameters_channel)
# Module: Main program # ================================================================= # # Last update of any of the components of this module: # # March 25, 2008. # # Users are encouraged to download periodically updated versions of # this code at the TANGO home page: # # www.ime.usp.br/~egbirgin/tango/ # # ***************************************************************** # ***************************************************************** # Load the problem definition file from toyprob import * # Import the Python module import algencan # Set some optional params algencan.param["iprint"] = 10 # Call the solver algencan.solvers(evalf, evalg, evalh, evalc, evaljac, evalhc, evalfc, evalgjac, evalhl, evalhlp, inip, endp)
def main() : global fractional_counts_language ,fractional_counts_channel,probabilities_channel,sigma,alpha,current_fractional_counts,current_optimization_params,init_option num_iterations = int(sys.argv[1]) alpha = float(sys.argv[2]) sigma = float(sys.argv[3]) # dictionary = emMethods.createDictionary('DICT2')#.small') # word_list = emMethods.readWordList('TEXT.linear') #word_list_five = emMethods.readWordList('TEXT.5.linear') # gold_tag_sequence = emMethods.readWordList('GOLD.linear') gold_cipher = emMethods.readCipherFile('cipher.gold.noq') print gold_cipher #dictionary = emMethods.createDictionary('complete.dict.new-formatted')#.small') #word_lines= emMethods.readWordLines('test.words.new-formatted') cipher_letter_dict = emMethods.getUniqCipherLetters('cipher.data.noq') #word_list_five = emMethods.readWordList('TEXT.3.linear') #plaintext = map(chr, range(97, 123)) plaintext = [] for k in range(65, 91): plaintext.append(chr(k)) print plaintext print 'the number of unique cipher letter is %d'%len(cipher_letter_dict.keys()) print cipher_letter_dict #gold_tag_sequence = emMethods.readWordList('test.tags.new-formatted.linear') free_parameters_channel = {} free_parameters_language = {} print 'starting to create parameters' total_language_parameters = 0 total_channel_parameters = 0 #for line in cipher_lines : #print 'created parameters for a line' #(language_parameters,channel_parameters) = emMethods.getFreeParametersBigram(line,dictionary,free_parameters_language,free_parameters_channel) emMethods.getFreeCipherParametersChannel(cipher_letter_dict,plaintext,free_parameters_channel) temp = {'_':0.0} free_parameters_channel['_'] = temp #print free_parameters_channel #sys.exit() #print language_parameters #print channel_parameters #total_language_parameters += language_parameters #total_channel_parameters += channel_parameters #print 'total language parameters is %d' %(total_language_parameters) #print 'total channel parameters is %d' %(total_channel_parameters) #now, we 
will build all the lattices, and create a special start node and end node for every sentence start_node_end_node_list = [] # print len(word_list) # num_taggings = emMethods.getNumTaggings(word_list,dictionary) # print 'num_taggings ' # print type(num_taggings) #print num_taggings fractional_counts_channel = copy.deepcopy(free_parameters_channel) probabilities_channel = copy.deepcopy(free_parameters_channel) emMethods.initUniformProbs(probabilities_channel) # emMethods.initUniformProbs(probabilities_language,probabilities_channel) emMethods.writeFst('cipher.fst',probabilities_channel) run_training = r'./carmel.static --train-cascade -M 0 -m -HJ cipher.data cipher.wfsa cipher.fst' # skel_size += len(col) #running the EM iterations for i in range (0,num_iterations) : ''' print 'checking the initial zeros inlanguage' checkZeros(probabilities_language) print 'checking the initial zeros in channel' checkZeros(probabilities_channel) ''' #best_tag_sequence = emMethods.viterbiSearch(start,end,probabilities_channel,probabilities_language,lattice_skeleton) #emMethods.calcAccuracy(gold_tag_sequence,best_tag_sequence) #raw_input() #this will create the parameters total_corpus_probability = 0.0 (status,output) = commands.getstatusoutput(run_training) print 'we just ran the training' prob_match = probability_re.search(output) if prob_match == None : print'we should have found a probability' else : print 'the probability is %s'%prob_match.group(1) total_corpus_probability = float(prob_match.group(1)[2:len(prob_match.group(1))]) print 'reading channel fractional counts' emMethods.readCarmelFractionalCounts('cipher.fst.trained',fractional_counts_channel,'channel') print 'read the fst' print' the probability of the corpus was %f' %total_corpus_probability print 'we are now checking the accuracies' noe_command = 'cat tagging.fsa | sed \'s/*e*//g\' > tagging.fsa.noe' (status,output) = commands.getstatusoutput(noe_command) print 'we wrote the noe fsa' viterbi_command = r'cat cipher.data 
| ./carmel.static -srbk -QEWI 1 cipher.wfsa.noe cipher.fst > decipherment_output' (status,output) = commands.getstatusoutput(viterbi_command) #tagged_sequence = emMethods.readTaggingOutput('tagging_output') deciphered_sequence = emMethods.readCipherFile('decipherment_output') accuracy = emMethods.calcAccuracy(gold_cipher,deciphered_sequence) print 'The accuracy was %s and the objective function value was %s'%(str(accuracy),str(evaluateObjectiveFuncValue(total_corpus_probability,probabilities_channel,alpha,sigma))) #first optimizing the channel current_fractional_counts = fractional_counts_channel createParameters(probabilities_channel,current_fractional_counts,free_parameters_channel,alpha,sigma) temp_channel_probs = dict(probabilities_channel) init_option = 'current_prob' current_optimization_params = 'channel' algencan.solvers(evalf,evalg,evalh,evalc,evaljac,evalhc,evalfc,evalgjac,evalhl,evalhlp,inip,endp) #I should check if the objective function is increasing channel_probs_after_init_current_prob = copy.deepcopy(probabilities_channel) #obj_val1 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_language,probabilities_channel,alpha,sigma) total_obj_val1 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_channel,alpha,sigma) obj_val1 = evaluateOptimizationFunction(initial_parameter_args,probabilities_channel,fractional_counts_channel,alpha,sigma) print 'the function value was obj 1 %f'%obj_val1 #emMethods.clearAlphaBeta(lattice_skeleton) init_option = 'zeros' current_optimization_params = 'channel' algencan.solvers(evalf,evalg,evalh,evalc,evaljac,evalhc,evalfc,evalgjac,evalhl,evalhlp,inip,endp) channel_probs_after_init_zeros = copy.deepcopy(probabilities_channel) #obj_val2 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_language,probabilities_channel,alpha,sigma)a total_obj_val2 = evaluateObjectiveFuncValue(total_corpus_probability,probabilities_channel,alpha,sigma) obj_val2 = 
evaluateOptimizationFunction(initial_parameter_args,probabilities_channel,fractional_counts_channel,alpha,sigma) print 'the function value was obj 2 %f'%obj_val2 #emMethods.clearAlphaBeta(lattice_skeleton) if (total_obj_val1 >= total_obj_val2) : #init_option = 'current_prob' #current_optimization_params = 'tag_bigrams' if (obj_val1 < obj_val2) : print 'the final objective function value was opposite' probabilities_channel = copy.deepcopy(channel_probs_after_init_current_prob) #algencan.solvers(evalf,evalg,evalh,evalc,evaljac,evalhc,evalfc,evalgjac,evalhl,evalhlp,inip,endp) print 'the final objective function value was obj 1 %f'%total_obj_val1 else : #init_option = 'zeros' #current_optimization_params = 'tag_bigrams' probabilities_channel = copy.deepcopy(channel_probs_after_init_zeros) if (obj_val2 < obj_val1) : print 'the final objective function value was opposite' print 'the final objective function value was obj 2 %f'%total_obj_val2 #raw_input() # emMethods.reEstimateProbabilities(probabilities_channel,probabilities_language,fractional_counts_channel,fractional_counts_language) #emMethods.reEstimateProbabilities(probabilities_channel,probabilities_language,fractional_counts_channel,fractional_counts_language) #print 'writing the fsa' #now writing the fsa back again #emMethods.writeFsa('tagging.fsa',probabilities_language) emMethods.writeFst('cipher.fst',probabilities_channel) #print 'checking the zeros in tag bigram model' #checkZeros(probabilities_language) print 'checking the zeros in channel model' checkZeros(probabilities_channel) #fractional_counts_language.clear() #fractional_counts_channel.clear() #fractional_counts_language = copy.deepcopy(free_parameters_language) fractional_counts_channel = copy.deepcopy(free_parameters_channel)