def main():
    """Exercise the simplehmm module end to end.

    Trains an HMM, runs Viterbi decoding, retrains with Laplace smoothing,
    then saves the model and loads it back into a fresh HMM object.
    Relies on module-level test data (test_hmm_states, test_hmm_observ,
    train_data, test_data).
    """
    # Build and train the first HMM, then verify and display it.
    test_hmm = simplehmm.hmm('Test HMM', test_hmm_states, test_hmm_observ)
    test_hmm.train(train_data)
    test_hmm.check_prob()
    test_hmm.print_hmm()

    # Viterbi-decode every test sequence (results are not inspected here;
    # this only exercises the decoder).
    for test_rec in test_data:
        [state_sequence, sequence_probability] = test_hmm.viterbi(test_rec)

    # Second HMM: same training data, but with Laplace smoothing applied.
    test_hmm2 = simplehmm.hmm('Test HMM 2', test_hmm_states, test_hmm_observ)
    test_hmm2.train(train_data, smoothing='laplace')

    # Persist the smoothed model to a text file.
    test_hmm2.save_hmm('testhmm2.hmm')

    # Round trip: load the saved model into a fresh (dummy) HMM and print it.
    test_hmm3 = simplehmm.hmm('Test HMM 3', ['dummy'], ['dummy'])
    test_hmm3.load_hmm('testhmm2.hmm')
    test_hmm3.print_hmm()
def tracker(data, rid, trange):
    """Train a five-state HMM on consecutive windows of records from ``data``.

    Each training sequence holds up to ``trange`` consecutive records, every
    element being the pair ``(data[i][rid], data[i][2])``.

    Bug fix: the original advanced the index with ``i = i + j`` inside the
    window loop, which re-used the first record of every window (j == 0 adds
    nothing) and looped forever when ``trange == 1``.  The index now moves
    forward by exactly one record per element.

    Returns the trained ``simplehmm.hmm`` object.
    """
    train_data = []
    i = 0
    while i < len(data):
        seq = []
        for _ in range(trange):
            if i >= len(data):
                break
            seq.append((data[i][rid], data[i][2]))
            i += 1
        train_data.append(seq)

    # Five hidden states; observation symbols are all 5-character binary
    # strings ('00000' .. '11111').
    states = ['1', '2', '3', '4', '5']
    observ = ["".join(bits) for bits in itertools.product("01", repeat=5)]

    hmm1 = simplehmm.hmm('Test HMM', states, observ)
    hmm1.train(train_data)
    return hmm1
def tracker(data, rid, trange):
    """Train a five-state HMM on consecutive windows of records from ``data``.

    Duplicate of the earlier ``tracker`` definition (this later definition
    shadows the first at import time); kept with the same fix for
    consistency.

    Bug fix: the original ``i = i + j`` indexing duplicated the first record
    of each window (j == 0) and never terminated for ``trange == 1``; the
    index now advances one record at a time.
    """
    train_data = []
    idx = 0
    n = len(data)
    while idx < n:
        window = []
        for _ in range(trange):
            if idx >= n:
                break
            window.append((data[idx][rid], data[idx][2]))
            idx += 1
        train_data.append(window)

    # Observations are every 5-bit binary string; five hidden states.
    states = ['1', '2', '3', '4', '5']
    observ = ["".join(seq) for seq in itertools.product("01", repeat=5)]

    hmm1 = simplehmm.hmm('Test HMM', states, observ)
    hmm1.train(train_data)
    return hmm1
print 'Set of tags found in HMM training file:' print ' %s' % (', '.join(tag_list)) print print 'Set of HMM states found in HMM training file:' print ' %s' % (', '.join(state_list)) print print 'Parsed %d training records:' % (len(train_rec_list)) for train_rec in train_rec_list: print ' %s' % (train_rec) print # Initalise HMM and train it with training data - - - - - - - - - - - - - - - - # hmm_name = 'Febrl HMM based on training file "%s"' % (hmm_training_file) hmm_states = list(state_set) hmm_observ = list(tag_set) train_hmm = simplehmm.hmm(hmm_name, hmm_states, hmm_observ) # Train, print and save the HMM - - - - - - - - - - - - - - - - - - - - - - - - # train_hmm.train(train_rec_list, hmm_smoothing) train_hmm.print_hmm() # Save trained HMM - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # train_hmm.save_hmm(hmm_model_file) # =============================================================================
def testHMM(self):  # - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  """Test basic HMM functionality.

     Checks a freshly created HMM is all-zero, trains it with no smoothing,
     Laplace smoothing and absolute discounting (verifying probabilities and
     Viterbi decoding after each), then saves and reloads the model and
     checks the loaded copy matches numerically.

     Bug fixes versus the original: the final B-matrix comparison used
     ``hmm1.B`` on both sides (it could never fail) and now compares against
     ``hmm2.B``; the dead ``hmm2 = hmm1`` assignment was removed; typos in
     two assert messages ('tate list', 'differnt') were corrected.
  """

  hmm1 = simplehmm.hmm('Test HMM', self.states, self.observ)

  # A freshly created HMM must have the right dimensions and all-zero
  # probability matrices.
  assert (hmm1.N == len(self.states)), \
         'Illegal number of states in HMM ('+str(hmm1.N)+'), should be: '+ \
         str(len(self.states))
  assert (len(hmm1.S_ind) == len(self.states)), \
         'Illegal number of states in HMM state dictionary ('+ \
         str(len(hmm1.S_ind))+'), should be: '+str(len(self.states))
  assert (hmm1.M == len(self.observ)), \
         'Illegal number of observations in HMM ('+str(hmm1.M)+ \
         '), should be: '+str(len(self.observ))
  assert (len(hmm1.O_ind) == len(self.observ)), \
         'Illegal number of observations in HMM observation dictionary ('+ \
         str(len(hmm1.O_ind))+'), should be: '+ str(len(self.observ))

  for i in range(hmm1.N):
    assert (hmm1.pi[i] == 0.0), \
           'Initial probability in HMM 1 is not 0.0 at location ['+ \
           str(i)+']: '+str(hmm1.pi[i])
    for j in range(hmm1.N):
      assert (hmm1.A[i][j] == 0.0), \
             'Transition probability in HMM 1 is not 0.0 at location ['+ \
             str(i)+','+str(j)+']: '+str(hmm1.A[i][j])
    for j in range(hmm1.M):
      assert (hmm1.B[i][j] == 0.0), \
             'Observation probability in HMM 1 is not 0.0 at location ['+ \
             str(i)+','+str(j)+']: '+str(hmm1.B[i][j])

  # Train without smoothing, then with each supported smoothing method,
  # re-checking probability ranges and Viterbi decoding after every run.
  for smoothing in (None, 'laplace', 'absdiscount'):
    if (smoothing == None):
      hmm1.train(self.train_data)
    else:
      hmm1.train(self.train_data, smoothing=smoothing)
    hmm1.check_prob()
    hmm1.print_hmm()
    self._checkProbRanges(hmm1)
    self._checkViterbi(hmm1)

  # Save the trained HMM, load it into a new object and verify the loaded
  # model is numerically identical (within self.delta).
  hmm1.save_hmm('testhmm.hmm')

  hmm2 = simplehmm.hmm('Test2 HMM', ['dummy'], ['dummy'])
  hmm2.load_hmm('testhmm.hmm')

  assert (hmm1.N == hmm2.N), \
         'Loaded HMM has different number of states'
  assert (hmm1.M == hmm2.M), \
         'Loaded HMM has different number of observations'

  for i in range(hmm1.N):
    assert (abs(hmm1.pi[i] - hmm2.pi[i]) < self.delta), \
           'Initial probability in HMM 1 is different from HMM 2: '+ \
           str(hmm1.pi[i])+' / '+str(hmm2.pi[i])
    for j in range(hmm1.N):
      assert (abs(hmm1.A[i][j] - hmm2.A[i][j]) < self.delta), \
             'Transition probability in HMM 1 is different from HMM 2 '+ \
             'at location ['+str(i)+','+str(j)+']: '+str(hmm1.A[i][j])+ \
             ' / '+str(hmm2.A[i][j])
    for j in range(hmm1.M):
      # Bug fix: original compared hmm1.B with itself here.
      assert (abs(hmm1.B[i][j] - hmm2.B[i][j]) < self.delta), \
             'Observation probability in HMM 1 is different from HMM 2 '+ \
             'at location ['+str(i)+','+str(j)+']: '+str(hmm1.B[i][j])+ \
             ' / '+str(hmm2.B[i][j])

def _checkProbRanges(self, hmm1):
  """Assert every pi/A/B entry of the trained HMM lies in [0.0, 1.0]."""
  for i in range(hmm1.N):
    assert ((hmm1.pi[i] >= 0.0) and (hmm1.pi[i] <= 1.0)), \
           'Initial probability in HMM 1 is not between 0.0 and 1.0 at '+ \
           'location ['+str(i)+']: '+str(hmm1.pi[i])
    for j in range(hmm1.N):
      assert ((hmm1.A[i][j] >= 0.0) and (hmm1.A[i][j] <= 1.0)), \
             'Transition probability in HMM 1 is not between 0.0 and 1.0'+ \
             ' at location ['+str(i)+','+str(j)+']: '+str(hmm1.A[i][j])
    for j in range(hmm1.M):
      assert ((hmm1.B[i][j] >= 0.0) and (hmm1.B[i][j] <= 1.0)), \
             'Observation probability in HMM 1 is not between 0.0 and '+ \
             '1.0 at location ['+str(i)+','+str(j)+']: '+str(hmm1.B[i][j])

def _checkViterbi(self, hmm1):
  """Assert Viterbi decoding yields known states and a valid probability."""
  for test_rec in self.test_data:
    [state_seq, seq_prob] = hmm1.viterbi(test_rec)
    for state in state_seq:
      assert (state in self.states), \
             'Returned state "'+state+'" not in state list'
    assert ((seq_prob >= 0.0) and (seq_prob <= 1.0)), \
           'Sequence probability is not between 0.0 and 1.0:'+ str(seq_prob)
def trainhmm(): """Main routine, open file, read lines, train HMM and save it to file. USAGE: trainhmm() ARGUMENTS: None DESCRIPTION: Main routine, see description of module above. """ # Process command line arguments and check for correctness - - - - - - - - - # if (len(config.options) < 3): print '***** Error: %s needs at least four arguments:'% (sys.argv[0]) print '***** - Name of the project module' print '***** - Tagging mode: "name" or "locality"' print '***** - Input training file name' print '***** - HMM output file name' print '***** plus options' raise Exception() if (config.options[1] == config.options[2]): print '*** Error: Input and output files must differ' print '*** Input training file name:', config.options[1] print '*** HMM output file name: ', config.options[1] raise Exception() in_file_name = config.options[1] hmm_file_name = config.options[2] # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - - # tag_mode = config.options[0] if (tag_mode in ['name','na','n']): tag_mode = 'name' elif (tag_mode in ['locality','lolty','loc','l']): tag_mode = 'loc' else: print '***** Error: Illegal tagging mode:', tag_mode print '***** Must be either "name" or "locality"' raise Exception() # Check for optional arguments and process if any - - - - - - - - - - - - - - # config.verbose = 0 # Default: No verbose output config.logging = 0 # Default: No logging into a file smoothing = None # Default: No smoothing config.nowarn = 0 # Deactivate no warning flag (print/log warning # messages) if (len(config.options) > 3): options = config.options[3:] while (options != []): # Do a loop processing all options if (options[0] == '-nowarn'): config.nowarn = 1 # Activate no warning flag options = options[1:] # Remove processed '-nowarn' option elif (options[0] == '-v1'): config.verbose = 1 # Set to verbose output level 1 options = options[1:] # Remove processed '-v1' option elif (options[0] == '-v2'): config.verbose = 2 # Set to verbose output level 2 
options = options[1:] # Remove processed '-v2' option elif (options[0] == '-l'): config.logging = 1 if (len(options) > 1): if (options[1][0] != '-'): # Not another option, must be a file name config.log_file = options[1] # Get name of log file options = options[1:] # Remove file_name options = options[1:] # Remove processed -'l' option only try: f_log = open(config.log_file,'a') # Test if file is appendable except: print '***** Error ********************', print '***** Cannot write to log file: '+config.log_file raise IOError() # Write (append) header to log file # f_log.write(os.linesep) f_log.write('##################################################') f_log.write("############"+os.linesep) f_log.write("#"+os.linesep) f_log.write("# 'pyTrainHMM.py - Version 0.1' process started at: ") f_log.write(time.ctime(time.time())+os.linesep) f_log.write("#"+os.linesep) f_log.write("# Input file name: "+in_file_name+os.linesep) f_log.write("# HMM file name: "+hmm_file_name+os.linesep) f_log.write(os.linesep) f_log.close() elif (options[0] == '-s'): smoothing = 1 # Set to do a HMM smoothing smoothing = options[1] if (smoothing in ['l','la','lap','laplac','laplace']): smoothing = 'laplace' elif (smoothing in ['a','ad','abs','absd','absdis','absdisc',\ 'absdiscount']): smoothing = 'absdiscount' else: # Illegal value print "*** Error: Illegal value for 'smoothing' argument:", smoothing print "*** Possible are: 'laplace' or 'absdiscount'" raise Exception() options = options[2:] # Remove processed option else: print '*** Error: Illegal option:', options[0] raise Exception() # Get HMM states and observations from configuration module - - - - - - - - - # if (tag_mode == 'name'): state_list = config.name_hmm_states obser_list = config.name_hmm_obser else: state_list = config.geoloc_hmm_states obser_list = config.geoloc_hmm_obser # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # try: f_in = open(in_file_name,'r') except: inout.log_message('Cannot open 
input file: '+in_file_name,'err') raise IOError() line_count = 0 # Counter for lines read rec_count = 0 # Counter for training records read # Read lines, discard comment lines and process training data lines - - - - - # training_data = [] # List of training records train_list = [] # List of training sequences (dictionaries), extracted from # training data for line in xreadlines.xreadlines(f_in): if (line[0] != '#') and (line.strip() != ''): # Line must contain a training record line = line.strip() # Remove line separators config.curr_line = line # Make a copy of the unprocessed current line line_list = line.split(',') # Split into a list of elements line_data = [] # Training data list for one training record inout.log_message(['Record number: '+str(rec_count)],'v1') config.curr_line_no = line_count # Store current line number for elem in line_list: [k,v] = elem.split(':') # Split into key and value tag = k.strip() state = v.strip() line_data.append((state,tag)) if (state not in state_list): msg = ['Illegal state name in training record: '+state, \ 'Line: '+str(line_count)+', record: '+str(rec_count), \ 'Possible values: '+str(state_list)] inout.log_message(msg,'err') raise Exception() if (tag not in obser_list): msg = ['Illegal observation (tag) name in training record: '+tag, \ 'Line: '+str(line_count)+', record: '+str(rec_count), \ 'Possible values: '+str(obser_list)] inout.log_message(msg,'err') raise Exception() inout.log_message(' Training record '+str(rec_count)+':'+ \ str(line_data),'v1') train_list.append(line_data) rec_count += 1 inout.log_message('','v1') # Print empty lines between records line_count += 1 # Close input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # f_in.close() inout.log_message('','v1') # Print empty lines between records # Initalise HMM and train it with training data - - - - - - - - - - - - - - - # myhmm = simplehmm.hmm(state_list, obser_list) myhmm.train(train_list,smoothing) myhmm.print_hmm() # Save trained HMM - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - # myhmm.save_hmm(hmm_file_name) inout.log_message(['Read '+str(line_count)+' lines, processed '+ \ str(rec_count)+' training records', 'End.'],'v1')
__author__ = 'lucas Ferreira'

import simplehmm
import re

# Build one training sequence per motif line of 'teste.mtv': every word
# character of a motif is paired with its position index, i.e.
# [['0', s0], ['1', s1], ...].
#
# Bug fix: the original 'open(...).read()' leaked the file handle; the file
# is now closed via a with-statement.  The manual position counter was
# replaced by enumerate (same values, same order).
treinamento = []
with open('teste.mtv', 'r') as arquivo_motivos_file:
    arquivo_motivos = arquivo_motivos_file.read()

for motivo in arquivo_motivos.split('\n'):
    motivo_treino = []
    for contador, x in enumerate(re.findall(r'\w', motivo)):
        motivo_treino.append([str(contador), x])
    treinamento.append(motivo_treino)

# Eleven position states ('0'..'10'), four nucleotide observations; train
# with absolute-discount smoothing, then display and save the model.
cirRNA_hmm = simplehmm.hmm('circ_rna primeiros testes',
                           ['0','1','2','3','4','5','6','7','8','9','10'],
                           ['A','C','T','G'])
cirRNA_hmm.train(treinamento, smoothing='absdiscount')
cirRNA_hmm.print_hmm()
cirRNA_hmm.save_hmm('circRNA_FRAGMENTADA.hmm')
treinar.append(train) print classificar_vetor print(teste.get_dir()) #teste.set_pontas_porcentagem(15) #print(teste.get_pontas()) #organizar #print (organizar[0]) #print(sorted(organizar)) # esse comando deve ser feito usando o primeiro objeto ZERO e nao UM print (sorted(train)) test_hmm_states = ['1','3', '2'] #test_hmm_observ=[] test_hmm = simplehmm.hmm('15 porcento lncRNA', test_hmm_states, nomes) #print test_hmm_observ ''' temos que colocar todos os arquivos train dentro da tabela com o append por meio de um FOR A TABELA DE TREINAMENTO TEM QUE SAIR ORGANIZADA E NaO ESTa ''' #treinar.append(train) print '---------------------',treinar,'---------------------------------' #print test_hmm.check_prob() test_hmm.train(treinar, smoothing='absdiscount') #print test_hmm.check_prob() #print test_hmm.print_hmm() #test_hmm.save_hmm("setubal/15_lncrna.hmm")
def testHMM(self):  # - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    """Test basic HMM functionality.

    Checks a freshly created HMM is all-zero, trains it without smoothing,
    with Laplace smoothing and with absolute discounting (verifying
    probability ranges and Viterbi decoding after each run), then saves and
    reloads the model and checks the loaded copy matches numerically.

    Bug fixes versus the original: the final B-matrix comparison used
    ``hmm1.B`` on both sides (it could never fail) and now compares against
    ``hmm2.B``; the dead ``hmm2 = hmm1`` assignment was removed; typos in
    assert messages ('tate list', 'differnt') were corrected.
    """
    hmm1 = simplehmm.hmm("Test HMM", self.states, self.observ)

    # A freshly created HMM must have the right dimensions ...
    assert hmm1.N == len(self.states), (
        "Illegal number of states in HMM ("
        + str(hmm1.N)
        + "), should be: "
        + str(len(self.states))
    )
    assert len(hmm1.S_ind) == len(self.states), (
        "Illegal number of states in HMM state dictionary ("
        + str(len(hmm1.S_ind))
        + "), should be: "
        + str(len(self.states))
    )
    assert hmm1.M == len(self.observ), (
        "Illegal number of observations in HMM ("
        + str(hmm1.M)
        + "), should be: "
        + str(len(self.observ))
    )
    assert len(hmm1.O_ind) == len(self.observ), (
        "Illegal number of observations in HMM observation dictionary ("
        + str(len(hmm1.O_ind))
        + "), should be: "
        + str(len(self.observ))
    )

    # ... and all-zero probability matrices.
    for i in range(hmm1.N):
        assert hmm1.pi[i] == 0.0, (
            "Initial probability in HMM 1 is not 0.0 at location ["
            + str(i)
            + "]: "
            + str(hmm1.pi[i])
        )
        for j in range(hmm1.N):
            assert hmm1.A[i][j] == 0.0, (
                "Transition probability in HMM 1 is not 0.0 at location ["
                + str(i)
                + ","
                + str(j)
                + "]: "
                + str(hmm1.A[i][j])
            )
        for j in range(hmm1.M):
            assert hmm1.B[i][j] == 0.0, (
                "Observation probability in HMM 1 is not 0.0 at location ["
                + str(i)
                + ","
                + str(j)
                + "]: "
                + str(hmm1.B[i][j])
            )

    # Train without smoothing, then with each smoothing method, re-checking
    # probability ranges and Viterbi decoding after every run.
    for smoothing in (None, "laplace", "absdiscount"):
        if smoothing is None:
            hmm1.train(self.train_data)
        else:
            hmm1.train(self.train_data, smoothing=smoothing)
        hmm1.check_prob()
        hmm1.print_hmm()
        self._assert_prob_ranges(hmm1)
        self._assert_viterbi(hmm1)

    # Save, reload into a new object, and verify the loaded model matches
    # the saved one within self.delta.
    hmm1.save_hmm("testhmm.hmm")

    hmm2 = simplehmm.hmm("Test2 HMM", ["dummy"], ["dummy"])
    hmm2.load_hmm("testhmm.hmm")

    assert hmm1.N == hmm2.N, "Loaded HMM has different number of states"
    assert hmm1.M == hmm2.M, "Loaded HMM has different number of observations"
    for i in range(hmm1.N):
        assert abs(hmm1.pi[i] - hmm2.pi[i]) < self.delta, (
            "Initial probability in HMM 1 is different from HMM 2: "
            + str(hmm1.pi[i])
            + " / "
            + str(hmm2.pi[i])
        )
        for j in range(hmm1.N):
            assert abs(hmm1.A[i][j] - hmm2.A[i][j]) < self.delta, (
                "Transition probability in HMM 1 is different from HMM 2 "
                + "at location ["
                + str(i)
                + ","
                + str(j)
                + "]: "
                + str(hmm1.A[i][j])
                + " / "
                + str(hmm2.A[i][j])
            )
        for j in range(hmm1.M):
            # Bug fix: original compared hmm1.B with itself here.
            assert abs(hmm1.B[i][j] - hmm2.B[i][j]) < self.delta, (
                "Observation probability in HMM 1 is different from HMM 2 "
                + "at location ["
                + str(i)
                + ","
                + str(j)
                + "]: "
                + str(hmm1.B[i][j])
                + " / "
                + str(hmm2.B[i][j])
            )

def _assert_prob_ranges(self, hmm1):
    """Assert every pi/A/B entry of the trained HMM lies in [0.0, 1.0]."""
    for i in range(hmm1.N):
        assert (hmm1.pi[i] >= 0.0) and (hmm1.pi[i] <= 1.0), (
            "Initial probability in HMM 1 is not between 0.0 and 1.0 at "
            + "location ["
            + str(i)
            + "]: "
            + str(hmm1.pi[i])
        )
        for j in range(hmm1.N):
            assert (hmm1.A[i][j] >= 0.0) and (hmm1.A[i][j] <= 1.0), (
                "Transition probability in HMM 1 is not between 0.0 and 1.0"
                + " at location ["
                + str(i)
                + ","
                + str(j)
                + "]: "
                + str(hmm1.A[i][j])
            )
        for j in range(hmm1.M):
            assert (hmm1.B[i][j] >= 0.0) and (hmm1.B[i][j] <= 1.0), (
                "Observation probability in HMM 1 is not between 0.0 and "
                + "1.0 at location ["
                + str(i)
                + ","
                + str(j)
                + "]: "
                + str(hmm1.B[i][j])
            )

def _assert_viterbi(self, hmm1):
    """Assert Viterbi decoding yields known states and a valid probability."""
    for test_rec in self.test_data:
        [state_seq, seq_prob] = hmm1.viterbi(test_rec)
        for state in state_seq:
            assert state in self.states, (
                'Returned state "' + state + '" not in state list'
            )
        assert (seq_prob >= 0.0) and (seq_prob <= 1.0), (
            "Sequence probability is not between 0.0 and 1.0:" + str(seq_prob)
        )
'locql','pc','ter1','ter2','cntr1','cntr2','rubb'] geoloc_hmm_obser = ['PC','N4','NU','AN','TR','CR','LN','ST','IN','IT', \ 'LQ','WT','WN','UT','HY','SL','CO','VB','PA','UN', \ 'RU'] # ============================================================================= # Dictionary of month name abbreviations (used in date.str2date() routine) month_abbrev_dict = {'jan':1, 'feb':2, 'mar':3, 'apr':4, 'may':5, 'jun':6, \ 'jul':7, 'aug':8, 'sep':9, 'oct':10, 'nov':11, 'dec': 12} # ============================================================================= # If Hidden Markov Model standardisation methods are activate load HMM(s) if (project.name_standard_method == 'hmm'): name_hmm = simplehmm.hmm([], []) # Create new empty HMM object name_hmm.load_hmm(project.name_hmm_file_name) if (project.geoloc_standard_method == 'hmm'): geoloc_hmm = simplehmm.hmm([], []) # Create new empty HMM object geoloc_hmm.load_hmm(project.geoloc_hmm_file_name) # ============================================================================= # List of all supported data file types # # File type names must have a length of 3 characters, or 4 characters if the # file type is quoted (in which case the last character must be a 'Q') # # Currently supported file types are: # CSV - Comma separated values, fields separated by commas # CSVQ - Comma separated values, where each field starts and ends with
def tagdata(): """Main routine, open file, read lines, tag data records, write to out-file. USAGE: tagdata() ARGUMENTS: None DESCRIPTION: Main routine, see description of module above. """ # Process command line arguments and check for correctness - - - - - - - - - # if (len(config.options) < 5): print '***** Error: %s needs at least six arguments:'% (sys.argv[0]) print '***** - Name of the project module' print '***** - Tagging mode: "name" or "locality"' print '***** - Output training file name' print '***** - Start of block with training records' print '***** - End of block with training records' print '***** - Number of training records' print '***** plus options' raise Exception() if (config.in_file_name == config.options[2]): print '***** Error: Input and output files must differ' print '***** Input file name: ', config.in_file_name print '***** Output training file name:', config.options[2] raise Exception() first_rec = int(config.options[2]) last_rec = int(config.options[3]) num_rec = int(config.options[4]) in_file_name = config.in_file_name out_file_name = config.options[1] # Check record number values - - - - - - - - - - - - - - - - - - - - - - - - # if (int(first_rec) >= int(last_rec)) or \ ((int(num_rec)-1) > (int(last_rec)-int(first_rec))): print '***** Error: Illegal values for training records block:' print '***** - Start of block with training records:', first_rec print '***** - End of block with training records: ', last_rec print '***** - Number of training records: ', num_rec raise Exception() rec_range = last_rec-first_rec-1 # Range of records in input file # Open input file and check number of available records - - - - - - - - - - - # try: f_in = open(in_file_name,'r') except: inout.log_message('Cannot open input file: '+in_file_name,'err') raise IOError() line_count = 0 for line in f_in.xreadlines(): line_count += 1 f_in.close() if (last_rec > line_count): # Illegal value for last record print '***** Error: Illegal values for last training 
records:', last_rec print '***** File only contains',line_count, 'lines/records' raise Exception() # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - - # tag_mode = config.options[0] if (tag_mode in ['name','na','n']): tag_mode = 'name' elif (tag_mode in ['locality','localty','loc','l']): tag_mode = 'loc' else: print '***** Error: Illegal tagging mode:', tag_mode print '***** Must be either "name" or "locality"' raise Exception() # Check for optional arguments and process if any - - - - - - - - - - - - - - # config.verbose = 0 # Default: No verbose output config.logging = 0 # Default: No logging into a file hmm_file_name = None # Default: Do not use HMM to standardise training # records retag_file_name = None # Default: Do not retag an existing training file config.nowarn = 0 # Deactivate no warning flag (print/log warning # messages) freqs_file_name = None # Default: Do not write frequencies, no -freqs option if (len(config.options) > 5): options = config.options[5:] while (options != []): # Do a loop processing all options if (options[0] == '-nowarn'): config.nowarn = 1 # Activate no warning flag options = options[1:] # Remove processed '-nowarn' option elif (options[0] == '-v1'): config.verbose = 1 # Set to verbose output level 1 options = options[1:] # Remove processed '-v1' option elif (options[0] == '-v2'): config.verbose = 2 # Set to verbose output level 2 options = options[1:] # Remove processed '-v2' option elif (options[0] == '-l'): config.logging = 1 if (len(options) > 1): if (options[1][0] != '-'): # Not another option, must be a file name config.log_file = options[1] # Get name of log file options = options[1:] # Remove file_name options = options[1:] # Remove processed -'l' option only try: f_log = open(config.log_file,'a') # Test if file is appendable except: print '***** Error ********************', print '***** Cannot write to log file: '+config.log_file raise IOError() # Write (append) header to log file # 
f_log.write(os.linesep) f_log.write('##################################################') f_log.write('############'+os.linesep) f_log.write('#'+os.linesep) f_log.write("# 'pyTagData.py - Version 0.1' process started at: ") f_log.write(time.ctime(time.time())+os.linesep) f_log.write('#'+os.linesep) f_log.write("# Input file name: "+in_file_name+os.linesep) f_log.write("# Output file name: "+out_file_name+os.linesep) f_log.write("# Tagging mode: "+tag_mode+os.linesep) f_log.write(os.linesep) f_log.close() elif (options[0] == '-hmm'): hmm_file_name = options[1] # Get file name of the HMM to use if (hmm_file_name == out_file_name): print '***** Error: HMM file name is the same as output file name!' raise Exception() try: f_in = open(hmm_file_name,'r') # Test if file is available except: print '***** Error: Cannot open HMM file specified in "-hmm"', print 'option:', hmm_file_name raise IOError() f_in.close() options = options[2:] # Remove processed '-hmm' option and file name elif (options[0] == '-retag'): if (hmm_file_name == None) and ('-hmm' not in options): print '***** Error: "-retag" option can only be used together with', print '"-hmm" option (which is not given).' raise Exception() retag_file_name = options[1] # Get file name of the already-tagged # file to re-process if (retag_file_name == out_file_name): print '***** Error: Retag file name is the same as output file name!' raise Exception() elif (retag_file_name == in_file_name): print '***** Error: Retag file name is the same as input file name!' raise Exception() elif (retag_file_name == hmm_file_name): print '***** Error: Retag file name is the same as HMM file name!' raise Exception() try: f_in = open(retag_file_name,'r') # Test if file is available # Now gather record numbers and previous tags/states, as well as the # original header information. Use a simple state machine to do this. 
# tagged_recs = {} cleaned_recs = {} original_header_lines = [] state = -1 # Header lines state prevline = '' for line in f_in.xreadlines(): # Read training file and process it line = line.strip() if (state == -1) and (len(line) == 0): # End of header lines state = 0 prevline = line continue if (state == -1) and (len(line) > 0) and (line[0] == "#"): original_header_lines.append("# " + line) prevline = line continue sline = line.split(' ') if (len(sline) > 2) and (len(sline[2]) > 3) and (sline[0] == '#') \ and (sline[2][0] == '(') and (sline[2][-2:] == '):'): try: rec = int(sline[1]) # Original record number tagged_recs[rec] = None cleaned_recs[rec] = None state = 1 except: pass prevline = line continue if (state == 1) and (len(line) > 0) and (line[0] != '#'): tagged_recs[rec] = line cleaned_recs[rec] = prevline state = 0 prevline = line continue if (state == 1) and (len(line) > 0): prevline = line continue f_in.close() tagged_recs_keys = tagged_recs.keys() num_rec = len(tagged_recs_keys) # Override specified numbers first_rec = 0 last_rec = line_count except: print '***** Error: Cannot open tagged training file specified', print 'in "-retag" option:', retag_file_name raise IOError() options = options[2:] # Remove processed '-retag' option and file name elif (options[0][:5] == '-freq'): if (hmm_file_name == None) and ('-hmm' not in options): print '***** Error: "-feqs" option can only be used together with', print '"-hmm" option (which is not given).' raise Exception() freqs_file_name = options[1] # File name to write the frequencies to if (freqs_file_name == out_file_name): print '***** Error: Frequency file name is the same as output', print 'file name!' raise Exception() elif (freqs_file_name == in_file_name): print '***** Error: Frequency file name is the same as input', print 'file name!' raise Exception() elif (freqs_file_name == hmm_file_name): print '***** Error: Frequency file name is the same as HMM', print 'file name!' 
raise Exception() options = options[2:] # Remove processed '-freqs' option and file name try: # Check if file writing is possible freqs_out = open(freqs_file_name,'w') freqs_out.close() except: print '***** Error: Cannot write to frequency output file specified', print 'in "-freqs" option:', freqs_file_name raise IOError() else: print '***** Error: Illegal option:', options[0] raise Exception() # If specified initalise and load Hidden Markov Model (HMM) - - - - - - - - - # if (hmm_file_name != None): myhmm = simplehmm.hmm([],[]) # Create new empty HMM object myhmm.load_hmm(hmm_file_name) myhmm.print_hmm() # Print HMM (according to verbose and logging level) # Open output file and write header - - - - - - - - - - - - - - - - - - - - - # try: f_out = open(out_file_name,'w') except: inout.log_message('Cannot open output file: '+out_file_name,'err') raise IOError() f_out.write("# Tagged training data written by 'pyTagData.py -"+ \ " Version 0.1'"+os.linesep) f_out.write('#'+os.linesep) f_out.write('# Created '+time.ctime(time.time())+os.linesep) f_out.write('#'+os.linesep) f_out.write('# Input file name: '+in_file_name+os.linesep) f_out.write('# Output file name: '+out_file_name+os.linesep) f_out.write('#'+os.linesep) f_out.write('# Parameters:'+os.linesep) f_out.write('# - Start of block with training records: '+str(first_rec)+ \ os.linesep) f_out.write('# - End of block with training records: '+str(last_rec)+ \ os.linesep) f_out.write('# - Number of training records: '+str(num_rec)+ \ os.linesep) if (hmm_file_name != None): f_out.write('#'+os.linesep) f_out.write("# - Using HMM file '"+hmm_file_name+"' for standardisation"+ \ os.linesep) if (retag_file_name != None): f_out.write('#'+os.linesep) f_out.write("# - Reprocessing training file '"+retag_file_name+"'"+ \ os.linesep) f_out.write("# Header lines from original training file follow:" + \ os.linesep) for header_line in original_header_lines: f_out.write(header_line + os.linesep) if (freqs_file_name != None): 
f_out.write('#'+os.linesep) f_out.write("# - Tag/state pattern frequencies written to file '" + \ freqs_file_name + os.linesep) f_out.write('#'+'-'*70+os.linesep) f_out.write(os.linesep) rec_count = 0 # Number of selected records num_rec_left = num_rec # Number of records to be selected left rec_selected = {} # Dictionary of all record numbers that were selected seq_freqs = {} # Dict to hold examples of tag/state patterns unchanged_loop_cnt = 0 # Counter of how many loops have been done # without new records being selected prev_num_rec_left = num_rec # Number of records left in the previous # interation # Due to the random nature of selecting records, and because sometimes - - - # a selected component can be empty (and is thus not used for training) # more than one iteration over the input data set is carried out. In each # iteration, records are selected randomly. # while (rec_count < num_rec): # Loop until 'num_rec' records selected # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # try: f_in = open(in_file_name,'r') except: inout.log_message('Cannot open input file: '+in_file_name,'err') raise IOError() line_read = 0 # Number of read lines # Skip to start of training block - - - - - - - - - - - - - - - - - - - - - # if (first_rec > 0): for i in range(first_rec): f_in.readline() while (rec_count < num_rec) and (line_read <= (last_rec-first_rec)): line = f_in.readline() if ((retag_file_name != None) and (line_read in tagged_recs_keys)) or \ ((retag_file_name == None) and \ (num_rec_left >= random.randrange(0,rec_range,1))): line = line.strip() # Remove line separators config.curr_line = line # Make a copy of the unprocessed current line line = line.lower() # Make all characters lower case inout.log_message(['Record number: '+str(line_read+first_rec)],'v1') config.curr_line_no = line_read+first_rec # Store current line number # Process line and extract content into components (name, geocode, etc) # [name_comp, geocode_comp, locality_comp, 
date1_comp, date2_comp] = \ inout.process_line(line) # Select component and process it - - - - - - - - - - - - - - - - - - - # if (tag_mode == 'name'): if (type(name_comp) == types.ListType): component = name_comp[0].strip()+' '+name_comp[1].strip() else: component = name_comp.strip() else: # Locality component component = geocode_comp.strip()+' '+locality_comp.strip() if (component != '') and \ (not rec_selected.has_key((line_read+first_rec))): if (tag_mode == 'name'): inout.log_message(' Name component: |'+component+'|','v1') component = name.clean_name_component(component) [word_list, tag_list] = name.tag_name_component(component) else: # Locality component inout.log_message(' Locality component: |'+component+'|','v1') component = locality.clean_geoloc_component(component) [word_list, tag_list] = locality.tag_geoloc_component(component) if (tag_list != []): # Only process non-empty tag lists # Append record number into dictionary of processed records # rec_selected.update({(line_read+first_rec):(line_read+first_rec)}) # Create all permutation sequences of this tag list - - - - - - - - # tag_seq = mymath.perm_tag_sequence(tag_list) inout.log_message([' Word list: '+str(word_list), \ ' Tag list: '+str(tag_list), \ ' Tag sequences:'],'v2') # Do HMM processing - - - - - - - - - - - - - - - - - - - - - - - - # if (hmm_file_name != None): state_seq = [] # List containing computed HMM state sequences max_prob = -1.0 # maximal probability for a sequence max_seq_no = -1 # Number of the seq. with the max. 
probablity # Now give tag sequences to the HMMs to compute state sequences # i = 0 for t in tag_seq: [obs_seq, prob] = myhmm.viterbi(t) state_seq.append(obs_seq) if (prob > max_prob): max_prob = prob max_seq_no = i i += 1 # Write original component and resulting tag sequences to output # f_out.write('# '+str(line_read+first_rec)+' ('+str(rec_count)+ \ '): |'+component+'|'+os.linesep) # Commented original num_len = len(str(line_read+first_rec))+len(str(rec_count))+6 f_out.write('#'+num_len*' '+'|'+' '.join(word_list)+'|'+os.linesep) for i in range(len(tag_seq)): # Convert each tag sequence into a string for file output # seq_string = ' ' if (hmm_file_name != None) and (i != max_seq_no): seq_string = '# ' # Comment sequences with not max. probability for j in range(len(tag_seq[i])): if (hmm_file_name != None): seq_string = seq_string+' '+tag_seq[i][j]+':'+ \ state_seq[i][j]+',' else: seq_string = seq_string+' '+tag_seq[i][j]+':,' f_out.write(seq_string[:-1]+os.linesep) # Write without , at end inout.log_message(' '+seq_string[:-1],'v2') if (hmm_file_name != None): f_out.write('# Maximum Viterbi probability: %0.5f'% \ (max_prob) + os.linesep) inout.log_message('Maximum Viterbi probability: %0.5f'% \ (max_prob), 'v2') if (retag_file_name != None) and (tagged_recs[line_read] != None): if (tagged_recs[line_read].strip() != seq_string[:-1].strip()): f_out.write("# Note: ***** Changed *****" + os.linesep) inout.log_message(' Note:' + \ ' ***** Changed *****','v2') f_out.write('# Was: ' + tagged_recs[line_read]+os.linesep) # Write commented original tag sequence inout.log_message('Original tag sequence: '+ \ tagged_recs[line_read],'v2') f_out.write(os.linesep) # Write an empty line inout.log_message('','v1') # Print empty lines between records if (hmm_file_name != None): seq_key = seq_string[:-1] # Add sequence to dictionary if (seq_freqs.has_key(seq_key)): seq_freqs[seq_key].append(['|'+' '.join(word_list)+'|', \ max_prob]) else: seq_freqs[seq_key] = [['|'+' 
'.join(word_list)+'|', \ max_prob]] rec_count += 1 # Print process indicator message # if (config.proc_ind >= 0) and (rec_count > 0): if (rec_count % config.proc_ind == 0): print 'Processed line', rec_count, 'of', num_rec line_read += 1 f_in.close() num_rec_left = num_rec - rec_count if (prev_num_rec_left == num_rec_left): # No new records selected unchanged_loop_cnt += 1 prev_num_rec_left = num_rec_left # Set to current value if (unchanged_loop_cnt > 5): # Do five loops maximal without selecting # new records config.curr_line_no = -1 # Set to illegal/empty values, as warning is config.curr_line = '' # not related to the current input line inout.log_message(['Can not select more than '+str(rec_count)+ \ ' records for training.', \ 'This is probably due to empty input components.', \ 'Please reduce value of "num_rec" or increase ' + \ 'range','between "first_rec" and "last_rec".'],'warn') break if (num_rec_left < 10): # Only 10 records left to select num_rec_left = num_rec+1 # Set to more than 100% probablity elif (num_rec_left < (num_rec / 100.0)): # Less than 1% records left num_rec_left = int(num_rec / 100.0) # Set to 1% f_out.close() # If specified, save Viterbi frequencies to a file - - - - - - - - - - - - - # if (freqs_file_name != None): freqs_out = open(freqs_file_name,'w') # Open frequency file for writing freqs_out.write('# Frequency listing of tag/state patterns written by') freqs_out.write('"pyTagData.py - Version 0.1"'+os.linesep) freqs_out.write('#'+os.linesep) freqs_out.write('# Created '+time.ctime(time.time())+os.linesep) freqs_out.write('#'+os.linesep) freqs_out.write("# Input file name: "+in_file_name+os.linesep) freqs_out.write("# Output file name: "+out_file_name+os.linesep) freqs_out.write(os.linesep) freqs_out.write('# Parameters:'+os.linesep) freqs_out.write('# - Start of block with training records: '+ \ str(first_rec)+os.linesep) freqs_out.write('# - End of block with training records: '+ \ str(last_rec)+os.linesep) freqs_out.write('# - 
Number of training records: '+ \ str(num_rec)+os.linesep) if (hmm_file_name != None): freqs_out.write('#'+os.linesep) freqs_out.write("# - Using HMM file '"+hmm_file_name+ \ "' for standardisation"+os.linesep) if (retag_file_name != None): freqs_out.write('#'+os.linesep) freqs_out.write("# - Reprocessing training file '"+retag_file_name+ \ "'"+os.linesep) freqs_out.write('#'+'-'*70+os.linesep) freqs_out.write(os.linesep) sorted_seq_freqs = [] # Now sort sequences according to their fruequencies for key in seq_freqs.keys(): sorted_seq_freqs.append((len(seq_freqs[key]),key)) sorted_seq_freqs.sort() for skey in sorted_seq_freqs: key = skey[1] freqs_out.write('# Pattern: '+str(key)+os.linesep) freqs_out.write('# Frequency: '+str(skey[0])+os.linesep) examples = seq_freqs[key] freqs_out.write('# Maximum Viterbi probability: '+ \ str(examples[0][1])+os.linesep) freqs_out.write('# Examples: '+os.linesep) for example in examples: freqs_out.write('# '+str(example[0])+os.linesep) freqs_out.write(str(key)+os.linesep) freqs_out.write(os.linesep) freqs_out.close() inout.log_message(['Read '+str(line_read)+' lines, processed '+ \ str(rec_count)+' lines', 'End.'],'v1')
def standard(): """Main routine, open file, read lines, standardise them and write into file. USAGE: standard() ARGUMENTS: None DESCRIPTION: Main routine, see description of module above. """ # Process command line arguments and check for correctness - - - - - - - - - # if (len(config.options) < 2): print '***** Error: %s needs at least three arguments:'% (sys.argv[0]) print '***** - Name of the project module' print '***** - Number of the first record to be processed' print '***** - Number of records to be processed' print '***** plus options' raise Exception() first_rec = int(config.options[0]) num_rec = int(config.options[1]) in_file_name = config.in_file_name out_file_name = config.out_file_name # Check for optional arguments and process if any - - - - - - - - - - - - - - # config.verbose = 0 # Default: No verbose output config.logging = 0 # Default: No logging into a file write_header = 0 # Write header (output field names) to output file # (default: Don't) config.nowarn = 0 # Deactivate no warning flag (print/log warning messages) if (len(config.options) > 2): options = config.options[2:] while (options != []): # Do a loop processing all options if (options[0] == '-nowarn'): config.nowarn = 1 # Activate no warning flag options = options[1:] # Remove processed '-nowarn' option elif (options[0] == '-v1'): config.verbose = 1 # Set to verbose output level 1 options = options[1:] # Remove processed '-v1' option elif (options[0] == '-v2'): config.verbose = 2 # Set to verbose output level 2 options = options[1:] # Remove processed '-v2' option elif (options[0] == '-l'): config.logging = 1 if (len(options) > 1): if (options[1][0] != '-'): # Not another option, must be a file name config.log_file = options[1] # Get name of log file options = options[1:] # Remove file_name options = options[1:] # Remove processed -'l' option only try: f_log = open(config.log_file,'a') # Test if file is appendable except: print '***** Error ********************', print '***** Cannot 
write to log file:', config.log_file raise IOError() # Write (append) header to log file # f_log.write(os.linesep) f_log.write('##################################################') f_log.write("############"+os.linesep) f_log.write("#"+os.linesep) f_log.write("# 'pyStandard.py - Version 0.1' process started at: ") f_log.write(time.ctime(time.time())+os.linesep) f_log.write("#"+os.linesep) f_log.write("# Input file name: "+in_file_name+os.linesep) f_log.write("# Output file name: "+out_file_name+os.linesep) f_log.write(os.linesep) f_log.close() elif (options[0] == '-h'): write_header = 1 options = options[1:] # Remove processed -'h' option elif (options[0] == '-hmm-name'): hmm_name_file = options[1] # Get file name of the name HMM to use try: f_in = open(hmm_name_file,'r') # Test if file is available except: print '***** Error ********************', print '***** Cannot open HMM file in "-hmm-name" option:', print hmm_name_file raise IOError() f_in.close() options = options[2:] # Remove processed option and file name config.name_standard_method = 'hmm' config.name_hmm_file_name = hmm_name_file config.name_hmm = simplehmm.hmm([],[]) # Create new empty HMM object config.name_hmm.load_hmm(config.name_hmm_file_name) elif (options[0] == '-hmm-loc'): hmm_loc_file = options[1] # Get file name of the locality HMM to use try: f_in = open(hmm_loc_file,'r') # Test if file is available except: print '***** Error ********************', print '***** Cannot open HMM file in "-hmm-loc" option:', print hmm_loc_file raise IOError() f_in.close() options = options[2:] # Remove processed option and file name config.geoloc_standard_method == 'hmm' config.geoloc_hmm_file_name = hmm_loc_file config.geoloc_hmm = simplehmm.hmm([],[]) # Create new HMM object config.geoloc_hmm.load_hmm(config.geoloc_hmm_file_name) else: print '***** Error: Illegal option:', options[0] raise Exception() # Open input file and check number of available records - - - - - - - - - - - # try: f_in = 
open(in_file_name,'r') except: inout.log_message('Cannot open input file: '+in_file_name,'err') raise IOError() line_count = 0 for line in f_in.xreadlines(): line_count += 1 f_in.close() if ((first_rec+num_rec) > line_count): # Illegal value for last record print '***** Error: Illegal values for number of records to process:', print num__rec, ', with start record:', start_rec print '***** File only contains',line_count, 'lines/records' raise Exception() # Open files - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # try: f_in = open(in_file_name,'r') except: inout.log_message('Cannot open input file: '+in_file_name,'err') raise IOError() try: f_out = open(out_file_name,'w') except: inout.log_message('Cannot open output file: '+out_file_name,'err') raise IOError() # Write header (name of output fields) into output file - - - - - - - - - - - # if (write_header == 1): header_dict = {} for n in config.output_field_names: header_dict.update({n:n}) # Dictionary where values are field names header_line = inout.compose_line(header_dict,header=1) f_out.write(header_line+os.linesep) # Skip over records - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # if (first_rec > 0): for i in range(first_rec): f_in.readline() # Read lines, process them and write into output files - - - - - - - - - - - # line_read = 0 # Number of read lines while (line_read < num_rec): # Loop until 'num_rec' records processed line = f_in.readline() # Print process indicator message # if (config.proc_ind >= 0) and (line_read > 0): # Only print if activated if (line_read % config.proc_ind == 0): print 'Processed line', line_read, 'of', num_rec line = line.strip() # Remove line separators config.curr_line = line # Make a copy of the unprocessed current line line = line.lower() # Make all characters lower case inout.log_message(['Record '+str(line_read+first_rec)],'v1') config.curr_line_no = line_read+first_rec # Store current line number # Process line and extract content into 
components (name, geocode, etc.) # [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \ inout.process_line(line) # Make a local empty working copy of the output field dictionary - - - - - # output_fields = config.output_field.copy() output_fields_keys = output_fields.keys() for k in output_fields_keys: output_fields[k] = '' # Set all fields to an empty string # Standardise name component - - - - - - - - - - - - - - - - - - - - - - - # if (type(name_comp) == types.ListType): # Givenname and surname separate givenname_comp = name_comp[0].strip() surname_comp = name_comp[1].strip() if (givenname_comp != ''): # There is a givenname - - - - - - - - - - - inout.log_message(' Givenname component: |'+givenname_comp+'|','v1') givenname_comp = name.clean_name_component(givenname_comp) [name_list, tag_list] = name.tag_name_component(givenname_comp) output_fields['gender_guess'] = name.get_gender_guess(name_list, \ tag_list) [name_list, tag_list, output_fields['title']] = \ name.get_title(name_list, tag_list) [output_fields['givenname'], output_fields['alt_givenname']] = \ name.get_name_component(name_list, tag_list, 'gname') if (surname_comp != ''): # There is a surname - - - - - - - - - - - - - inout.log_message(' Surname component: |'+surname_comp+'|','v1') surname_comp = name.clean_name_component(surname_comp) [name_list, tag_list] = name.tag_name_component(surname_comp) [output_fields['surname'], output_fields['alt_surname']] = \ name.get_name_component(name_list, tag_list, 'sname') elif (name_comp.strip() != ''): # Given- and surname both in one field - - inout.log_message(' Name component: |'+name_comp+'|','v1') name_comp = name.clean_name_component(name_comp) [name_list, tag_list] = name.tag_name_component(name_comp) output_fields['gender_guess'] = name.get_gender_guess(name_list,tag_list) [name_list, tag_list, output_fields['title']] = \ name.get_title(name_list, tag_list) if (config.name_standard_method == 'rules'): name_dict = 
name.get_names_rules(name_list, tag_list, 'gname') elif (config.name_standard_method == 'hmm'): name_dict = name.get_names_hmm(name_list, tag_list) else: inout.log_message('Illegal name standardisation method:'+ \ config.name_standard_method,'err') raise Exception() for (field,value) in name_dict.items(): # Assign to output dictionary output_fields[field] = value # Standardise geocode and locality components using HMM - - - - - - - - - - # if (config.geoloc_standard_method == 'hmm') and \ ((geocode_comp.strip() != '') or (locality_comp.strip() != '')): geoloc_comp = geocode_comp.strip()+' '+locality_comp.strip() inout.log_message(' Geocode and locality component: |'+geoloc_comp+'|',\ 'v1') geoloc_comp = locality.clean_geoloc_component(geoloc_comp) [geoloc_words, geoloc_tags] = locality.tag_geoloc_component(geoloc_comp) if (geoloc_words != []): # Component not empty, do HMM standardisation geoloc_dict = locality.get_geoloc_hmm(geoloc_words,geoloc_tags) for (field,value) in geoloc_dict.items(): # Assign to output dictionary output_fields[field] = value # Standardise geocode component using rules - - - - - - - - - - - - - - - - # elif (config.geoloc_standard_method == 'rules') and \ (geocode_comp.strip() != ''): inout.log_message(' Geocode component: |'+geocode_comp+'|','v1') ### TO BE DONE inout.log_message('Rules based standardisation for geocode is' + \ 'not implemented yet','err') raise Exception() # Standardise locality component using rules - - - - - - - - - - - - - - - # elif (config.geoloc_standard_method == 'rules') and \ (locality_comp.strip() != ''): inout.log_message(' Locality component: |'+locality_comp+'|','v1') ### TO BE FINALISED inout.log_message('Rules based standardisation for locality is' + \ 'not implemented yet','err') raise Exception() # locality_comp = locality.clean_geoloc_component(locality_comp) # [loc_words, loc_tags] = locality.tag_geoloc_component(locality_comp) # # [terr,loc_words2,loc_tags2] = locality.get_territory(loc_words,loc_tags) 
# if (terr != ''): # output_fields['territory'] = terr # # [pc,loc_words3,loc_tags3] = locality.get_postcode(loc_words2,loc_tags2) # if (pc != ''): # output_fields['postcode'] = pc # # [loc_name, loc_quali, loc_words4, loc_tags4] = \ # locality.get_localityname_qualifier(loc_words3, loc_tags3) # if (loc_name != ''): # output_fields['locality_name'] = loc_name # if (loc_quali != ''): # output_fields['locality_quali'] = loc_quali # # if (loc_words4 != []): # Not all words are standardised yet # print ' # Remaining word list:', loc_words4 ###### TEST # print ' # Remaining tag list: ', loc_tags4 ###### TEST # Standardise date strings - - - - - - - - - - - - - - - - - - - - - - - - # if (date1_comp != ''): inout.log_message(' Date1 component: |'+date1_comp+'|','v1') [day1,month1,year1,status1] = date.parse_datestr(date1_comp) if (day1 != -1): output_fields['day1'] = str(day1) if (month1 != -1): output_fields['month1'] = str(month1) if (year1 != -1): output_fields['year1'] = str(year1) if (date2_comp != ''): inout.log_message(' Date2 component: |'+date2_comp+'|','v1') [day2,month2,year2,status2] = date.parse_datestr(date2_comp) if (day2 != -1): output_fields['day2'] = str(day2) if (month2 != -1): output_fields['month2'] = str(month2) if (year2 != -1): output_fields['year2'] = str(year2) # Create log message of output fields - - - - - - - - - - - - - - - - - - - # msg = [' Standardised record output fields:'] for (field,value) in output_fields.items(): if (value != '') and (value != []): msg.append(' '+field+':'+str(value)) inout.log_message(msg,'v1') # Save standardised record into output field # out_line = inout.compose_line(output_fields) f_out.write(out_line+os.linesep) # Increment line counter and go to beginning of loop - - - - - - - - - - - # line_read += 1 inout.log_message('','v1') # Print empty lines between records # Close files - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # f_in.close() f_out.close() msg = ['','Number of warnings: 
'+str(config.num_warning), \ 'Number of corrected word spillings: '+str(config.num_word_spills)] inout.log_message(msg,'v1') print msg[1] print msg[2] inout.log_message('End.','v1')
[('title', 'TI'), ('givenname', 'SN'), ('surname', 'SN')], [('givenname', 'GM'), ('surname', 'SN')], [('title', 'TI'), ('givenname', 'GF'), ('surname', 'SN')], [('title', 'TI'), ('surname', 'SN'), ('givenname', 'GM')], [('surname', 'UN'), ('givenname', 'UN')], [('givenname', 'GF'), ('surname', 'GF'), ('surname', 'SN')]] # Some test examples (observation (tag) sequences), one per line test_data = [['TI', 'GM', 'SN'], ['UN', 'SN'], ['TI', 'UN', 'UN'], ['TI', 'GF', 'UN'], ['UN', 'UN', 'UN', 'UN'], ['TI', 'GM', 'UN', 'SN'], ['GF', 'UN']] # Initialise a new HMM and train it test_hmm = simplehmm.hmm('Test HMM', test_hmm_states, test_hmm_observ) test_hmm.train(train_data) # Train the HMM test_hmm.check_prob() # Check its probabilities test_hmm.print_hmm() # Print it out # Apply the Viterbi algorithm to each sequence of the test data for test_rec in test_data: [state_sequence, sequence_probability] = test_hmm.viterbi(test_rec) # Initialise and train a second HMM using the same training data and # applying Laplace smoothing test_hmm2 = simplehmm.hmm('Test HMM 2', test_states, test_observ) test_hmm2.train(train_data, smoothing='laplace')
import math import simplehmm lnc_hmm = simplehmm.hmm('LncRNA', ['dummy'], ['dummy']) lnc_hmm.load_hmm('hmm_treinamento_lnc_sem_sorter_threshold.hmm') lnc_hmm.print_hmm() # Print it out print math.log(10)
import simplehmm

# Load two previously saved HMMs, one after the other, into the same
# placeholder object and print each one.  load_hmm() overwrites the dummy
# state/observation lists with the ones from the model file.
hmm_teste = simplehmm.hmm("mrna", ['nada'], ['nada'])
for model_file in ('mRNAsTRAIN.hmm', 'randomseq.hmm'):
  hmm_teste.load_hmm(model_file)
  hmm_teste.print_hmm()
'''TREINAMENTO ''' #cirRNA_hmm= simplehmm.hmm('cirRNA_hmm_primeiro_teste',estados,emissoes) #cirRNA_hmm.train(entrada_treinamento, smoothing='absdiscount') #cirRNA_hmm.save_hmm("circ.hmm") #print cirRNA_hmm.print_hmm() '''VALIDACAO''' cirRNA_hmm=simplehmm.hmm('circ_rna primeiros testes',['dummy'], ['dummy']) cirRNA_hmm.load_hmm('circ.hmm') #print query[1] #print cirRNA_hmm.print_hmm() print len(query[0]),'size' for query_out in query: print query_out print cirRNA_hmm.viterbi(query[0])[1] print "------------------------------------------"
                     'locql','pc','ter1','ter2','cntr1','cntr2','rubb']

# Observation (tag) names used by the geocode/locality HMM
geoloc_hmm_obser = ['PC','N4','NU','AN','TR','CR','LN','ST','IN','IT', \
                    'LQ','WT','WN','UT','HY','SL','CO','VB','PA','UN', \
                    'RU']

# =============================================================================
# Dictionary of month name abbreviations (used in date.str2date() routine)

month_abbrev_dict = {'jan':1, 'feb':2, 'mar':3, 'apr':4, 'may':5, 'jun':6, \
                     'jul':7, 'aug':8, 'sep':9, 'oct':10, 'nov':11, 'dec': 12}

# =============================================================================
# If Hidden Markov Model standardisation methods are activated load HMM(s).
# The HMMs are created with empty state/observation lists, which load_hmm()
# replaces with the lists stored in the model file.

if (project.name_standard_method == 'hmm'):
  name_hmm = simplehmm.hmm([],[])  # Create new empty HMM object
  name_hmm.load_hmm(project.name_hmm_file_name)

if (project.geoloc_standard_method == 'hmm'):
  geoloc_hmm = simplehmm.hmm([],[])  # Create new empty HMM object
  geoloc_hmm.load_hmm(project.geoloc_hmm_file_name)

# =============================================================================
# List of all supported data file types
#
# File type names must have a length of 3 characters, or 4 characters if the
# file type is quoted (in which case the last character must be a 'Q')
#
# Currently supported file types are:
#   CSV  - Comma separated values, fields separated by commas
#   CSVQ - Comma separated values, where each field starts and ends with
def trainhmm(): """Main routine, open file, read lines, train HMM and save it to file. USAGE: trainhmm() ARGUMENTS: None DESCRIPTION: Main routine, see description of module above. """ # Process command line arguments and check for correctness - - - - - - - - - # if (len(config.options) < 3): print '***** Error: %s needs at least four arguments:' % (sys.argv[0]) print '***** - Name of the project module' print '***** - Tagging mode: "name" or "locality"' print '***** - Input training file name' print '***** - HMM output file name' print '***** plus options' raise Exception() if (config.options[1] == config.options[2]): print '*** Error: Input and output files must differ' print '*** Input training file name:', config.options[1] print '*** HMM output file name: ', config.options[1] raise Exception() in_file_name = config.options[1] hmm_file_name = config.options[2] # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - - # tag_mode = config.options[0] if (tag_mode in ['name', 'na', 'n']): tag_mode = 'name' elif (tag_mode in ['locality', 'lolty', 'loc', 'l']): tag_mode = 'loc' else: print '***** Error: Illegal tagging mode:', tag_mode print '***** Must be either "name" or "locality"' raise Exception() # Check for optional arguments and process if any - - - - - - - - - - - - - - # config.verbose = 0 # Default: No verbose output config.logging = 0 # Default: No logging into a file smoothing = None # Default: No smoothing config.nowarn = 0 # Deactivate no warning flag (print/log warning # messages) if (len(config.options) > 3): options = config.options[3:] while (options != []): # Do a loop processing all options if (options[0] == '-nowarn'): config.nowarn = 1 # Activate no warning flag options = options[1:] # Remove processed '-nowarn' option elif (options[0] == '-v1'): config.verbose = 1 # Set to verbose output level 1 options = options[1:] # Remove processed '-v1' option elif (options[0] == '-v2'): config.verbose = 2 # Set to verbose output level 
2 options = options[1:] # Remove processed '-v2' option elif (options[0] == '-l'): config.logging = 1 if (len(options) > 1): if (options[1][0] != '-'): # Not another option, must be a file name config.log_file = options[1] # Get name of log file options = options[1:] # Remove file_name options = options[1:] # Remove processed -'l' option only try: f_log = open(config.log_file, 'a') # Test if file is appendable except: print '***** Error ********************', print '***** Cannot write to log file: ' + config.log_file raise IOError() # Write (append) header to log file # f_log.write(os.linesep) f_log.write( '##################################################') f_log.write("############" + os.linesep) f_log.write("#" + os.linesep) f_log.write( "# 'pyTrainHMM.py - Version 0.1' process started at: ") f_log.write(time.ctime(time.time()) + os.linesep) f_log.write("#" + os.linesep) f_log.write("# Input file name: " + in_file_name + os.linesep) f_log.write("# HMM file name: " + hmm_file_name + os.linesep) f_log.write(os.linesep) f_log.close() elif (options[0] == '-s'): smoothing = 1 # Set to do a HMM smoothing smoothing = options[1] if (smoothing in ['l', 'la', 'lap', 'laplac', 'laplace']): smoothing = 'laplace' elif (smoothing in ['a','ad','abs','absd','absdis','absdisc',\ 'absdiscount']): smoothing = 'absdiscount' else: # Illegal value print "*** Error: Illegal value for 'smoothing' argument:", smoothing print "*** Possible are: 'laplace' or 'absdiscount'" raise Exception() options = options[2:] # Remove processed option else: print '*** Error: Illegal option:', options[0] raise Exception() # Get HMM states and observations from configuration module - - - - - - - - - # if (tag_mode == 'name'): state_list = config.name_hmm_states obser_list = config.name_hmm_obser else: state_list = config.geoloc_hmm_states obser_list = config.geoloc_hmm_obser # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # try: f_in = open(in_file_name, 'r') except: 
inout.log_message('Cannot open input file: ' + in_file_name, 'err') raise IOError() line_count = 0 # Counter for lines read rec_count = 0 # Counter for training records read # Read lines, discard comment lines and process training data lines - - - - - # training_data = [] # List of training records train_list = [ ] # List of training sequences (dictionaries), extracted from # training data for line in xreadlines.xreadlines(f_in): if (line[0] != '#') and (line.strip() != ''): # Line must contain a training record line = line.strip() # Remove line separators config.curr_line = line # Make a copy of the unprocessed current line line_list = line.split(',') # Split into a list of elements line_data = [] # Training data list for one training record inout.log_message(['Record number: ' + str(rec_count)], 'v1') config.curr_line_no = line_count # Store current line number for elem in line_list: [k, v] = elem.split(':') # Split into key and value tag = k.strip() state = v.strip() line_data.append((state, tag)) if (state not in state_list): msg = ['Illegal state name in training record: '+state, \ 'Line: '+str(line_count)+', record: '+str(rec_count), \ 'Possible values: '+str(state_list)] inout.log_message(msg, 'err') raise Exception() if (tag not in obser_list): msg = ['Illegal observation (tag) name in training record: '+tag, \ 'Line: '+str(line_count)+', record: '+str(rec_count), \ 'Possible values: '+str(obser_list)] inout.log_message(msg, 'err') raise Exception() inout.log_message(' Training record '+str(rec_count)+':'+ \ str(line_data),'v1') train_list.append(line_data) rec_count += 1 inout.log_message('', 'v1') # Print empty lines between records line_count += 1 # Close input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # f_in.close() inout.log_message('', 'v1') # Print empty lines between records # Initalise HMM and train it with training data - - - - - - - - - - - - - - - # myhmm = simplehmm.hmm(state_list, obser_list) myhmm.train(train_list, 
smoothing) myhmm.print_hmm() # Save trained HMM - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # myhmm.save_hmm(hmm_file_name) inout.log_message(['Read '+str(line_count)+' lines, processed '+ \ str(rec_count)+' training records', 'End.'],'v1')
def tagdata():
  """Main routine, open file, read lines, tag data records, write to out-file.

  USAGE: tagdata()

  ARGUMENTS: None

  DESCRIPTION: Main routine, see description of module above.
  """

  # NOTE(review): original indentation was lost in this file; the structure
  # below was reconstructed from the collapsed source — confirm against the
  # original Febrl pyTagData.py before relying on nesting of the
  # '-retag'/'# Was:' branches.

  # Process command line arguments and check for correctness - - - - - - - - -
  #
  if (len(config.options) < 5):
    print '***** Error: %s needs at least six arguments:' % (sys.argv[0])
    print '***** - Name of the project module'
    print '***** - Tagging mode: "name" or "locality"'
    print '***** - Output training file name'
    print '***** - Start of block with training records'
    print '***** - End of block with training records'
    print '***** - Number of training records'
    print '***** plus options'
    raise Exception()

  if (config.in_file_name == config.options[2]):
    print '***** Error: Input and output files must differ'
    print '***** Input file name: ', config.in_file_name
    print '***** Output training file name:', config.options[2]
    raise Exception()

  first_rec = int(config.options[2])
  last_rec = int(config.options[3])
  num_rec = int(config.options[4])
  in_file_name = config.in_file_name
  out_file_name = config.options[1]

  # Check record number values - - - - - - - - - - - - - - - - - - - - - - - -
  #
  if (int(first_rec) >= int(last_rec)) or \
     ((int(num_rec)-1) > (int(last_rec)-int(first_rec))):
    print '***** Error: Illegal values for training records block:'
    print '***** - Start of block with training records:', first_rec
    print '***** - End of block with training records: ', last_rec
    print '***** - Number of training records: ', num_rec
    raise Exception()

  rec_range = last_rec - first_rec - 1  # Range of records in input file

  # Open input file and check number of available records - - - - - - - - - - -
  #
  try:
    f_in = open(in_file_name, 'r')
  except:
    inout.log_message('Cannot open input file: ' + in_file_name, 'err')
    raise IOError()

  line_count = 0
  for line in f_in.xreadlines():
    line_count += 1
  f_in.close()

  if (last_rec > line_count):  # Illegal value for last record
    print '***** Error: Illegal values for last training records:', last_rec
    print '***** File only contains', line_count, 'lines/records'
    raise Exception()

  # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - -
  #
  tag_mode = config.options[0]
  if (tag_mode in ['name', 'na', 'n']):
    tag_mode = 'name'
  elif (tag_mode in ['locality', 'localty', 'loc', 'l']):
    tag_mode = 'loc'
  else:
    print '***** Error: Illegal tagging mode:', tag_mode
    print '***** Must be either "name" or "locality"'
    raise Exception()

  # Check for optional arguments and process if any - - - - - - - - - - - - - -
  #
  config.verbose = 0      # Default: No verbose output
  config.logging = 0      # Default: No logging into a file
  hmm_file_name = None    # Default: Do not use HMM to standardise training
                          # records
  retag_file_name = None  # Default: Do not retag an existing training file
  config.nowarn = 0       # Deactivate no warning flag (print/log warning
                          # messages)
  freqs_file_name = None  # Default: Do not write frequencies, no -freqs option

  if (len(config.options) > 5):
    options = config.options[5:]
    while (options != []):  # Do a loop processing all options
      if (options[0] == '-nowarn'):
        config.nowarn = 1       # Activate no warning flag
        options = options[1:]   # Remove processed '-nowarn' option
      elif (options[0] == '-v1'):
        config.verbose = 1      # Set to verbose output level 1
        options = options[1:]   # Remove processed '-v1' option
      elif (options[0] == '-v2'):
        config.verbose = 2      # Set to verbose output level 2
        options = options[1:]   # Remove processed '-v2' option
      elif (options[0] == '-l'):
        config.logging = 1
        if (len(options) > 1):
          if (options[1][0] != '-'):  # Not another option, must be a file name
            config.log_file = options[1]  # Get name of log file
            options = options[1:]         # Remove file_name
        options = options[1:]  # Remove processed -'l' option only
        try:
          f_log = open(config.log_file, 'a')  # Test if file is appendable
        except:
          print '***** Error ********************',
          print '***** Cannot write to log file: ' + config.log_file
          raise IOError()

        # Write (append) header to log file
        #
        f_log.write(os.linesep)
        f_log.write('##################################################')
        f_log.write('############' + os.linesep)
        f_log.write('#' + os.linesep)
        f_log.write("# 'pyTagData.py - Version 0.1' process started at: ")
        f_log.write(time.ctime(time.time()) + os.linesep)
        f_log.write('#' + os.linesep)
        f_log.write("# Input file name: " + in_file_name + os.linesep)
        f_log.write("# Output file name: " + out_file_name + os.linesep)
        f_log.write("# Tagging mode: " + tag_mode + os.linesep)
        f_log.write(os.linesep)
        f_log.close()

      elif (options[0] == '-hmm'):
        hmm_file_name = options[1]  # Get file name of the HMM to use
        if (hmm_file_name == out_file_name):
          print '***** Error: HMM file name is the same as output file name!'
          raise Exception()
        try:
          f_in = open(hmm_file_name, 'r')  # Test if file is available
        except:
          print '***** Error: Cannot open HMM file specified in "-hmm"',
          print 'option:', hmm_file_name
          raise IOError()
        f_in.close()
        options = options[2:]  # Remove processed '-hmm' option and file name

      elif (options[0] == '-retag'):
        if (hmm_file_name == None) and ('-hmm' not in options):
          print '***** Error: "-retag" option can only be used together with',
          print '"-hmm" option (which is not given).'
          raise Exception()
        retag_file_name = options[1]  # Get file name of the already-tagged
                                      # file to re-process
        if (retag_file_name == out_file_name):
          print '***** Error: Retag file name is the same as output file name!'
          raise Exception()
        elif (retag_file_name == in_file_name):
          print '***** Error: Retag file name is the same as input file name!'
          raise Exception()
        elif (retag_file_name == hmm_file_name):
          print '***** Error: Retag file name is the same as HMM file name!'
          raise Exception()
        try:
          f_in = open(retag_file_name, 'r')  # Test if file is available

          # Now gather record numbers and previous tags/states, as well as the
          # original header information. Use a simple state machine to do this.
          #
          tagged_recs = {}
          cleaned_recs = {}
          original_header_lines = []
          state = -1  # Header lines state
          prevline = ''
          for line in f_in.xreadlines():  # Read training file and process it
            line = line.strip()
            if (state == -1) and (len(line) == 0):  # End of header lines
              state = 0
              prevline = line
              continue
            if (state == -1) and (len(line) > 0) and (line[0] == "#"):
              original_header_lines.append("# " + line)
              prevline = line
              continue
            sline = line.split(' ')
            if (len(sline) > 2) and (len(sline[2]) > 3) and (sline[0] == '#') \
               and (sline[2][0] == '(') and (sline[2][-2:] == '):'):
              try:
                rec = int(sline[1])  # Original record number
                tagged_recs[rec] = None
                cleaned_recs[rec] = None
                state = 1
              except:
                pass
              prevline = line
              continue
            if (state == 1) and (len(line) > 0) and (line[0] != '#'):
              tagged_recs[rec] = line
              cleaned_recs[rec] = prevline
              state = 0
              prevline = line
              continue
            if (state == 1) and (len(line) > 0):
              prevline = line
              continue
          f_in.close()
          tagged_recs_keys = tagged_recs.keys()
          num_rec = len(tagged_recs_keys)  # Override specified numbers
          first_rec = 0
          last_rec = line_count
        except:
          print '***** Error: Cannot open tagged training file specified',
          print 'in "-retag" option:', retag_file_name
          raise IOError()
        options = options[2:]  # Remove processed '-retag' option and file name

      elif (options[0][:5] == '-freq'):
        if (hmm_file_name == None) and ('-hmm' not in options):
          print '***** Error: "-feqs" option can only be used together with',
          print '"-hmm" option (which is not given).'
          raise Exception()
        freqs_file_name = options[1]  # File name to write the frequencies to
        if (freqs_file_name == out_file_name):
          print '***** Error: Frequency file name is the same as output',
          print 'file name!'
          raise Exception()
        elif (freqs_file_name == in_file_name):
          print '***** Error: Frequency file name is the same as input',
          print 'file name!'
          raise Exception()
        elif (freqs_file_name == hmm_file_name):
          print '***** Error: Frequency file name is the same as HMM',
          print 'file name!'
          raise Exception()
        options = options[2:]  # Remove processed '-freqs' option and file name
        try:  # Check if file writing is possible
          freqs_out = open(freqs_file_name, 'w')
          freqs_out.close()
        except:
          print '***** Error: Cannot write to frequency output file specified',
          print 'in "-freqs" option:', freqs_file_name
          raise IOError()

      else:
        print '***** Error: Illegal option:', options[0]
        raise Exception()

  # If specified initalise and load Hidden Markov Model (HMM) - - - - - - - - -
  #
  if (hmm_file_name != None):
    myhmm = simplehmm.hmm([], [])  # Create new empty HMM object
    myhmm.load_hmm(hmm_file_name)
    myhmm.print_hmm()  # Print HMM (according to verbose and logging level)

  # Open output file and write header - - - - - - - - - - - - - - - - - - - - -
  #
  try:
    f_out = open(out_file_name, 'w')
  except:
    inout.log_message('Cannot open output file: ' + out_file_name, 'err')
    raise IOError()

  f_out.write("# Tagged training data written by 'pyTagData.py -"+ \
              " Version 0.1'"+os.linesep)
  f_out.write('#' + os.linesep)
  f_out.write('# Created ' + time.ctime(time.time()) + os.linesep)
  f_out.write('#' + os.linesep)
  f_out.write('# Input file name: ' + in_file_name + os.linesep)
  f_out.write('# Output file name: ' + out_file_name + os.linesep)
  f_out.write('#' + os.linesep)
  f_out.write('# Parameters:' + os.linesep)
  f_out.write('# - Start of block with training records: '+str(first_rec)+ \
              os.linesep)
  f_out.write('# - End of block with training records: '+str(last_rec)+ \
              os.linesep)
  f_out.write('# - Number of training records: '+str(num_rec)+ \
              os.linesep)
  if (hmm_file_name != None):
    f_out.write('#' + os.linesep)
    f_out.write("# - Using HMM file '"+hmm_file_name+"' for standardisation"+ \
                os.linesep)
  if (retag_file_name != None):
    f_out.write('#' + os.linesep)
    f_out.write("# - Reprocessing training file '"+retag_file_name+"'"+ \
                os.linesep)
    f_out.write("# Header lines from original training file follow:" + \
                os.linesep)
    for header_line in original_header_lines:
      f_out.write(header_line + os.linesep)
  if (freqs_file_name != None):
    f_out.write('#' + os.linesep)
    f_out.write("# - Tag/state pattern frequencies written to file '" + \
                freqs_file_name + os.linesep)
  f_out.write('#' + '-' * 70 + os.linesep)
  f_out.write(os.linesep)

  rec_count = 0           # Number of selected records
  num_rec_left = num_rec  # Number of records to be selected left
  rec_selected = {}       # Dictionary of all record numbers that were selected
  seq_freqs = {}          # Dict to hold examples of tag/state patterns
  unchanged_loop_cnt = 0  # Counter of how many loops have been done
                          # without new records being selected
  prev_num_rec_left = num_rec  # Number of records left in the previous
                               # interation

  # Due to the random nature of selecting records, and because sometimes - - -
  # a selected component can be empty (and is thus not used for training)
  # more than one iteration over the input data set is carried out. In each
  # iteration, records are selected randomly.
  #
  while (rec_count < num_rec):  # Loop until 'num_rec' records selected

    # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    try:
      f_in = open(in_file_name, 'r')
    except:
      inout.log_message('Cannot open input file: ' + in_file_name, 'err')
      raise IOError()

    line_read = 0  # Number of read lines

    # Skip to start of training block - - - - - - - - - - - - - - - - - - - - -
    #
    if (first_rec > 0):
      for i in range(first_rec):
        f_in.readline()

    while (rec_count < num_rec) and (line_read <= (last_rec - first_rec)):
      line = f_in.readline()
      if ((retag_file_name != None) and (line_read in tagged_recs_keys)) or \
         ((retag_file_name == None) and \
          (num_rec_left >= random.randrange(0,rec_range,1))):
        line = line.strip()      # Remove line separators
        config.curr_line = line  # Make a copy of the unprocessed current line
        line = line.lower()      # Make all characters lower case
        inout.log_message(['Record number: ' + str(line_read + first_rec)], 'v1')
        config.curr_line_no = line_read + first_rec  # Store current line number

        # Process line and extract content into components (name, geocode, etc)
        #
        [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \
          inout.process_line(line)

        # Select component and process it - - - - - - - - - - - - - - - - - - -
        #
        if (tag_mode == 'name'):
          if (type(name_comp) == types.ListType):
            component = name_comp[0].strip() + ' ' + name_comp[1].strip()
          else:
            component = name_comp.strip()
        else:  # Locality component
          component = geocode_comp.strip() + ' ' + locality_comp.strip()

        if (component != '') and \
           (not rec_selected.has_key((line_read+first_rec))):
          if (tag_mode == 'name'):
            inout.log_message(' Name component: |' + component + '|', 'v1')
            component = name.clean_name_component(component)
            [word_list, tag_list] = name.tag_name_component(component)
          else:  # Locality component
            inout.log_message(' Locality component: |' + component + '|', 'v1')
            component = locality.clean_geoloc_component(component)
            [word_list, tag_list] = locality.tag_geoloc_component(component)

          if (tag_list != []):  # Only process non-empty tag lists

            # Append record number into dictionary of processed records
            #
            rec_selected.update({(line_read + first_rec): \
                                 (line_read + first_rec)})

            # Create all permutation sequences of this tag list - - - - - - - -
            #
            tag_seq = mymath.perm_tag_sequence(tag_list)
            inout.log_message([' Word list: '+str(word_list), \
                               ' Tag list: '+str(tag_list), \
                               ' Tag sequences:'],'v2')

            # Do HMM processing - - - - - - - - - - - - - - - - - - - - - - - -
            #
            if (hmm_file_name != None):
              state_seq = []   # List containing computed HMM state sequences
              max_prob = -1.0  # maximal probability for a sequence
              max_seq_no = -1  # Number of the seq. with the max. probability

              # Now give tag sequences to the HMMs to compute state sequences
              #
              i = 0
              for t in tag_seq:
                [obs_seq, prob] = myhmm.viterbi(t)
                state_seq.append(obs_seq)
                if (prob > max_prob):
                  max_prob = prob
                  max_seq_no = i
                i += 1

            # Write original component and resulting tag sequences to output
            #
            f_out.write('# '+str(line_read+first_rec)+' ('+str(rec_count)+ \
                        '): |'+component+'|'+os.linesep)  # Commented original
            num_len = len(str(line_read + first_rec)) + len(str(rec_count)) + 6
            f_out.write('#' + num_len * ' ' + '|' + ' '.join(word_list) + \
                        '|' + os.linesep)

            for i in range(len(tag_seq)):
              # Convert each tag sequence into a string for file output
              #
              seq_string = ' '
              if (hmm_file_name != None) and (i != max_seq_no):
                seq_string = '# '  # Comment sequences with not max. probability
              for j in range(len(tag_seq[i])):
                if (hmm_file_name != None):
                  seq_string = seq_string+' '+tag_seq[i][j]+':'+ \
                               state_seq[i][j]+','
                else:
                  seq_string = seq_string + ' ' + tag_seq[i][j] + ':,'
              f_out.write(seq_string[:-1] + os.linesep)  # Write without , at end
              inout.log_message(' ' + seq_string[:-1], 'v2')

            if (hmm_file_name != None):
              f_out.write('# Maximum Viterbi probability: %0.5f'% \
                          (max_prob) + os.linesep)
              inout.log_message('Maximum Viterbi probability: %0.5f'% \
                                (max_prob), 'v2')

            if (retag_file_name != None) and (tagged_recs[line_read] != None):
              if (tagged_recs[line_read].strip() != seq_string[:-1].strip()):
                f_out.write("# Note: ***** Changed *****" + os.linesep)
                inout.log_message(' Note:' + \
                                  ' ***** Changed *****','v2')
                f_out.write('# Was: ' + tagged_recs[line_read] + \
                            os.linesep)  # Write commented original tag sequence
                inout.log_message('Original tag sequence: '+ \
                                  tagged_recs[line_read],'v2')

            f_out.write(os.linesep)  # Write an empty line
            inout.log_message('', 'v1')  # Print empty lines between records

            if (hmm_file_name != None):
              seq_key = seq_string[:-1]  # Add sequence to dictionary
              if (seq_freqs.has_key(seq_key)):
                seq_freqs[seq_key].append(['|'+' '.join(word_list)+'|', \
                                           max_prob])
              else:
                seq_freqs[seq_key] = [['|'+' '.join(word_list)+'|', \
                                       max_prob]]

            rec_count += 1

            # Print process indicator message
            #
            if (config.proc_ind >= 0) and (rec_count > 0):
              if (rec_count % config.proc_ind == 0):
                print 'Processed line', rec_count, 'of', num_rec

      line_read += 1

    f_in.close()

    num_rec_left = num_rec - rec_count

    if (prev_num_rec_left == num_rec_left):  # No new records selected
      unchanged_loop_cnt += 1
    prev_num_rec_left = num_rec_left  # Set to current value

    if (unchanged_loop_cnt > 5):  # Do five loops maximal without selecting
                                  # new records
      config.curr_line_no = -1  # Set to illegal/empty values, as warning is
      config.curr_line = ''     # not related to the current input line
      inout.log_message(['Can not select more than '+str(rec_count)+ \
                         ' records for training.', \
                         'This is probably due to empty input components.', \
                         'Please reduce value of "num_rec" or increase ' + \
                         'range','between "first_rec" and "last_rec".'],'warn')
      break

    if (num_rec_left < 10):  # Only 10 records left to select
      num_rec_left = num_rec + 1  # Set to more than 100% probablity
    elif (num_rec_left < (num_rec / 100.0)):  # Less than 1% records left
      num_rec_left = int(num_rec / 100.0)  # Set to 1%

  f_out.close()

  # If specified, save Viterbi frequencies to a file - - - - - - - - - - - - -
  #
  if (freqs_file_name != None):
    freqs_out = open(freqs_file_name, 'w')  # Open frequency file for writing
    freqs_out.write('# Frequency listing of tag/state patterns written by')
    freqs_out.write('"pyTagData.py - Version 0.1"' + os.linesep)
    freqs_out.write('#' + os.linesep)
    freqs_out.write('# Created ' + time.ctime(time.time()) + os.linesep)
    freqs_out.write('#' + os.linesep)
    freqs_out.write("# Input file name: " + in_file_name + os.linesep)
    freqs_out.write("# Output file name: " + out_file_name + os.linesep)
    freqs_out.write(os.linesep)
    freqs_out.write('# Parameters:' + os.linesep)
    freqs_out.write('# - Start of block with training records: '+ \
                    str(first_rec)+os.linesep)
    freqs_out.write('# - End of block with training records: '+ \
                    str(last_rec)+os.linesep)
    freqs_out.write('# - Number of training records: '+ \
                    str(num_rec)+os.linesep)
    if (hmm_file_name != None):
      freqs_out.write('#' + os.linesep)
      freqs_out.write("# - Using HMM file '"+hmm_file_name+ \
                      "' for standardisation"+os.linesep)
    if (retag_file_name != None):
      freqs_out.write('#' + os.linesep)
      freqs_out.write("# - Reprocessing training file '"+retag_file_name+ \
                      "'"+os.linesep)
    freqs_out.write('#' + '-' * 70 + os.linesep)
    freqs_out.write(os.linesep)

    sorted_seq_freqs = []  # Now sort sequences according to their fruequencies
    for key in seq_freqs.keys():
      sorted_seq_freqs.append((len(seq_freqs[key]), key))
    sorted_seq_freqs.sort()

    for skey in sorted_seq_freqs:
      key = skey[1]
      freqs_out.write('# Pattern: ' + str(key) + os.linesep)
      freqs_out.write('# Frequency: ' + str(skey[0]) + os.linesep)
      examples = seq_freqs[key]
      freqs_out.write('# Maximum Viterbi probability: '+ \
                      str(examples[0][1])+os.linesep)
      freqs_out.write('# Examples: ' + os.linesep)
      for example in examples:
        freqs_out.write('# ' + str(example[0]) + os.linesep)
      freqs_out.write(str(key) + os.linesep)
      freqs_out.write(os.linesep)
    freqs_out.close()

  inout.log_message(['Read '+str(line_read)+' lines, processed '+ \
                     str(rec_count)+' lines', 'End.'],'v1')
# Script fragment: train an HMM ('15 porcento lncRNA') on accumulated training
# sequences using absolute-discount smoothing, then print it.
# NOTE(review): original indentation was lost; some of these statements may
# have been inside a loop — confirm against the original script.
treinar.append(train)
print classificar_vetor
print(teste.get_dir())
#teste.set_pontas_porcentagem(15)
#print(teste.get_pontas())
#organizar
#print (organizar[0])
#print(sorted(organizar)) # this command must be done using the first object ZERO and not ONE
print (sorted(train))
test_hmm_states = ['1', '3', '2']
#test_hmm_observ=[]
test_hmm = simplehmm.hmm('15 porcento lncRNA', test_hmm_states, nomes)
#print test_hmm_observ
# The bare string below is an author note (Portuguese): "we have to put all
# the train files into the table with append via a FOR; the training table
# must come out organised and it is NOT".
''' temos que colocar todos os arquivos train dentro da tabela com o append por meio de um FOR A TABELA DE TREINAMENTO TEM QUE SAIR ORGANIZADA E NaO ESTa '''
#treinar.append(train)
print '---------------------', treinar, '---------------------------------'
#print test_hmm.check_prob()
test_hmm.train(treinar, smoothing='absdiscount')
#print test_hmm.check_prob()
print test_hmm.print_hmm()
if nome_saida =='':
  # Warn (Portuguese): "Give a name for the output files"
  print ('De um nome para os arquivos de saida')
treinamento.append(entrada_vetor) #print entrada_vetor contador += 1 if len(treinamento): entrada_treinamento.append(treinamento) '''TREINAMENTO ''' #cirRNA_hmm= simplehmm.hmm('cirRNA_hmm_primeiro_teste',estados,emissoes) #cirRNA_hmm.train(entrada_treinamento, smoothing='absdiscount') #cirRNA_hmm.save_hmm("circ.hmm") #print cirRNA_hmm.print_hmm() '''VALIDACAO''' cirRNA_hmm = simplehmm.hmm('circ_rna primeiros testes', ['dummy'], ['dummy']) cirRNA_hmm.load_hmm('circ.hmm') #print query[1] #print cirRNA_hmm.print_hmm() print len(query[0]), 'size' for query_out in query: print query_out print cirRNA_hmm.viterbi(query[0])[1] print "------------------------------------------" print cirRNA_hmm.viterbi(query[0])[0] #print cirRNA_hmm.viterbi(query[0])[0].index('BRANCH'),'<-BRANCH->',query[0][cirRNA_hmm.viterbi(query[0])[0].index('BRANCH')] for i in range(1, 10): shuffle(query[0])
def testHMM(self):  # - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    """Test basic HMM functionality.

    Builds an HMM from self.states / self.observ, verifies its dimensions
    and that all probabilities start at exactly 0.0, then trains it three
    times (unsmoothed, Laplace smoothing, absolute-discount smoothing),
    checking after each round that all probabilities lie in [0.0, 1.0] and
    that Viterbi decoding of self.test_data yields known states and a
    valid sequence probability.  Finally saves the HMM, loads it into a
    fresh HMM and compares the two parameter sets element-wise within
    self.delta.

    Fixes over the previous version:
      * the loaded-HMM comparison checked hmm1.B against itself
        (hmm1.B - hmm1.B), so observation probabilities of the loaded HMM
        were never verified — now compares hmm1.B with hmm2.B;
      * removed the dead statement 'hmm2 = hmm1' that was immediately
        overwritten;
      * fixed 'tate list' / 'differnt' typos in assertion messages.
    """
    hmm1 = simplehmm.hmm("Test HMM", self.states, self.observ)

    # Dimensions and index dictionaries must match the given lists
    assert hmm1.N == len(self.states), (
        "Illegal number of states in HMM (" + str(hmm1.N) +
        "), should be: " + str(len(self.states)))
    assert len(hmm1.S_ind) == len(self.states), (
        "Illegal number of states in HMM state dictionary (" +
        str(len(hmm1.S_ind)) + "), should be: " + str(len(self.states)))
    assert hmm1.M == len(self.observ), (
        "Illegal number of observations in HMM (" + str(hmm1.M) +
        "), should be: " + str(len(self.observ)))
    assert len(hmm1.O_ind) == len(self.observ), (
        "Illegal number of observations in HMM observation dictionary (" +
        str(len(hmm1.O_ind)) + "), should be: " + str(len(self.observ)))

    # Before training every probability must be exactly zero
    for i in range(hmm1.N):
        assert hmm1.pi[i] == 0.0, (
            "Initial probability in HMM 1 is not 0.0 at location [" +
            str(i) + "]: " + str(hmm1.pi[i]))
        for j in range(hmm1.N):
            assert hmm1.A[i][j] == 0.0, (
                "Transition probability in HMM 1 is not 0.0 at location [" +
                str(i) + "," + str(j) + "]: " + str(hmm1.A[i][j]))
        for j in range(hmm1.M):
            assert hmm1.B[i][j] == 0.0, (
                "Observation probability in HMM 1 is not 0.0 at location [" +
                str(i) + "," + str(j) + "]: " + str(hmm1.B[i][j]))

    def check_prob_ranges(hmm):
        # All trained probabilities must lie within [0.0, 1.0]
        for i in range(hmm.N):
            assert (hmm.pi[i] >= 0.0) and (hmm.pi[i] <= 1.0), (
                "Initial probability in HMM 1 is not between 0.0 and 1.0 at " +
                "location [" + str(i) + "]: " + str(hmm.pi[i]))
            for j in range(hmm.N):
                assert (hmm.A[i][j] >= 0.0) and (hmm.A[i][j] <= 1.0), (
                    "Transition probability in HMM 1 is not between 0.0 and 1.0" +
                    " at location [" + str(i) + "," + str(j) + "]: " +
                    str(hmm.A[i][j]))
            for j in range(hmm.M):
                assert (hmm.B[i][j] >= 0.0) and (hmm.B[i][j] <= 1.0), (
                    "Observation probability in HMM 1 is not between 0.0 and " +
                    "1.0 at location [" + str(i) + "," + str(j) + "]: " +
                    str(hmm.B[i][j]))

    def check_viterbi(hmm):
        # Viterbi must return known states and a valid sequence probability
        for test_rec in self.test_data:
            [state_seq, seq_prob] = hmm.viterbi(test_rec)
            for state in state_seq:
                assert state in self.states, (
                    'Returned state "' + state + '" not in state list')
            assert (seq_prob >= 0.0) and (seq_prob <= 1.0), (
                "Sequence probability is not between 0.0 and 1.0:" +
                str(seq_prob))

    # Train unsmoothed first, then with each supported smoothing method,
    # re-checking probabilities and Viterbi decoding after every round
    for smoothing in (None, "laplace", "absdiscount"):
        if smoothing is None:
            hmm1.train(self.train_data)
        else:
            hmm1.train(self.train_data, smoothing=smoothing)
        hmm1.check_prob()
        hmm1.print_hmm()
        check_prob_ranges(hmm1)
        check_viterbi(hmm1)

    # Save, reload into a fresh HMM, and compare the two parameter sets
    hmm1.save_hmm("testhmm.hmm")
    hmm2 = simplehmm.hmm("Test2 HMM", ["dummy"], ["dummy"])
    hmm2.load_hmm("testhmm.hmm")

    assert hmm1.N == hmm2.N, "Loaded HMM has different number of states"
    assert hmm1.M == hmm2.M, "Loaded HMM has different number of observations"
    for i in range(hmm1.N):
        assert abs(hmm1.pi[i] - hmm2.pi[i]) < self.delta, (
            "Initial probability in HMM 1 is different from HMM 2: " +
            str(hmm1.pi[i]) + " / " + str(hmm2.pi[i]))
        for j in range(hmm1.N):
            assert abs(hmm1.A[i][j] - hmm2.A[i][j]) < self.delta, (
                "Transition probability in HMM 1 is different from HMM 2 " +
                "at location [" + str(i) + "," + str(j) + "]: " +
                str(hmm1.A[i][j]) + " / " + str(hmm2.A[i][j]))
        for j in range(hmm1.M):
            # BUG FIX: previously compared hmm1.B with itself
            assert abs(hmm1.B[i][j] - hmm2.B[i][j]) < self.delta, (
                "Observation probability in HMM 1 is different from HMM 2 " +
                "at location [" + str(i) + "," + str(j) + "]: " +
                str(hmm1.B[i][j]) + " / " + str(hmm2.B[i][j]))
def standard(): """Main routine, open file, read lines, standardise them and write into file. USAGE: standard() ARGUMENTS: None DESCRIPTION: Main routine, see description of module above. """ # Process command line arguments and check for correctness - - - - - - - - - # if (len(config.options) < 2): print '***** Error: %s needs at least three arguments:' % (sys.argv[0]) print '***** - Name of the project module' print '***** - Number of the first record to be processed' print '***** - Number of records to be processed' print '***** plus options' raise Exception() first_rec = int(config.options[0]) num_rec = int(config.options[1]) in_file_name = config.in_file_name out_file_name = config.out_file_name # Check for optional arguments and process if any - - - - - - - - - - - - - - # config.verbose = 0 # Default: No verbose output config.logging = 0 # Default: No logging into a file write_header = 0 # Write header (output field names) to output file # (default: Don't) config.nowarn = 0 # Deactivate no warning flag (print/log warning messages) if (len(config.options) > 2): options = config.options[2:] while (options != []): # Do a loop processing all options if (options[0] == '-nowarn'): config.nowarn = 1 # Activate no warning flag options = options[1:] # Remove processed '-nowarn' option elif (options[0] == '-v1'): config.verbose = 1 # Set to verbose output level 1 options = options[1:] # Remove processed '-v1' option elif (options[0] == '-v2'): config.verbose = 2 # Set to verbose output level 2 options = options[1:] # Remove processed '-v2' option elif (options[0] == '-l'): config.logging = 1 if (len(options) > 1): if (options[1][0] != '-'): # Not another option, must be a file name config.log_file = options[1] # Get name of log file options = options[1:] # Remove file_name options = options[1:] # Remove processed -'l' option only try: f_log = open(config.log_file, 'a') # Test if file is appendable except: print '***** Error ********************', print '***** Cannot 
write to log file:', config.log_file raise IOError() # Write (append) header to log file # f_log.write(os.linesep) f_log.write( '##################################################') f_log.write("############" + os.linesep) f_log.write("#" + os.linesep) f_log.write( "# 'pyStandard.py - Version 0.1' process started at: ") f_log.write(time.ctime(time.time()) + os.linesep) f_log.write("#" + os.linesep) f_log.write("# Input file name: " + in_file_name + os.linesep) f_log.write("# Output file name: " + out_file_name + os.linesep) f_log.write(os.linesep) f_log.close() elif (options[0] == '-h'): write_header = 1 options = options[1:] # Remove processed -'h' option elif (options[0] == '-hmm-name'): hmm_name_file = options[ 1] # Get file name of the name HMM to use try: f_in = open(hmm_name_file, 'r') # Test if file is available except: print '***** Error ********************', print '***** Cannot open HMM file in "-hmm-name" option:', print hmm_name_file raise IOError() f_in.close() options = options[2:] # Remove processed option and file name config.name_standard_method = 'hmm' config.name_hmm_file_name = hmm_name_file config.name_hmm = simplehmm.hmm( [], []) # Create new empty HMM object config.name_hmm.load_hmm(config.name_hmm_file_name) elif (options[0] == '-hmm-loc'): hmm_loc_file = options[ 1] # Get file name of the locality HMM to use try: f_in = open(hmm_loc_file, 'r') # Test if file is available except: print '***** Error ********************', print '***** Cannot open HMM file in "-hmm-loc" option:', print hmm_loc_file raise IOError() f_in.close() options = options[2:] # Remove processed option and file name config.geoloc_standard_method == 'hmm' config.geoloc_hmm_file_name = hmm_loc_file config.geoloc_hmm = simplehmm.hmm([], []) # Create new HMM object config.geoloc_hmm.load_hmm(config.geoloc_hmm_file_name) else: print '***** Error: Illegal option:', options[0] raise Exception() # Open input file and check number of available records - - - - - - - - - - - # try: 
f_in = open(in_file_name, 'r') except: inout.log_message('Cannot open input file: ' + in_file_name, 'err') raise IOError() line_count = 0 for line in f_in.xreadlines(): line_count += 1 f_in.close() if ((first_rec + num_rec) > line_count): # Illegal value for last record print '***** Error: Illegal values for number of records to process:', print num__rec, ', with start record:', start_rec print '***** File only contains', line_count, 'lines/records' raise Exception() # Open files - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # try: f_in = open(in_file_name, 'r') except: inout.log_message('Cannot open input file: ' + in_file_name, 'err') raise IOError() try: f_out = open(out_file_name, 'w') except: inout.log_message('Cannot open output file: ' + out_file_name, 'err') raise IOError() # Write header (name of output fields) into output file - - - - - - - - - - - # if (write_header == 1): header_dict = {} for n in config.output_field_names: header_dict.update({n: n}) # Dictionary where values are field names header_line = inout.compose_line(header_dict, header=1) f_out.write(header_line + os.linesep) # Skip over records - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # if (first_rec > 0): for i in range(first_rec): f_in.readline() # Read lines, process them and write into output files - - - - - - - - - - - # line_read = 0 # Number of read lines while (line_read < num_rec): # Loop until 'num_rec' records processed line = f_in.readline() # Print process indicator message # if (config.proc_ind >= 0) and (line_read > 0): # Only print if activated if (line_read % config.proc_ind == 0): print 'Processed line', line_read, 'of', num_rec line = line.strip() # Remove line separators config.curr_line = line # Make a copy of the unprocessed current line line = line.lower() # Make all characters lower case inout.log_message(['Record ' + str(line_read + first_rec)], 'v1') config.curr_line_no = line_read + first_rec # Store current line number # Process 
line and extract content into components (name, geocode, etc.) # [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \ inout.process_line(line) # Make a local empty working copy of the output field dictionary - - - - - # output_fields = config.output_field.copy() output_fields_keys = output_fields.keys() for k in output_fields_keys: output_fields[k] = '' # Set all fields to an empty string # Standardise name component - - - - - - - - - - - - - - - - - - - - - - - # if (type(name_comp) == types.ListType ): # Givenname and surname separate givenname_comp = name_comp[0].strip() surname_comp = name_comp[1].strip() if (givenname_comp != ''): # There is a givenname - - - - - - - - - - - inout.log_message( ' Givenname component: |' + givenname_comp + '|', 'v1') givenname_comp = name.clean_name_component(givenname_comp) [name_list, tag_list] = name.tag_name_component(givenname_comp) output_fields['gender_guess'] = name.get_gender_guess(name_list, \ tag_list) [name_list, tag_list, output_fields['title']] = \ name.get_title(name_list, tag_list) [output_fields['givenname'], output_fields['alt_givenname']] = \ name.get_name_component(name_list, tag_list, 'gname') if (surname_comp != ''): # There is a surname - - - - - - - - - - - - - inout.log_message( ' Surname component: |' + surname_comp + '|', 'v1') surname_comp = name.clean_name_component(surname_comp) [name_list, tag_list] = name.tag_name_component(surname_comp) [output_fields['surname'], output_fields['alt_surname']] = \ name.get_name_component(name_list, tag_list, 'sname') elif (name_comp.strip() != ''): # Given- and surname both in one field - - inout.log_message(' Name component: |' + name_comp + '|', 'v1') name_comp = name.clean_name_component(name_comp) [name_list, tag_list] = name.tag_name_component(name_comp) output_fields['gender_guess'] = name.get_gender_guess( name_list, tag_list) [name_list, tag_list, output_fields['title']] = \ name.get_title(name_list, tag_list) if 
(config.name_standard_method == 'rules'): name_dict = name.get_names_rules(name_list, tag_list, 'gname') elif (config.name_standard_method == 'hmm'): name_dict = name.get_names_hmm(name_list, tag_list) else: inout.log_message('Illegal name standardisation method:'+ \ config.name_standard_method,'err') raise Exception() for (field, value) in name_dict.items(): # Assign to output dictionary output_fields[field] = value # Standardise geocode and locality components using HMM - - - - - - - - - - # if (config.geoloc_standard_method == 'hmm') and \ ((geocode_comp.strip() != '') or (locality_comp.strip() != '')): geoloc_comp = geocode_comp.strip() + ' ' + locality_comp.strip() inout.log_message(' Geocode and locality component: |'+geoloc_comp+'|',\ 'v1') geoloc_comp = locality.clean_geoloc_component(geoloc_comp) [geoloc_words, geoloc_tags] = locality.tag_geoloc_component(geoloc_comp) if (geoloc_words != []): # Component not empty, do HMM standardisation geoloc_dict = locality.get_geoloc_hmm(geoloc_words, geoloc_tags) for (field, value ) in geoloc_dict.items(): # Assign to output dictionary output_fields[field] = value # Standardise geocode component using rules - - - - - - - - - - - - - - - - # elif (config.geoloc_standard_method == 'rules') and \ (geocode_comp.strip() != ''): inout.log_message(' Geocode component: |' + geocode_comp + '|', 'v1') ### TO BE DONE inout.log_message('Rules based standardisation for geocode is' + \ 'not implemented yet','err') raise Exception() # Standardise locality component using rules - - - - - - - - - - - - - - - # elif (config.geoloc_standard_method == 'rules') and \ (locality_comp.strip() != ''): inout.log_message(' Locality component: |' + locality_comp + '|', 'v1') ### TO BE FINALISED inout.log_message('Rules based standardisation for locality is' + \ 'not implemented yet','err') raise Exception() # locality_comp = locality.clean_geoloc_component(locality_comp) # [loc_words, loc_tags] = locality.tag_geoloc_component(locality_comp) # # 
[terr,loc_words2,loc_tags2] = locality.get_territory(loc_words,loc_tags) # if (terr != ''): # output_fields['territory'] = terr # # [pc,loc_words3,loc_tags3] = locality.get_postcode(loc_words2,loc_tags2) # if (pc != ''): # output_fields['postcode'] = pc # # [loc_name, loc_quali, loc_words4, loc_tags4] = \ # locality.get_localityname_qualifier(loc_words3, loc_tags3) # if (loc_name != ''): # output_fields['locality_name'] = loc_name # if (loc_quali != ''): # output_fields['locality_quali'] = loc_quali # # if (loc_words4 != []): # Not all words are standardised yet # print ' # Remaining word list:', loc_words4 ###### TEST # print ' # Remaining tag list: ', loc_tags4 ###### TEST # Standardise date strings - - - - - - - - - - - - - - - - - - - - - - - - # if (date1_comp != ''): inout.log_message(' Date1 component: |' + date1_comp + '|', 'v1') [day1, month1, year1, status1] = date.parse_datestr(date1_comp) if (day1 != -1): output_fields['day1'] = str(day1) if (month1 != -1): output_fields['month1'] = str(month1) if (year1 != -1): output_fields['year1'] = str(year1) if (date2_comp != ''): inout.log_message(' Date2 component: |' + date2_comp + '|', 'v1') [day2, month2, year2, status2] = date.parse_datestr(date2_comp) if (day2 != -1): output_fields['day2'] = str(day2) if (month2 != -1): output_fields['month2'] = str(month2) if (year2 != -1): output_fields['year2'] = str(year2) # Create log message of output fields - - - - - - - - - - - - - - - - - - - # msg = [' Standardised record output fields:'] for (field, value) in output_fields.items(): if (value != '') and (value != []): msg.append(' ' + field + ':' + str(value)) inout.log_message(msg, 'v1') # Save standardised record into output field # out_line = inout.compose_line(output_fields) f_out.write(out_line + os.linesep) # Increment line counter and go to beginning of loop - - - - - - - - - - - # line_read += 1 inout.log_message('', 'v1') # Print empty lines between records # Close files - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - # f_in.close() f_out.close() msg = ['','Number of warnings: '+str(config.num_warning), \ 'Number of corrected word spillings: '+str(config.num_word_spills)] inout.log_message(msg, 'v1') print msg[1] print msg[2] inout.log_message('End.', 'v1')