Example #1
def main():
    # Initialise a new HMM and train it

    test_hmm = simplehmm.hmm('Test HMM', test_hmm_states, test_hmm_observ)
    test_hmm.train(train_data)  # Train the HMM

    test_hmm.check_prob()  # Check its probabilities
    test_hmm.print_hmm()  # Print it out

    # Apply the Viterbi algorithm to each sequence of the test data

    for test_rec in test_data:
        [state_sequence, sequence_probability] = test_hmm.viterbi(test_rec)

    # Initialise and train a second HMM using the same training data and
    # applying Laplace smoothing

    test_hmm2 = simplehmm.hmm('Test HMM 2', test_hmm_states, test_hmm_observ)
    test_hmm2.train(train_data, smoothing='laplace')

    # Save the second HMM into a text file

    test_hmm2.save_hmm('testhmm2.hmm')

    # Initialise a third HMM, then load the previously saved HMM into it

    test_hmm3 = simplehmm.hmm('Test HMM 3', ['dummy'], ['dummy'])
    test_hmm3.load_hmm('testhmm2.hmm')
    test_hmm3.print_hmm()  # Print it out
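
The snippet above assumes that test_hmm_states, test_hmm_observ, train_data and test_data are defined elsewhere in the script. A minimal sketch of what those structures could look like, assuming (as the other examples on this page suggest) that training records are lists of (state, observation) pairs and that test records are plain observation sequences; all names and values below are purely illustrative:

# Hypothetical data for the snippet above (illustrative values only)
test_hmm_states = ['rainy', 'sunny']           # hidden states
test_hmm_observ = ['walk', 'shop', 'clean']    # observation symbols

# Training records: one list of (state, observation) pairs per sequence
train_data = [[('rainy', 'clean'), ('rainy', 'shop'), ('sunny', 'walk')],
              [('sunny', 'walk'), ('sunny', 'shop'), ('rainy', 'clean')]]

# Test records: plain observation sequences handed to viterbi()
test_data = [['walk', 'shop', 'clean'],
             ['clean', 'clean', 'walk']]
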
Example #2
def tracker(data, rid, trange):

    train_data = []
    i = 0
    while i < len(data):
        pre = []
        for j in range(trange):
            i = i + j
            if i >= len(data):
                break
            else:
                pre.append((data[i][rid], data[i][2]))

        train_data.append(pre)

    states = ['1', '2', '3', '4', '5']

    observ = ["".join(seq) for seq in itertools.product("01", repeat=5)]

    hmm1 = simplehmm.hmm('Test HMM', states, observ)
    hmm1.train(train_data)
    return hmm1
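
A hedged usage sketch for tracker(), assuming simplehmm and itertools are imported as the function requires. The rows below are made up; the column layout (id, state label from '1'..'5', 5-character '0'/'1' pattern) is an assumption for illustration, not taken from the original repository:

# Hypothetical input rows for tracker() (illustrative values only)
rows = [(0, '1', '00101'),
        (1, '2', '01101'),
        (2, '2', '11100'),
        (3, '4', '00001'),
        (4, '5', '10101')]

hmm1 = tracker(rows, rid=1, trange=3)  # state from column 1, observation from column 2
hmm1.print_hmm()
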
Example #3
def tracker(data, rid, trange):

	train_data=[]
	i=0
	while i< len(data):
		pre=[]
		for j in range(trange):
			i=i+j
			if i >= len(data):
				break
			else:
				pre.append((data[i][rid],data[i][2]))
		
		train_data.append(pre)

	states=['1','2','3','4','5']

	observ=["".join(seq) for seq in itertools.product("01", repeat=5)]


	hmm1 = simplehmm.hmm('Test HMM', states, observ)
	hmm1.train(train_data)
	return hmm1
Example #4
print 'Set of tags found in HMM training file:'
print '  %s' % (', '.join(tag_list))
print
print 'Set of HMM states found in HMM training file:'
print '  %s' % (', '.join(state_list))
print

print 'Parsed %d training records:' % (len(train_rec_list))
for train_rec in train_rec_list:
  print '  %s' % (train_rec)
print

# Initialise HMM and train it with training data - - - - - - - - - - - - - - - -
#
hmm_name =   'Febrl HMM based on training file "%s"' % (hmm_training_file)
hmm_states = list(state_set)
hmm_observ = list(tag_set)

train_hmm = simplehmm.hmm(hmm_name, hmm_states, hmm_observ)

# Train, print and save the HMM - - - - - - - - - - - - - - - - - - - - - - - -
#
train_hmm.train(train_rec_list, hmm_smoothing)
train_hmm.print_hmm()

# Save trained HMM  - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#
train_hmm.save_hmm(hmm_model_file)

# =============================================================================
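
Once the model has been saved, a later run can restore it without retraining. A small sketch, assuming it runs after the code above (so hmm_model_file and tag_set are still in scope) and following the load pattern used elsewhere on this page:

# Reload the saved model and decode one short tag sequence (illustrative only)
loaded_hmm = simplehmm.hmm('Reloaded Febrl HMM', ['dummy'], ['dummy'])
loaded_hmm.load_hmm(hmm_model_file)

example_tags = list(tag_set)[:3]  # any observation symbols known to the model
[state_seq, seq_prob] = loaded_hmm.viterbi(example_tags)
print 'Most likely state sequence: %s (probability %f)' % (state_seq, seq_prob)
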
Example #5
  def testHMM(self):  # - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    """Test basic HMM functionality"""

    hmm1 = simplehmm.hmm('Test HMM', self.states, self.observ)

    assert (hmm1.N == len(self.states)), \
           'Illegal number of states in HMM ('+str(hmm1.N)+'), should be: '+ \
           str(len(self.states))
    assert (len(hmm1.S_ind) == len(self.states)), \
           'Illegal number of states in HMM state dictionary ('+ \
           str(len(hmm1.S_ind))+'), should be: '+str(len(self.states))

    assert (hmm1.M == len(self.observ)), \
           'Illegal number of observations in HMM ('+str(hmm1.M)+ \
           '), should be: '+str(len(self.observ))
    assert (len(hmm1.O_ind) == len(self.observ)), \
           'Illegal number of observations in HMM observation dictionary ('+ \
           str(len(hmm1.O_ind))+'), should be: '+ str(len(self.observ))

    for i in range(hmm1.N):
      assert (hmm1.pi[i] == 0.0), \
             'Initial probability in HMM 1 is not 0.0 at location ['+ \
             str(i)+']: '+str(hmm1.pi[i])

      for j in range(hmm1.N):
         assert (hmm1.A[i][j] == 0.0), \
                'Transition probability in HMM 1 is not 0.0 at location ['+ \
                str(i)+','+str(j)+']: '+str(hmm1.A[i][j])
      for j in range(hmm1.M):
         assert (hmm1.B[i][j] == 0.0), \
                'Observation probability in HMM 1 is not 0.0 at location ['+ \
                str(i)+','+str(j)+']: '+str(hmm1.B[i][j])

    hmm1.train(self.train_data)
    hmm1.check_prob()
    hmm1.print_hmm()

    for i in range(hmm1.N):
      assert ((hmm1.pi[i] >= 0.0) and (hmm1.pi[i] <= 1.0)), \
             'Initial probability in HMM 1 is not between 0.0 and 1.0 at '+ \
             'location ['+str(i)+']: '+str(hmm1.pi[i])

      for j in range(hmm1.N):
         assert ((hmm1.A[i][j] >= 0.0) and (hmm1.A[i][j] <= 1.0)), \
                'Transition probability in HMM 1 is not between 0.0 and 1.0'+ \
                ' at location ['+str(i)+','+str(j)+']: '+str(hmm1.A[i][j])
      for j in range(hmm1.M):
         assert ((hmm1.B[i][j] >= 0.0) and (hmm1.B[i][j] <= 1.0)), \
                'Observation probability in HMM 1 is not between 0.0 and '+ \
                '1.0 at location ['+str(i)+','+str(j)+']: '+str(hmm1.B[i][j])

    for test_rec in self.test_data:
      [state_seq, seq_prob] = hmm1.viterbi(test_rec)

      for state in state_seq:
        assert (state in self.states), \
               'Returned state "'+state+'" not in state list'
      assert ((seq_prob >= 0.0) and (seq_prob <= 1.0)), \
            'Sequence probability is not between 0.0 and 1.0:'+ str(seq_prob)

    hmm1.train(self.train_data,smoothing='laplace')
    hmm1.check_prob()
    hmm1.print_hmm()

    for i in range(hmm1.N):
      assert ((hmm1.pi[i] >= 0.0) and (hmm1.pi[i] <= 1.0)), \
             'Initial probability in HMM 1 is not between 0.0 and 1.0 at '+ \
             'location ['+str(i)+']: '+str(hmm1.pi[i])

      for j in range(hmm1.N):
         assert ((hmm1.A[i][j] >= 0.0) and (hmm1.A[i][j] <= 1.0)), \
                'Transition probability in HMM 1 is not between 0.0 and 1.0'+ \
                ' at location ['+str(i)+','+str(j)+']: '+str(hmm1.A[i][j])
      for j in range(hmm1.M):
         assert ((hmm1.B[i][j] >= 0.0) and (hmm1.B[i][j] <= 1.0)), \
                'Observation probability in HMM 1 is not between 0.0 and '+ \
                '1.0 at location ['+str(i)+','+str(j)+']: '+str(hmm1.B[i][j])

    for test_rec in self.test_data:
      [state_seq, seq_prob] = hmm1.viterbi(test_rec)

      for state in state_seq:
        assert (state in self.states), \
               'Returned state "'+state+'" not in state list'
      assert ((seq_prob >= 0.0) and (seq_prob <= 1.0)), \
            'Sequence probability is not between 0.0 and 1.0:'+ str(seq_prob)

    hmm1.train(self.train_data,smoothing='absdiscount')
    hmm1.check_prob()
    hmm1.print_hmm()

    for i in range(hmm1.N):
      assert ((hmm1.pi[i] >= 0.0) and (hmm1.pi[i] <= 1.0)), \
             'Initial probability in HMM 1 is not between 0.0 and 1.0 at '+ \
             'location ['+str(i)+']: '+str(hmm1.pi[i])

      for j in range(hmm1.N):
         assert ((hmm1.A[i][j] >= 0.0) and (hmm1.A[i][j] <= 1.0)), \
                'Transition probability in HMM 1 is not between 0.0 and 1.0'+ \
                ' at location ['+str(i)+','+str(j)+']: '+str(hmm1.A[i][j])
      for j in range(hmm1.M):
         assert ((hmm1.B[i][j] >= 0.0) and (hmm1.B[i][j] <= 1.0)), \
                'Observation probability in HMM 1 is not between 0.0 and '+ \
                '1.0 at location ['+str(i)+','+str(j)+']: '+str(hmm1.B[i][j])

    for test_rec in self.test_data:
      [state_seq, seq_prob] = hmm1.viterbi(test_rec)

      for state in state_seq:
        assert (state in self.states), \
               'Returned state "'+state+'" not in state list'
      assert ((seq_prob >= 0.0) and (seq_prob <= 1.0)), \
            'Sequence probability is not between 0.0 and 1.0:'+ str(seq_prob)

    hmm1.save_hmm('testhmm.hmm')
    hmm2 = hmm1

    hmm2 = simplehmm.hmm('Test2 HMM', ['dummy'], ['dummy'])

    hmm2.load_hmm('testhmm.hmm')

    assert (hmm1.N == hmm2.N), \
           'Loaded HMM has different number of states'
    assert (hmm1.M == hmm2.M), \
           'Loaded HMM has different number of observations'

    for i in range(hmm1.N):
      assert (abs(hmm1.pi[i]- hmm2.pi[i]) < self.delta), \
             'Initial probability in HMM 1 is different from HMM 2: '+ \
             str(hmm1.pi[i])+' / '+str(hmm2.pi[i])

      for j in range(hmm1.N):
         assert (abs(hmm1.A[i][j] - hmm2.A[i][j]) < self.delta), \
                'Transition probability in HMM 1 is different from HMM 2 '+ \
                'at location ['+str(i)+','+str(j)+']: '+str(hmm1.A[i][j])+ \
                ' / '+str(hmm2.A[i][j])

      for j in range(hmm1.M):
         assert (abs(hmm1.B[i][j] - hmm2.B[i][j]) < self.delta), \
                'Observation probability in HMM 1 is different from HMM 2 '+ \
                'at location ['+str(i)+','+str(j)+']: '+str(hmm1.B[i][j])+ \
                ' / '+str(hmm2.B[i][j])
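
The unit test above relies on fixtures (self.states, self.observ, self.train_data, self.test_data, self.delta) that are defined outside the snippet. A hypothetical setUp() sketch with the kind of values the assertions expect; none of these names or values are taken from the original test suite:

  def setUp(self):  # Hypothetical fixture, for illustration only
    self.states = ['title', 'gname', 'sname']
    self.observ = ['TI', 'GM', 'SN', 'UN']
    self.delta  = 0.0000001  # Tolerance used when comparing probabilities

    # Training records: sequences of (state, observation) pairs
    self.train_data = [[('title', 'TI'), ('gname', 'GM'), ('sname', 'SN')],
                       [('gname', 'GM'), ('sname', 'SN')],
                       [('gname', 'UN'), ('sname', 'SN')]]

    # Test records: observation sequences only, as expected by viterbi()
    self.test_data = [['TI', 'GM', 'SN'], ['GM', 'SN']]
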
Example #6
def trainhmm():
  """Main routine, open file, read lines, train HMM and save it to file.

  USAGE:
    trainhmm()

  ARGUMENTS:
    None

  DESCRIPTION:
    Main routine, see description of module above.
  """

  # Process command line arguments and check for correctness  - - - - - - - - -
  #
  if (len(config.options) < 3):
    print '***** Error: %s needs at least four arguments:'% (sys.argv[0])
    print '*****        - Name of the project module'
    print '*****        - Tagging mode: "name" or "locality"'
    print '*****        - Input training file name'
    print '*****        - HMM output file name'
    print '*****          plus options'
    raise Exception()

  if (config.options[1] == config.options[2]):
    print '*** Error: Input and output files must differ'
    print '***        Input training file name:', config.options[1]
    print '***        HMM output file name:    ', config.options[2]
    raise Exception()

  in_file_name  = config.options[1]
  hmm_file_name = config.options[2]

  # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - -
  #
  tag_mode = config.options[0]
  if (tag_mode in ['name','na','n']):
    tag_mode = 'name'
  elif (tag_mode in ['locality','localty','loc','l']):
    tag_mode = 'loc'
  else:
    print '***** Error: Illegal tagging mode:', tag_mode
    print '*****        Must be either "name" or "locality"'
    raise Exception()

  # Check for optional arguments and process if any - - - - - - - - - - - - - -
  #
  config.verbose = 0     # Default: No verbose output
  config.logging = 0     # Default: No logging into a file
  smoothing      = None  # Default: No smoothing
  config.nowarn  = 0     # Deactivate no warning flag (print/log warning
                         # messages)

  if (len(config.options) > 3):
    options =  config.options[3:]
    while (options != []):  # Do a loop processing all options

      if (options[0] == '-nowarn'):
        config.nowarn = 1  # Activate no warning flag
        options = options[1:]  # Remove processed '-nowarn' option

      elif (options[0] == '-v1'):
        config.verbose = 1  # Set to verbose output level 1
        options = options[1:]  # Remove processed '-v1' option

      elif (options[0] == '-v2'):
        config.verbose = 2  # Set to verbose output level 2
        options = options[1:]  # Remove processed '-v2' option

      elif (options[0] == '-l'):
        config.logging = 1
        if (len(options) > 1):
          if (options[1][0] != '-'):  # Not another option, must be a file name
            config.log_file = options[1]  # Get name of log file
            options = options[1:]  # Remove file_name
        options = options[1:]  # Remove processed -'l' option only

        try:
          f_log = open(config.log_file,'a')  # Test if file is appendable
        except:
          print '***** Error ********************',
          print '***** Cannot write to log file: '+config.log_file
          raise IOError()

        # Write (append) header to log file
        #
        f_log.write(os.linesep)
        f_log.write('##################################################')
        f_log.write("############"+os.linesep)
        f_log.write("#"+os.linesep)
        f_log.write("# 'pyTrainHMM.py - Version 0.1' process started at: ")
        f_log.write(time.ctime(time.time())+os.linesep)
        f_log.write("#"+os.linesep)
        f_log.write("# Input file name: "+in_file_name+os.linesep)
        f_log.write("# HMM file name:   "+hmm_file_name+os.linesep)
        f_log.write(os.linesep)
        f_log.close()

      elif (options[0] == '-s'):
        smoothing = options[1]  # Get the smoothing method to use
        if (smoothing in ['l','la','lap','laplac','laplace']):
          smoothing = 'laplace'
        elif (smoothing in ['a','ad','abs','absd','absdis','absdisc',\
               'absdiscount']):
          smoothing = 'absdiscount'
        else:  # Illegal value
          print "*** Error: Illegal value for 'smoothing' argument:", smoothing
          print "***        Possible are: 'laplace' or 'absdiscount'"
          raise Exception()

        options = options[2:]  # Remove processed option

      else:
        print '*** Error: Illegal option:', options[0]
        raise Exception()

  # Get HMM states and observations from configuration module - - - - - - - - -
  #
  if (tag_mode == 'name'): 
    state_list = config.name_hmm_states
    obser_list = config.name_hmm_obser

  else:
    state_list = config.geoloc_hmm_states
    obser_list = config.geoloc_hmm_obser

  # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  try:
    f_in = open(in_file_name,'r')
  except:
    inout.log_message('Cannot open input file: '+in_file_name,'err')
    raise IOError()

  line_count = 0  # Counter for lines read
  rec_count  = 0  # Counter for training records read

  # Read lines, discard comment lines and process training data lines - - - - -
  #
  training_data = []  # List of training records

  train_list = []  # List of training sequences (dictionaries), extracted from
                   # training data

  for line in xreadlines.xreadlines(f_in):

    if (line[0] != '#') and (line.strip() != ''):
      # Line must contain a training record

      line = line.strip()  # Remove line separators
      config.curr_line = line  # Make a copy of the unprocessed current line

      line_list = line.split(',')  # Split into a list of elements
      line_data = []  # Training data list for one training record

      inout.log_message(['Record number: '+str(rec_count)],'v1')
      config.curr_line_no = line_count  # Store current line number

      for elem in line_list:
        [k,v] = elem.split(':')  # Split into key and value
        tag = k.strip()
        state = v.strip()
        line_data.append((state,tag))

        if (state not in state_list):
          msg = ['Illegal state name in training record: '+state, \
                 'Line: '+str(line_count)+', record: '+str(rec_count), \
                 'Possible values: '+str(state_list)]
          inout.log_message(msg,'err')
          raise Exception()

        if (tag not in obser_list):
          msg = ['Illegal observation (tag) name in training record: '+tag, \
                 'Line: '+str(line_count)+', record: '+str(rec_count), \
                 'Possible values: '+str(obser_list)]
          inout.log_message(msg,'err')
          raise Exception()

      inout.log_message('  Training record '+str(rec_count)+':'+ \
                        str(line_data),'v1')

      train_list.append(line_data)

      rec_count += 1
      inout.log_message('','v1')  # Print empty lines between records

    line_count += 1

  # Close input file  - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  f_in.close()

  inout.log_message('','v1')  # Print empty lines between records

  # Initialise HMM and train it with training data - - - - - - - - - - - - - - -
  #
  myhmm = simplehmm.hmm(state_list, obser_list)

  myhmm.train(train_list,smoothing)
  myhmm.print_hmm()

  # Save trained HMM  - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  myhmm.save_hmm(hmm_file_name)  

  inout.log_message(['Read '+str(line_count)+' lines, processed '+ \
                    str(rec_count)+' training records', 'End.'],'v1')
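
The parser above expects each non-comment, non-empty line of the training file to hold one record as comma-separated tag:state pairs. A minimal sketch that writes a file in that shape, assuming it runs in the same module (so os is already imported); the tag and state names are placeholders and would have to appear in the configured observation and state lists:

# Hypothetical helper: write two records in the tag:state format parsed above
f_train = open('name_training.txt', 'w')
f_train.write('# One record per line, comma-separated tag:state pairs' + os.linesep)
f_train.write('TI:title, GM:gname, SN:sname' + os.linesep)
f_train.write('GM:gname, UN:gname, SN:sname' + os.linesep)
f_train.close()
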
Example #7
__author__ = 'lucas Ferreira'
import simplehmm
import re
treinamento=[]
arquivo_motivos=open('teste.mtv','r').read()
for motivo in arquivo_motivos.split('\n'):
    motivo_treino=[]
    contador=0
    for x in re.findall('\w',motivo):
        motivo_treino.append([str(contador),x])
        contador+=1
    treinamento.append(motivo_treino)

#print treinamento
cirRNA_hmm=simplehmm.hmm('circ_rna primeiros testes',['0','1','2','3','4','5','6','7','8','9','10'], ['A','C','T','G'])
cirRNA_hmm.train(treinamento, smoothing='absdiscount')
cirRNA_hmm.print_hmm()
cirRNA_hmm.save_hmm('circRNA_FRAGMENTADA.hmm')
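
A short, hedged follow-up to the snippet above: reload the saved model and decode a new motif with Viterbi. The motif string is made up; the only assumption is that, as during training, each position 0..10 is a state and each base is an observation:

# Hypothetical follow-up: reload the saved HMM and decode a fresh motif
reloaded_hmm = simplehmm.hmm('circ_rna reloaded', ['dummy'], ['dummy'])
reloaded_hmm.load_hmm('circRNA_FRAGMENTADA.hmm')

new_motif = list('ACGTACGTACG')  # made-up 11-base motif, one observation per position
[state_seq, seq_prob] = reloaded_hmm.viterbi(new_motif)
print state_seq, seq_prob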

Example #8
        treinar.append(train)
      
print classificar_vetor 
print(teste.get_dir())

#teste.set_pontas_porcentagem(15)
#print(teste.get_pontas())


#organizar
#print (organizar[0])
#print(sorted(organizar)) # this command must be done using the first object ZERO and not ONE
print (sorted(train))
test_hmm_states = ['1','3', '2']
#test_hmm_observ=[]	
test_hmm = simplehmm.hmm('15 porcento lncRNA', test_hmm_states, nomes)
#print test_hmm_observ
'''
we have to put all the train files into the table with append, using a FOR loop
THE TRAINING TABLE HAS TO COME OUT ORGANISED, AND IT IS NOT
'''


#treinar.append(train)
print '---------------------',treinar,'---------------------------------'
#print test_hmm.check_prob()
test_hmm.train(treinar, smoothing='absdiscount')
#print test_hmm.check_prob()
#print test_hmm.print_hmm()
#test_hmm.save_hmm("setubal/15_lncrna.hmm")
Example #9
    def testHMM(self):  # - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        """Test basic HMM functionality"""

        hmm1 = simplehmm.hmm("Test HMM", self.states, self.observ)

        assert hmm1.N == len(self.states), (
            "Illegal number of states in HMM (" + str(hmm1.N) + "), should be: " + str(len(self.states))
        )
        assert len(hmm1.S_ind) == len(self.states), (
            "Illegal number of states in HMM state dictionary ("
            + str(len(hmm1.S_ind))
            + "), should be: "
            + str(len(self.states))
        )

        assert hmm1.M == len(self.observ), (
            "Illegal number of observations in HMM (" + str(hmm1.M) + "), should be: " + str(len(self.observ))
        )
        assert len(hmm1.O_ind) == len(self.observ), (
            "Illegal number of observations in HMM observation dictionary ("
            + str(len(hmm1.O_ind))
            + "), should be: "
            + str(len(self.observ))
        )

        for i in range(hmm1.N):
            assert hmm1.pi[i] == 0.0, (
                "Initial probability in HMM 1 is not 0.0 at location [" + str(i) + "]: " + str(hmm1.pi[i])
            )

            for j in range(hmm1.N):
                assert hmm1.A[i][j] == 0.0, (
                    "Transition probability in HMM 1 is not 0.0 at location ["
                    + str(i)
                    + ","
                    + str(j)
                    + "]: "
                    + str(hmm1.A[i][j])
                )
            for j in range(hmm1.M):
                assert hmm1.B[i][j] == 0.0, (
                    "Observation probability in HMM 1 is not 0.0 at location ["
                    + str(i)
                    + ","
                    + str(j)
                    + "]: "
                    + str(hmm1.B[i][j])
                )

        hmm1.train(self.train_data)
        hmm1.check_prob()
        hmm1.print_hmm()

        for i in range(hmm1.N):
            assert (hmm1.pi[i] >= 0.0) and (hmm1.pi[i] <= 1.0), (
                "Initial probability in HMM 1 is not between 0.0 and 1.0 at "
                + "location ["
                + str(i)
                + "]: "
                + str(hmm1.pi[i])
            )

            for j in range(hmm1.N):
                assert (hmm1.A[i][j] >= 0.0) and (hmm1.A[i][j] <= 1.0), (
                    "Transition probability in HMM 1 is not between 0.0 and 1.0"
                    + " at location ["
                    + str(i)
                    + ","
                    + str(j)
                    + "]: "
                    + str(hmm1.A[i][j])
                )
            for j in range(hmm1.M):
                assert (hmm1.B[i][j] >= 0.0) and (hmm1.B[i][j] <= 1.0), (
                    "Observation probability in HMM 1 is not between 0.0 and "
                    + "1.0 at location ["
                    + str(i)
                    + ","
                    + str(j)
                    + "]: "
                    + str(hmm1.B[i][j])
                )

        for test_rec in self.test_data:
            [state_seq, seq_prob] = hmm1.viterbi(test_rec)

            for state in state_seq:
                assert state in self.states, 'Returned state "' + state + '" not in state list'
            assert (seq_prob >= 0.0) and (seq_prob <= 1.0), "Sequence probability is not between 0.0 and 1.0:" + str(
                seq_prob
            )

        hmm1.train(self.train_data, smoothing="laplace")
        hmm1.check_prob()
        hmm1.print_hmm()

        for i in range(hmm1.N):
            assert (hmm1.pi[i] >= 0.0) and (hmm1.pi[i] <= 1.0), (
                "Initial probability in HMM 1 is not between 0.0 and 1.0 at "
                + "location ["
                + str(i)
                + "]: "
                + str(hmm1.pi[i])
            )

            for j in range(hmm1.N):
                assert (hmm1.A[i][j] >= 0.0) and (hmm1.A[i][j] <= 1.0), (
                    "Transition probability in HMM 1 is not between 0.0 and 1.0"
                    + " at location ["
                    + str(i)
                    + ","
                    + str(j)
                    + "]: "
                    + str(hmm1.A[i][j])
                )
            for j in range(hmm1.M):
                assert (hmm1.B[i][j] >= 0.0) and (hmm1.B[i][j] <= 1.0), (
                    "Observation probability in HMM 1 is not between 0.0 and "
                    + "1.0 at location ["
                    + str(i)
                    + ","
                    + str(j)
                    + "]: "
                    + str(hmm1.B[i][j])
                )

        for test_rec in self.test_data:
            [state_seq, seq_prob] = hmm1.viterbi(test_rec)

            for state in state_seq:
                assert state in self.states, 'Returned state "' + state + '" not in state list'
            assert (seq_prob >= 0.0) and (seq_prob <= 1.0), "Sequence probability is not between 0.0 and 1.0:" + str(
                seq_prob
            )

        hmm1.train(self.train_data, smoothing="absdiscount")
        hmm1.check_prob()
        hmm1.print_hmm()

        for i in range(hmm1.N):
            assert (hmm1.pi[i] >= 0.0) and (hmm1.pi[i] <= 1.0), (
                "Initial probability in HMM 1 is not between 0.0 and 1.0 at "
                + "location ["
                + str(i)
                + "]: "
                + str(hmm1.pi[i])
            )

            for j in range(hmm1.N):
                assert (hmm1.A[i][j] >= 0.0) and (hmm1.A[i][j] <= 1.0), (
                    "Transition probability in HMM 1 is not between 0.0 and 1.0"
                    + " at location ["
                    + str(i)
                    + ","
                    + str(j)
                    + "]: "
                    + str(hmm1.A[i][j])
                )
            for j in range(hmm1.M):
                assert (hmm1.B[i][j] >= 0.0) and (hmm1.B[i][j] <= 1.0), (
                    "Observation probability in HMM 1 is not between 0.0 and "
                    + "1.0 at location ["
                    + str(i)
                    + ","
                    + str(j)
                    + "]: "
                    + str(hmm1.B[i][j])
                )

        for test_rec in self.test_data:
            [state_seq, seq_prob] = hmm1.viterbi(test_rec)

            for state in state_seq:
                assert state in self.states, 'Returned state "' + state + '" not in state list'
            assert (seq_prob >= 0.0) and (seq_prob <= 1.0), "Sequence probability is not between 0.0 and 1.0:" + str(
                seq_prob
            )

        hmm1.save_hmm("testhmm.hmm")
        hmm2 = hmm1

        hmm2 = simplehmm.hmm("Test2 HMM", ["dummy"], ["dummy"])

        hmm2.load_hmm("testhmm.hmm")

        assert hmm1.N == hmm2.N, "Loaded HMM has different number of states"
        assert hmm1.M == hmm2.M, "Loaded HMM has different number of observations"

        for i in range(hmm1.N):
            assert abs(hmm1.pi[i] - hmm2.pi[i]) < self.delta, (
                "Initial probability in HMM 1 is different from HMM 2: " + str(hmm1.pi[i]) + " / " + str(hmm2.pi[i])
            )

            for j in range(hmm1.N):
                assert abs(hmm1.A[i][j] - hmm2.A[i][j]) < self.delta, (
                    "Transition probability in HMM 1 is different from HMM 2 "
                    + "at location ["
                    + str(i)
                    + ","
                    + str(j)
                    + "]: "
                    + str(hmm1.A[i][j])
                    + " / "
                    + str(hmm2.A[i][j])
                )

            for j in range(hmm1.M):
                assert abs(hmm1.B[i][j] - hmm2.B[i][j]) < self.delta, (
                    "Observation probability in HMM 1 is different from HMM 2 "
                    + "at location ["
                    + str(i)
                    + ","
                    + str(j)
                    + "]: "
                    + str(hmm1.B[i][j])
                    + " / "
                    + str(hmm2.B[i][j])
                )
  def testHMM(self):  # - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    """Test basic HMM functionality"""

    hmm1 = simplehmm.hmm('Test HMM', self.states, self.observ)

    assert (hmm1.N == len(self.states)), \
           'Illegal number of states in HMM ('+str(hmm1.N)+'), should be: '+ \
           str(len(self.states))
    assert (len(hmm1.S_ind) == len(self.states)), \
           'Illegal number of states in HMM state dictionary ('+ \
           str(len(hmm1.S_ind))+'), should be: '+str(len(self.states))

    assert (hmm1.M == len(self.observ)), \
           'Illegal number of observations in HMM ('+str(hmm1.M)+ \
           '), should be: '+str(len(self.observ))
    assert (len(hmm1.O_ind) == len(self.observ)), \
           'Illegal number of observations in HMM observation dictionary ('+ \
           str(len(hmm1.O_ind))+'), should be: '+ str(len(self.observ))

    for i in range(hmm1.N):
      assert (hmm1.pi[i] == 0.0), \
             'Initial probability in HMM 1 is not 0.0 at location ['+ \
             str(i)+']: '+str(hmm1.pi[i])

      for j in range(hmm1.N):
         assert (hmm1.A[i][j] == 0.0), \
                'Transition probability in HMM 1 is not 0.0 at location ['+ \
                str(i)+','+str(j)+']: '+str(hmm1.A[i][j])
      for j in range(hmm1.M):
         assert (hmm1.B[i][j] == 0.0), \
                'Observation probability in HMM 1 is not 0.0 at location ['+ \
                str(i)+','+str(j)+']: '+str(hmm1.B[i][j])

    hmm1.train(self.train_data)
    hmm1.check_prob()
    hmm1.print_hmm()

    for i in range(hmm1.N):
      assert ((hmm1.pi[i] >= 0.0) and (hmm1.pi[i] <= 1.0)), \
             'Initial probability in HMM 1 is not between 0.0 and 1.0 at '+ \
             'location ['+str(i)+']: '+str(hmm1.pi[i])

      for j in range(hmm1.N):
         assert ((hmm1.A[i][j] >= 0.0) and (hmm1.A[i][j] <= 1.0)), \
                'Transition probability in HMM 1 is not between 0.0 and 1.0'+ \
                ' at location ['+str(i)+','+str(j)+']: '+str(hmm1.A[i][j])
      for j in range(hmm1.M):
         assert ((hmm1.B[i][j] >= 0.0) and (hmm1.B[i][j] <= 1.0)), \
                'Observation probability in HMM 1 is not between 0.0 and '+ \
                '1.0 at location ['+str(i)+','+str(j)+']: '+str(hmm1.B[i][j])

    for test_rec in self.test_data:
      [state_seq, seq_prob] = hmm1.viterbi(test_rec)

      for state in state_seq:
        assert (state in self.states), \
               'Returned state "'+state+'" not in state list'
      assert ((seq_prob >= 0.0) and (seq_prob <= 1.0)), \
            'Sequence probability is not between 0.0 and 1.0:'+ str(seq_prob)

    hmm1.train(self.train_data,smoothing='laplace')
    hmm1.check_prob()
    hmm1.print_hmm()

    for i in range(hmm1.N):
      assert ((hmm1.pi[i] >= 0.0) and (hmm1.pi[i] <= 1.0)), \
             'Initial probability in HMM 1 is not between 0.0 and 1.0 at '+ \
             'location ['+str(i)+']: '+str(hmm1.pi[i])

      for j in range(hmm1.N):
         assert ((hmm1.A[i][j] >= 0.0) and (hmm1.A[i][j] <= 1.0)), \
                'Transition probability in HMM 1 is not between 0.0 and 1.0'+ \
                ' at location ['+str(i)+','+str(j)+']: '+str(hmm1.A[i][j])
      for j in range(hmm1.M):
         assert ((hmm1.B[i][j] >= 0.0) and (hmm1.B[i][j] <= 1.0)), \
                'Observation probability in HMM 1 is not between 0.0 and '+ \
                '1.0 at location ['+str(i)+','+str(j)+']: '+str(hmm1.B[i][j])

    for test_rec in self.test_data:
      [state_seq, seq_prob] = hmm1.viterbi(test_rec)

      for state in state_seq:
        assert (state in self.states), \
               'Returned state "'+state+'" not in state list'
      assert ((seq_prob >= 0.0) and (seq_prob <= 1.0)), \
            'Sequence probability is not between 0.0 and 1.0:'+ str(seq_prob)

    hmm1.train(self.train_data,smoothing='absdiscount')
    hmm1.check_prob()
    hmm1.print_hmm()

    for i in range(hmm1.N):
      assert ((hmm1.pi[i] >= 0.0) and (hmm1.pi[i] <= 1.0)), \
             'Initial probability in HMM 1 is not between 0.0 and 1.0 at '+ \
             'location ['+str(i)+']: '+str(hmm1.pi[i])

      for j in range(hmm1.N):
         assert ((hmm1.A[i][j] >= 0.0) and (hmm1.A[i][j] <= 1.0)), \
                'Transition probability in HMM 1 is not between 0.0 and 1.0'+ \
                ' at location ['+str(i)+','+str(j)+']: '+str(hmm1.A[i][j])
      for j in range(hmm1.M):
         assert ((hmm1.B[i][j] >= 0.0) and (hmm1.B[i][j] <= 1.0)), \
                'Observation probability in HMM 1 is not between 0.0 and '+ \
                '1.0 at location ['+str(i)+','+str(j)+']: '+str(hmm1.B[i][j])

    for test_rec in self.test_data:
      [state_seq, seq_prob] = hmm1.viterbi(test_rec)

      for state in state_seq:
        assert (state in self.states), \
               'Returned state "'+state+'" not in state list'
      assert ((seq_prob >= 0.0) and (seq_prob <= 1.0)), \
            'Sequence probability is not between 0.0 and 1.0:'+ str(seq_prob)

    hmm1.save_hmm('testhmm.hmm')
    hmm2 = hmm1

    hmm2 = simplehmm.hmm('Test2 HMM', ['dummy'], ['dummy'])

    hmm2.load_hmm('testhmm.hmm')

    assert (hmm1.N == hmm2.N), \
           'Loaded HMM has different number of states'
    assert (hmm1.M == hmm2.M), \
           'Loaded HMM has different number of observations'

    for i in range(hmm1.N):
      assert (abs(hmm1.pi[i]- hmm2.pi[i]) < self.delta), \
             'Initial probability in HMM 1 is different from HMM 2: '+ \
             str(hmm1.pi[i])+' / '+str(hmm2.pi[i])

      for j in range(hmm1.N):
         assert (abs(hmm1.A[i][j] - hmm2.A[i][j]) < self.delta), \
                'Transition probability in HMM 1 is different from HMM 2 '+ \
                'at location ['+str(i)+','+str(j)+']: '+str(hmm1.A[i][j])+ \
                ' / '+str(hmm2.A[i][j])

      for j in range(hmm1.M):
         assert (abs(hmm1.B[i][j] - hmm2.B[i][j]) < self.delta), \
                'Observation probability in HMM 1 is different from HMM 2 '+ \
                'at location ['+str(i)+','+str(j)+']: '+str(hmm1.B[i][j])+ \
                ' / '+str(hmm2.B[i][j])
Example #11
                     'locql','pc','ter1','ter2','cntr1','cntr2','rubb']
geoloc_hmm_obser  = ['PC','N4','NU','AN','TR','CR','LN','ST','IN','IT', \
                     'LQ','WT','WN','UT','HY','SL','CO','VB','PA','UN', \
                     'RU']

# =============================================================================
# Dictionary of month name abbreviations (used in date.str2date() routine)

month_abbrev_dict = {'jan':1, 'feb':2, 'mar':3, 'apr':4, 'may':5, 'jun':6, \
                     'jul':7, 'aug':8, 'sep':9, 'oct':10, 'nov':11, 'dec': 12}

# =============================================================================
# If Hidden Markov Model standardisation methods are activated, load the HMM(s)

if (project.name_standard_method == 'hmm'):
    name_hmm = simplehmm.hmm([], [])  # Create new empty HMM object
    name_hmm.load_hmm(project.name_hmm_file_name)

if (project.geoloc_standard_method == 'hmm'):
    geoloc_hmm = simplehmm.hmm([], [])  # Create new empty HMM object
    geoloc_hmm.load_hmm(project.geoloc_hmm_file_name)

# =============================================================================
# List of all supported data file types
#
# File type names must have a length of 3 characters, or 4 characters if the
# file type is quoted (in which case the last character must be a 'Q')
#
# Currently supported file types are:
#   CSV  - Comma separated values, fields separated by commas
#   CSVQ - Comma separated values, where each field starts and ends with
Example #12
def tagdata():
  """Main routine, open file, read lines, tag data records, write to out-file.

  USAGE:
    tagdata()

  ARGUMENTS:
    None

  DESCRIPTION:
    Main routine, see description of module above.
  """

  # Process command line arguments and check for correctness  - - - - - - - - -
  #
  if (len(config.options) < 5):
    print '***** Error: %s needs at least six arguments:'% (sys.argv[0])
    print '*****        - Name of the project module'
    print '*****        - Tagging mode: "name" or "locality"'
    print '*****        - Output training file name'
    print '*****        - Start of block with training records'
    print '*****        - End of block with training records'
    print '*****        - Number of training records'
    print '*****          plus options'
    raise Exception()

  if (config.in_file_name == config.options[1]):
    print '***** Error: Input and output files must differ'
    print '*****        Input file name:          ', config.in_file_name
    print '*****        Output training file name:', config.options[1]
    raise Exception()

  first_rec = int(config.options[2])
  last_rec  = int(config.options[3])
  num_rec   = int(config.options[4])
  in_file_name = config.in_file_name
  out_file_name = config.options[1]

  # Check record number values  - - - - - - - - - - - - - - - - - - - - - - - -
  #
  if (int(first_rec) >= int(last_rec)) or \
     ((int(num_rec)-1) > (int(last_rec)-int(first_rec))):
    print '***** Error: Illegal values for training records block:'
    print '*****        - Start of block with training records:', first_rec
    print '*****        - End of block with training records:  ', last_rec
    print '*****        - Number of training records:          ', num_rec
    raise Exception()

  rec_range = last_rec-first_rec-1  # Range of records in input file

  # Open input file and check number of available records - - - - - - - - - - -
  #
  try:
    f_in = open(in_file_name,'r')
  except:
    inout.log_message('Cannot open input file: '+in_file_name,'err')
    raise IOError()

  line_count = 0
  for line in f_in.xreadlines():
    line_count += 1
  f_in.close()

  if (last_rec > line_count):  # Illegal value for last record
    print '***** Error: Illegal values for last training records:', last_rec
    print '*****        File only contains',line_count, 'lines/records'
    raise Exception()

  # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - -
  #
  tag_mode = config.options[0]
  if (tag_mode in ['name','na','n']):
    tag_mode = 'name'
  elif (tag_mode in ['locality','localty','loc','l']):
    tag_mode = 'loc'
  else:
    print '***** Error: Illegal tagging mode:', tag_mode
    print '*****        Must be either "name" or "locality"'
    raise Exception()

  # Check for optional arguments and process if any - - - - - - - - - - - - - -
  #
  config.verbose = 0     # Default: No verbose output
  config.logging = 0     # Default: No logging into a file
  hmm_file_name  = None  # Default: Do not use HMM to standardise training
                         #          records
  retag_file_name = None # Default: Do not retag an existing training file
  config.nowarn  = 0     # Deactivate no warning flag (print/log warning
                         # messages)
  freqs_file_name = None # Default: Do not write frequencies, no -freqs option

  if (len(config.options) > 5):
    options = config.options[5:]
    while (options != []):  # Do a loop processing all options

      if (options[0] == '-nowarn'):
        config.nowarn = 1  # Activate no warning flag
        options = options[1:]  # Remove processed '-nowarn' option

      elif (options[0] == '-v1'):
        config.verbose = 1  # Set to verbose output level 1
        options = options[1:]  # Remove processed '-v1' option

      elif (options[0] == '-v2'):
        config.verbose = 2  # Set to verbose output level 2
        options = options[1:]  # Remove processed '-v2' option

      elif (options[0] == '-l'):
        config.logging = 1
        if (len(options) > 1):
          if (options[1][0] != '-'):  # Not another option, must be a file name
            config.log_file = options[1]  # Get name of log file
            options = options[1:]  # Remove file_name
        options = options[1:]  # Remove processed -'l' option only

        try:
          f_log = open(config.log_file,'a')  # Test if file is appendable
        except:
          print '***** Error ********************',
          print '***** Cannot write to log file: '+config.log_file
          raise IOError()

        # Write (append) header to log file
        #
        f_log.write(os.linesep)
        f_log.write('##################################################')
        f_log.write('############'+os.linesep)
        f_log.write('#'+os.linesep)
        f_log.write("# 'pyTagData.py - Version 0.1' process started at: ")
        f_log.write(time.ctime(time.time())+os.linesep)
        f_log.write('#'+os.linesep)
        f_log.write("# Input file name:  "+in_file_name+os.linesep)
        f_log.write("# Output file name: "+out_file_name+os.linesep)
        f_log.write("# Tagging mode:     "+tag_mode+os.linesep)
        f_log.write(os.linesep)
        f_log.close()

      elif (options[0] == '-hmm'):
        hmm_file_name = options[1]  # Get file name of the HMM to use
        if (hmm_file_name == out_file_name):
          print '***** Error: HMM file name is the same as output file name!'
          raise Exception()

        try:
          f_in = open(hmm_file_name,'r')  # Test if file is available
        except:
          print '***** Error: Cannot open HMM file specified in "-hmm"',
          print 'option:', hmm_file_name
          raise IOError()
        f_in.close()
        options = options[2:]  # Remove processed '-hmm' option and file name

      elif (options[0] == '-retag'):
        if (hmm_file_name == None) and ('-hmm' not in options):
          print '***** Error: "-retag" option can only be used together with',
          print '"-hmm" option (which is not given).'
          raise Exception()

        retag_file_name = options[1]  # Get file name of the already-tagged
                                      # file to re-process
        if (retag_file_name == out_file_name):
          print '***** Error: Retag file name is the same as output file name!'
          raise Exception()
        elif (retag_file_name == in_file_name):
          print '***** Error: Retag file name is the same as input file name!'
          raise Exception()
        elif (retag_file_name == hmm_file_name):
          print '***** Error: Retag file name is the same as HMM file name!'
          raise Exception()

        try:
          f_in = open(retag_file_name,'r')  # Test if file is available

          # Now gather record numbers and previous tags/states, as well as the
          # original header information. Use a simple state machine to do this.
          #
          tagged_recs  = {}
          cleaned_recs = {}
          original_header_lines = []
          state = -1  # Header lines state
          prevline = ''

          for line in f_in.xreadlines():  # Read training file and process it
            line = line.strip()

            if (state == -1) and (len(line) == 0):  # End of header lines
              state = 0
              prevline = line
              continue

            if (state == -1) and (len(line) > 0) and (line[0] == "#"):
              original_header_lines.append("# " + line)
              prevline = line
              continue
            sline = line.split(' ')

            if (len(sline) > 2) and (len(sline[2]) > 3) and (sline[0] == '#') \
               and (sline[2][0] == '(') and (sline[2][-2:] == '):'):
              try:	
                rec = int(sline[1])  # Original record number 
                tagged_recs[rec]  = None
                cleaned_recs[rec] = None
                state = 1
              except:
                pass
              prevline = line
              continue

            if (state == 1) and (len(line) > 0) and (line[0] != '#'):
              tagged_recs[rec]  = line
              cleaned_recs[rec] = prevline
              state = 0
              prevline = line
              continue

            if (state == 1) and (len(line) > 0):
              prevline = line
              continue

          f_in.close()
          tagged_recs_keys = tagged_recs.keys()

          num_rec = len(tagged_recs_keys)  # Override specified numbers
          first_rec = 0
          last_rec = line_count

        except:
          print '***** Error: Cannot open tagged training file specified',
          print 'in "-retag" option:', retag_file_name
          raise IOError()

        options = options[2:]  # Remove processed '-retag' option and file name

      elif (options[0][:5] == '-freq'):
        if (hmm_file_name == None) and ('-hmm' not in options):
          print '***** Error: "-feqs" option can only be used together with',
          print '"-hmm" option (which is not given).'
          raise Exception()

        freqs_file_name = options[1]  # File name to write the frequencies to
        if (freqs_file_name == out_file_name):
          print '***** Error: Frequency file name is the same as output',
          print 'file name!'
          raise Exception()
        elif (freqs_file_name == in_file_name):
          print '***** Error: Frequency file name is the same as input',
          print 'file name!'
          raise Exception()
        elif (freqs_file_name == hmm_file_name):
          print '***** Error: Frequency file name is the same as HMM',
          print 'file name!'
          raise Exception()

        options = options[2:]  # Remove processed '-freqs' option and file name
        try:  # Check if file writing is possible
          freqs_out = open(freqs_file_name,'w')
          freqs_out.close()
        except:
          print '***** Error: Cannot write to frequency output file specified',
          print 'in "-freqs" option:', freqs_file_name
          raise IOError()

      else:
        print '***** Error: Illegal option:', options[0]
        raise Exception()

  # If specified, initialise and load Hidden Markov Model (HMM) - - - - - - - -
  #
  if (hmm_file_name != None):
    myhmm = simplehmm.hmm([],[])  # Create new empty HMM object
    myhmm.load_hmm(hmm_file_name)
    myhmm.print_hmm()  # Print HMM (according to verbose and logging level)

  # Open output file and write header - - - - - - - - - - - - - - - - - - - - -
  #
  try:
    f_out = open(out_file_name,'w')
  except:
    inout.log_message('Cannot open output file: '+out_file_name,'err')
    raise IOError()

  f_out.write("# Tagged training data written by 'pyTagData.py -"+ \
              " Version 0.1'"+os.linesep)
  f_out.write('#'+os.linesep)
  f_out.write('# Created '+time.ctime(time.time())+os.linesep)
  f_out.write('#'+os.linesep)
  f_out.write('# Input file name:  '+in_file_name+os.linesep)
  f_out.write('# Output file name: '+out_file_name+os.linesep)
  f_out.write('#'+os.linesep)
  f_out.write('# Parameters:'+os.linesep)
  f_out.write('# - Start of block with training records: '+str(first_rec)+ \
              os.linesep)
  f_out.write('# - End of block with training records:   '+str(last_rec)+ \
              os.linesep)
  f_out.write('# - Number of training records:           '+str(num_rec)+ \
              os.linesep)
  if (hmm_file_name != None):
    f_out.write('#'+os.linesep)
    f_out.write("# - Using HMM file '"+hmm_file_name+"' for standardisation"+ \
                os.linesep)
  if (retag_file_name != None):
    f_out.write('#'+os.linesep)
    f_out.write("# - Reprocessing training file '"+retag_file_name+"'"+ \
                os.linesep)
    f_out.write("#   Header lines from original training file follow:" + \
                os.linesep)
    for header_line in original_header_lines:
      f_out.write(header_line + os.linesep)
  if (freqs_file_name != None):
    f_out.write('#'+os.linesep)
    f_out.write("# - Tag/state pattern frequencies written to file '" + \
                freqs_file_name + os.linesep)
  f_out.write('#'+'-'*70+os.linesep)
  f_out.write(os.linesep)

  rec_count    = 0        # Number of selected records
  num_rec_left = num_rec  # Number of records to be selected left
  rec_selected = {}       # Dictionary of all record numbers that were selected
  seq_freqs = {}          # Dict to hold examples of tag/state patterns

  unchanged_loop_cnt = 0       # Counter of how many loops have been done
                               # without new records being selected
  prev_num_rec_left = num_rec  # Number of records left in the previous
                               # iteration

  # Due to the random nature of selecting records, and because sometimes  - - -
  # a selected component can be empty (and is thus not used for training)
  # more than one iteration over the input data set is carried out. In each 
  # iteration, records are selected randomly.
  #
  while (rec_count < num_rec):  # Loop until 'num_rec' records selected

    # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    try:
      f_in = open(in_file_name,'r')
    except:
      inout.log_message('Cannot open input file: '+in_file_name,'err')
      raise IOError()

    line_read = 0  # Number of read lines

    # Skip to start of training block - - - - - - - - - - - - - - - - - - - - -
    #
    if (first_rec > 0):
      for i in range(first_rec):
        f_in.readline()

    while (rec_count < num_rec) and (line_read <= (last_rec-first_rec)):
      line = f_in.readline()

      if ((retag_file_name != None) and (line_read in tagged_recs_keys)) or \
         ((retag_file_name == None) and \
          (num_rec_left >= random.randrange(0,rec_range,1))):

        line = line.strip()  # Remove line separators
        config.curr_line = line  # Make a copy of the unprocessed current line

        line = line.lower()  # Make all characters lower case

        inout.log_message(['Record number: '+str(line_read+first_rec)],'v1')
        config.curr_line_no = line_read+first_rec  # Store current line number

        # Process line and extract content into components (name, geocode, etc)
        #
        [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \
           inout.process_line(line)

        # Select component and process it - - - - - - - - - - - - - - - - - - -
        #
        if (tag_mode == 'name'):
          if (type(name_comp) == types.ListType):
            component = name_comp[0].strip()+' '+name_comp[1].strip()
          else:
            component = name_comp.strip()
        else:  # Locality component
          component = geocode_comp.strip()+' '+locality_comp.strip()

        if (component != '') and \
           (not rec_selected.has_key((line_read+first_rec))):

          if (tag_mode == 'name'):
            inout.log_message('  Name component: |'+component+'|','v1')

            component = name.clean_name_component(component)
            [word_list, tag_list] = name.tag_name_component(component)
 
          else:  # Locality component
            inout.log_message('  Locality component: |'+component+'|','v1')

            component = locality.clean_geoloc_component(component)
            [word_list, tag_list] = locality.tag_geoloc_component(component)

          if (tag_list != []):  # Only process non-empty tag lists

            # Append record number into dictionary of processed records
            #
            rec_selected.update({(line_read+first_rec):(line_read+first_rec)})

            # Create all permutation sequences of this tag list - - - - - - - -
            #
            tag_seq = mymath.perm_tag_sequence(tag_list)

            inout.log_message(['  Word list: '+str(word_list), \
                               '  Tag list: '+str(tag_list), \
                               '  Tag sequences:'],'v2')

            # Do HMM processing - - - - - - - - - - - - - - - - - - - - - - - -
            #
            if (hmm_file_name != None):

              state_seq  = []    # List containing computed HMM state sequences
              max_prob   = -1.0  # maximal probability for a sequence
              max_seq_no = -1    # Number of the seq. with the max. probability

              # Now give tag sequences to the HMMs to compute state sequences
              #
              i = 0
              for t in tag_seq:
                [obs_seq, prob] = myhmm.viterbi(t)
                state_seq.append(obs_seq)
                if (prob > max_prob):
                  max_prob = prob
                  max_seq_no = i
                i += 1

            # Write original component and resulting tag sequences to output
            #
            f_out.write('# '+str(line_read+first_rec)+' ('+str(rec_count)+ \
                        '): |'+component+'|'+os.linesep) # Commented original
            num_len = len(str(line_read+first_rec))+len(str(rec_count))+6

            f_out.write('#'+num_len*' '+'|'+' '.join(word_list)+'|'+os.linesep)

            for i in range(len(tag_seq)):
              # Convert each tag sequence into a string for file output
              #
              seq_string = '  '

              if (hmm_file_name != None) and (i != max_seq_no):
                seq_string = '# '  # Comment out sequences that do not have the max. probability

              for j in range(len(tag_seq[i])):

                if (hmm_file_name != None):
                  seq_string = seq_string+' '+tag_seq[i][j]+':'+ \
                               state_seq[i][j]+','
                else:
                  seq_string = seq_string+' '+tag_seq[i][j]+':,'

              f_out.write(seq_string[:-1]+os.linesep)  # Write without , at end
              inout.log_message('    '+seq_string[:-1],'v2')

            if (hmm_file_name != None):
              f_out.write('# Maximum Viterbi probability: %0.5f'% \
                          (max_prob) + os.linesep)
              inout.log_message('Maximum Viterbi probability: %0.5f'% \
                                (max_prob), 'v2')

            if (retag_file_name != None) and (tagged_recs[line_read] != None):
              if (tagged_recs[line_read].strip() != seq_string[:-1].strip()):
                f_out.write("# Note: ***** Changed *****" + os.linesep)
                inout.log_message('                      Note:' + \
                                  ' ***** Changed *****','v2')
                f_out.write('# Was: ' + tagged_recs[line_read]+os.linesep)
                            # Write commented original tag sequence
                inout.log_message('Original tag sequence: '+ \
                                  tagged_recs[line_read],'v2')

            f_out.write(os.linesep)  # Write an empty line
            inout.log_message('','v1')  # Print empty lines between records

            if (hmm_file_name != None):
              seq_key = seq_string[:-1]  # Add sequence to dictionary
              if (seq_freqs.has_key(seq_key)):
                seq_freqs[seq_key].append(['|'+' '.join(word_list)+'|', \
                                          max_prob])
              else:
                seq_freqs[seq_key] = [['|'+' '.join(word_list)+'|', \
                                      max_prob]]

            rec_count += 1

            # Print process indicator message
            #
            if (config.proc_ind >= 0) and (rec_count > 0):
              if (rec_count % config.proc_ind == 0):
                print 'Processed line', rec_count, 'of', num_rec

      line_read += 1

    f_in.close()

    num_rec_left = num_rec - rec_count

    if (prev_num_rec_left == num_rec_left):  # No new records selected
      unchanged_loop_cnt += 1
    prev_num_rec_left = num_rec_left  # Set to current value

    if (unchanged_loop_cnt > 5):  # Do at most five loops without selecting
                                  # new records
      config.curr_line_no = -1  # Set to illegal/empty values, as warning is
      config.curr_line    = ''  # not related to the current input line
      inout.log_message(['Can not select more than '+str(rec_count)+ \
                         ' records for training.', \
                         'This is probably due to empty input components.', \
                         'Please reduce value of "num_rec" or increase ' + \
                         'range','between "first_rec" and "last_rec".'],'warn')
      break

    if (num_rec_left < 10):  # Only 10 records left to select
      num_rec_left = num_rec+1  # Set to more than 100% probability
    elif (num_rec_left < (num_rec / 100.0)):  # Less than 1% records left
      num_rec_left = int(num_rec / 100.0)  # Set to 1%

  f_out.close()

  # If specified, save Viterbi frequencies to a file  - - - - - - - - - - - - -
  #
  if (freqs_file_name != None):
    freqs_out = open(freqs_file_name,'w')  # Open frequency file for writing
    freqs_out.write('# Frequency listing of tag/state patterns written by ')
    freqs_out.write('"pyTagData.py - Version 0.1"'+os.linesep)
    freqs_out.write('#'+os.linesep)
    freqs_out.write('# Created '+time.ctime(time.time())+os.linesep)
    freqs_out.write('#'+os.linesep)
    freqs_out.write("# Input file name:  "+in_file_name+os.linesep)
    freqs_out.write("# Output file name: "+out_file_name+os.linesep)
    freqs_out.write(os.linesep)
    freqs_out.write('# Parameters:'+os.linesep)
    freqs_out.write('# - Start of block with training records: '+ \
                    str(first_rec)+os.linesep)
    freqs_out.write('# - End of block with training records:   '+ \
                    str(last_rec)+os.linesep)
    freqs_out.write('# - Number of training records:           '+ \
                    str(num_rec)+os.linesep)
    if (hmm_file_name != None):
      freqs_out.write('#'+os.linesep)
      freqs_out.write("# - Using HMM file '"+hmm_file_name+ \
                      "' for standardisation"+os.linesep)
    if (retag_file_name != None):
      freqs_out.write('#'+os.linesep)
      freqs_out.write("# - Reprocessing training file '"+retag_file_name+ \
                      "'"+os.linesep)
    freqs_out.write('#'+'-'*70+os.linesep)
    freqs_out.write(os.linesep)

    sorted_seq_freqs = []  # Now sort sequences according to their frequencies
    for key in seq_freqs.keys():
      sorted_seq_freqs.append((len(seq_freqs[key]),key))
    sorted_seq_freqs.sort()

    for skey in sorted_seq_freqs:
      key = skey[1]
      freqs_out.write('# Pattern: '+str(key)+os.linesep)
      freqs_out.write('# Frequency: '+str(skey[0])+os.linesep)
      examples = seq_freqs[key]
      freqs_out.write('# Maximum Viterbi probability: '+ \
                      str(examples[0][1])+os.linesep)
      freqs_out.write('# Examples: '+os.linesep)
      for example in examples:
        freqs_out.write('#    '+str(example[0])+os.linesep)
      freqs_out.write(str(key)+os.linesep)
      freqs_out.write(os.linesep)
    freqs_out.close()

  inout.log_message(['Read '+str(line_read)+' lines, processed '+ \
                    str(rec_count)+' lines', 'End.'],'v1')
Exemplo n.º 13
0
def standard():
  """Main routine, open file, read lines, standardise them and write into file.

  USAGE:
    standard()

  ARGUMENTS:
    None

  DESCRIPTION:
    Main routine, see description of module above.
  """

  # Process command line arguments and check for correctness  - - - - - - - - -
  #
  if (len(config.options) < 2):
    print '***** Error: %s needs at least three arguments:'% (sys.argv[0])
    print '*****        - Name of the project module'
    print '*****        - Number of the first record to be processed'
    print '*****        - Number of records to be processed'
    print '*****          plus options'
    raise Exception()

  first_rec = int(config.options[0])
  num_rec   = int(config.options[1])
  in_file_name = config.in_file_name
  out_file_name = config.out_file_name

  # Check for optional arguments and process if any - - - - - - - - - - - - - -
  #
  config.verbose = 0  # Default: No verbose output
  config.logging = 0  # Default: No logging into a file
  write_header   = 0  # Write header (output field names) to output file
                      # (default: Don't)
  config.nowarn  = 0  # Deactivate no warning flag (print/log warning messages)

  if (len(config.options) > 2):
    options = config.options[2:]
    while (options != []):  # Do a loop processing all options

      if (options[0] == '-nowarn'):
        config.nowarn = 1  # Activate no warning flag
        options = options[1:]  # Remove processed '-nowarn' option

      elif (options[0] == '-v1'):
        config.verbose = 1  # Set to verbose output level 1
        options = options[1:]  # Remove processed '-v1' option

      elif (options[0] == '-v2'):
        config.verbose = 2  # Set to verbose output level 2
        options = options[1:]  # Remove processed '-v2' option

      elif (options[0] == '-l'):
        config.logging = 1
        if (len(options) > 1):
          if (options[1][0] != '-'):  # Not another option, must be a file name
            config.log_file = options[1]  # Get name of log file
            options = options[1:]  # Remove file_name
        options = options[1:]  # Remove processed -'l' option only

        try:
          f_log = open(config.log_file,'a')  # Test if file is appendable
        except:
          print '***** Error ********************',
          print '***** Cannot write to log file:', config.log_file
          raise IOError()

        # Write (append) header to log file
        #
        f_log.write(os.linesep)
        f_log.write('##################################################')
        f_log.write("############"+os.linesep)
        f_log.write("#"+os.linesep)
        f_log.write("# 'pyStandard.py - Version 0.1' process started at: ")
        f_log.write(time.ctime(time.time())+os.linesep)
        f_log.write("#"+os.linesep)
        f_log.write("# Input file name:  "+in_file_name+os.linesep)
        f_log.write("# Output file name: "+out_file_name+os.linesep)
        f_log.write(os.linesep)
        f_log.close()

      elif (options[0] == '-h'):
        write_header = 1
        options = options[1:]  # Remove processed -'h' option

      elif (options[0] == '-hmm-name'):
        hmm_name_file = options[1]  # Get file name of the name HMM to use
        try:
          f_in = open(hmm_name_file,'r')  # Test if file is available
        except:
          print '***** Error ********************',
          print '***** Cannot open HMM file in "-hmm-name" option:',
          print hmm_name_file
          raise IOError()

        f_in.close()
        options = options[2:]  # Remove processed option and file name
        config.name_standard_method = 'hmm'
        config.name_hmm_file_name = hmm_name_file
        config.name_hmm = simplehmm.hmm([],[])  # Create new empty HMM object
        config.name_hmm.load_hmm(config.name_hmm_file_name)

      elif (options[0] == '-hmm-loc'):
        hmm_loc_file = options[1]  # Get file name of the locality HMM to use
        try:
          f_in = open(hmm_loc_file,'r')  # Test if file is available
        except:
          print '***** Error ********************',
          print '***** Cannot open HMM file in "-hmm-loc" option:',
          print hmm_loc_file
          raise IOError()
        f_in.close()
        options = options[2:]  # Remove processed option and file name
        config.geoloc_standard_method = 'hmm'
        config.geoloc_hmm_file_name = hmm_loc_file
        config.geoloc_hmm = simplehmm.hmm([],[])  # Create new HMM object
        config.geoloc_hmm.load_hmm(config.geoloc_hmm_file_name)

      else:
        print '***** Error: Illegal option:', options[0]
        raise Exception()

  # Open input file and check number of available records - - - - - - - - - - -
  #
  try:
    f_in = open(in_file_name,'r')
  except:
    inout.log_message('Cannot open input file: '+in_file_name,'err')
    raise IOError()

  line_count = 0
  for line in f_in.xreadlines():
    line_count += 1
  f_in.close()

  if ((first_rec+num_rec) > line_count):  # Illegal value for last record
    print '***** Error: Illegal values for number of records to process:',
    print num_rec, ', with start record:', first_rec
    print '*****        File only contains',line_count, 'lines/records'
    raise Exception()

  # Open files  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  try:
    f_in = open(in_file_name,'r')
  except:
    inout.log_message('Cannot open input file: '+in_file_name,'err')
    raise IOError()

  try:
    f_out = open(out_file_name,'w')
  except:
    inout.log_message('Cannot open output file: '+out_file_name,'err')
    raise IOError()

  # Write header (name of output fields) into output file - - - - - - - - - - -
  #
  if (write_header == 1):
    header_dict = {}
    for n in config.output_field_names:
      header_dict.update({n:n})  # Dictionary where values are field names

    header_line = inout.compose_line(header_dict,header=1)
    f_out.write(header_line+os.linesep)

  # Skip over records - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  if (first_rec > 0):
    for i in range(first_rec):
      f_in.readline()

  # Read lines, process them and write into output files  - - - - - - - - - - -
  #
  line_read = 0  # Number of read lines

  while (line_read < num_rec):  # Loop until 'num_rec' records processed
    line = f_in.readline()

    # Print process indicator message
    #
    if (config.proc_ind >= 0) and (line_read > 0):  # Only print if activated
      if (line_read % config.proc_ind == 0):
        print 'Processed line', line_read, 'of', num_rec

    line = line.strip()  # Remove line separators
    config.curr_line = line  # Make a copy of the unprocessed current line

    line = line.lower()  # Make all characters lower case

    inout.log_message(['Record '+str(line_read+first_rec)],'v1')
    config.curr_line_no = line_read+first_rec  # Store current line number

    # Process line and extract content into components (name, geocode, etc.)
    #
    [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \
           inout.process_line(line)

    # Make a local empty working copy of the output field dictionary  - - - - -
    #
    output_fields = config.output_field.copy()
    output_fields_keys = output_fields.keys()
    for k in output_fields_keys:
      output_fields[k] = ''  # Set all fields to an empty string

    # Standardise name component  - - - - - - - - - - - - - - - - - - - - - - -
    #
    if (type(name_comp) == types.ListType):  # Givenname and surname separate

      givenname_comp = name_comp[0].strip()
      surname_comp   = name_comp[1].strip()

      if (givenname_comp != ''):  # There is a givenname  - - - - - - - - - - -

        inout.log_message('  Givenname component: |'+givenname_comp+'|','v1')

        givenname_comp = name.clean_name_component(givenname_comp)
        [name_list, tag_list] = name.tag_name_component(givenname_comp)
        output_fields['gender_guess'] = name.get_gender_guess(name_list, \
                                        tag_list)
        [name_list, tag_list, output_fields['title']] = \
                                         name.get_title(name_list, tag_list)

        [output_fields['givenname'], output_fields['alt_givenname']] = \
                       name.get_name_component(name_list, tag_list, 'gname')

      if (surname_comp != ''):  # There is a surname  - - - - - - - - - - - - -

        inout.log_message('  Surname component: |'+surname_comp+'|','v1')

        surname_comp = name.clean_name_component(surname_comp)
        [name_list, tag_list] = name.tag_name_component(surname_comp)
        [output_fields['surname'], output_fields['alt_surname']] = \
                        name.get_name_component(name_list, tag_list, 'sname')

    elif (name_comp.strip() != ''):  # Given- and surname both in one field - -

      inout.log_message('  Name component: |'+name_comp+'|','v1')

      name_comp = name.clean_name_component(name_comp)
      [name_list, tag_list] = name.tag_name_component(name_comp)

      output_fields['gender_guess'] = name.get_gender_guess(name_list,tag_list)

      [name_list, tag_list, output_fields['title']] = \
                                        name.get_title(name_list, tag_list)

      if (config.name_standard_method == 'rules'):
        name_dict = name.get_names_rules(name_list, tag_list, 'gname')

      elif (config.name_standard_method == 'hmm'):
        name_dict = name.get_names_hmm(name_list, tag_list)

      else:
        inout.log_message('Illegal name standardisation method:'+ \
                          config.name_standard_method,'err')
        raise Exception()

      for (field,value) in name_dict.items():  # Assign to output dictionary
        output_fields[field] = value

    # Standardise geocode and locality components using HMM - - - - - - - - - -
    #
    if (config.geoloc_standard_method == 'hmm') and \
       ((geocode_comp.strip() != '') or (locality_comp.strip() != '')):

      geoloc_comp = geocode_comp.strip()+' '+locality_comp.strip()
      inout.log_message('  Geocode and locality component: |'+geoloc_comp+'|',\
                        'v1')

      geoloc_comp = locality.clean_geoloc_component(geoloc_comp)
      [geoloc_words, geoloc_tags] = locality.tag_geoloc_component(geoloc_comp)

      if (geoloc_words != []):  # Component not empty, do HMM standardisation

        geoloc_dict = locality.get_geoloc_hmm(geoloc_words,geoloc_tags)

        for (field,value) in geoloc_dict.items(): # Assign to output dictionary
          output_fields[field] = value

    # Standardise geocode component using rules - - - - - - - - - - - - - - - -
    #
    elif (config.geoloc_standard_method == 'rules') and \
         (geocode_comp.strip() != ''):
      inout.log_message('  Geocode component: |'+geocode_comp+'|','v1')

      ### TO BE DONE
      inout.log_message('Rules based standardisation for geocode is' + \
                        'not implemented yet','err')
      raise Exception()

    # Standardise locality component using rules  - - - - - - - - - - - - - - -
    #
    elif (config.geoloc_standard_method == 'rules') and \
         (locality_comp.strip() != ''):
      inout.log_message('  Locality component: |'+locality_comp+'|','v1')

      ### TO BE FINALISED
      inout.log_message('Rules based standardisation for locality is' + \
                        'not implemented yet','err')
      raise Exception()

#      locality_comp = locality.clean_geoloc_component(locality_comp)
#      [loc_words, loc_tags] = locality.tag_geoloc_component(locality_comp)
#
#      [terr,loc_words2,loc_tags2] = locality.get_territory(loc_words,loc_tags)
#      if (terr != ''):
#        output_fields['territory'] = terr
#
#      [pc,loc_words3,loc_tags3] = locality.get_postcode(loc_words2,loc_tags2)
#      if (pc != ''):
#        output_fields['postcode'] = pc
#
#      [loc_name, loc_quali, loc_words4, loc_tags4] = \
#         locality.get_localityname_qualifier(loc_words3, loc_tags3)
#      if (loc_name != ''):
#        output_fields['locality_name'] = loc_name
#      if (loc_quali != ''):
#        output_fields['locality_quali'] = loc_quali
#
#      if (loc_words4 != []):  # Not all words are standardised yet
#        print '  # Remaining word list:', loc_words4  ###### TEST
#        print '  # Remaining tag list: ', loc_tags4   ###### TEST

    # Standardise date strings  - - - - - - - - - - - - - - - - - - - - - - - -
    #
    if (date1_comp != ''):
      inout.log_message('  Date1 component: |'+date1_comp+'|','v1')

      [day1,month1,year1,status1] = date.parse_datestr(date1_comp)
      if (day1 != -1):
        output_fields['day1'] = str(day1)
      if (month1 != -1):
        output_fields['month1'] = str(month1)
      if (year1 != -1):
        output_fields['year1'] = str(year1)

    if (date2_comp != ''):
      inout.log_message('  Date2 component: |'+date2_comp+'|','v1')

      [day2,month2,year2,status2] = date.parse_datestr(date2_comp)
      if (day2 != -1):
        output_fields['day2'] = str(day2)
      if (month2 != -1):
        output_fields['month2'] = str(month2)
      if (year2 != -1):
        output_fields['year2'] = str(year2)

    # Create log message of output fields - - - - - - - - - - - - - - - - - - -
    #
    msg = ['  Standardised record output fields:']
    for (field,value) in output_fields.items():
      if (value != '') and (value != []):
        msg.append('    '+field+':'+str(value))
    inout.log_message(msg,'v1')

    # Save standardised record into output field
    #
    out_line = inout.compose_line(output_fields)
    f_out.write(out_line+os.linesep)

    # Increment line counter and go to beginning of loop  - - - - - - - - - - -
    #
    line_read += 1

    inout.log_message('','v1')  # Print empty lines between records

  # Close files - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  f_in.close()
  f_out.close()

  msg = ['','Number of warnings: '+str(config.num_warning), \
         'Number of corrected word spills: '+str(config.num_word_spills)]
  inout.log_message(msg,'v1')

  print msg[1]
  print msg[2]

  inout.log_message('End.','v1')
import simplehmm

# HMM states and observation symbols (tags); the two lists below are assumed
# here from the symbols that appear in the training and test sequences.

test_hmm_states = ['title', 'givenname', 'surname']
test_hmm_observ = ['TI', 'GM', 'GF', 'SN', 'UN']

# Training data (state, observation) sequences, one list per record

train_data = [[('title', 'TI'), ('givenname', 'SN'), ('surname', 'SN')],
              [('givenname', 'GM'), ('surname', 'SN')],
              [('title', 'TI'), ('givenname', 'GF'), ('surname', 'SN')],
              [('title', 'TI'), ('surname', 'SN'), ('givenname', 'GM')],
              [('surname', 'UN'), ('givenname', 'UN')],
              [('givenname', 'GF'), ('surname', 'GF'), ('surname', 'SN')]]

# Some test examples (observation (tag) sequences), one per line

test_data = [['TI', 'GM', 'SN'], ['UN', 'SN'], ['TI', 'UN', 'UN'],
             ['TI', 'GF', 'UN'], ['UN', 'UN', 'UN', 'UN'],
             ['TI', 'GM', 'UN', 'SN'], ['GF', 'UN']]

# Initialise a new HMM and train it

test_hmm = simplehmm.hmm('Test HMM', test_hmm_states, test_hmm_observ)
test_hmm.train(train_data)  # Train the HMM

test_hmm.check_prob()  # Check its probabilities
test_hmm.print_hmm()  # Print it out

# Apply the Viterbi algorithm to each sequence of the test data

for test_rec in test_data:
    [state_sequence, sequence_probability] = test_hmm.viterbi(test_rec)

# Initialise and train a second HMM using the same training data and
# applying Laplace smoothing

test_hmm2 = simplehmm.hmm('Test HMM 2', test_hmm_states, test_hmm_observ)
test_hmm2.train(train_data, smoothing='laplace')
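
# A minimal sketch (not part of the original snippet): inspecting the Viterbi
# output of the smoothed model on the test sequences above, using the return
# value [state_sequence, sequence_probability] shown earlier.

for test_rec in test_data:
    [state_sequence, sequence_probability] = test_hmm2.viterbi(test_rec)
    print 'Observations:  ', test_rec
    print 'State sequence:', state_sequence
    print 'Probability:    %f' % (sequence_probability)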
Exemplo n.º 15
0
import math
import simplehmm
lnc_hmm = simplehmm.hmm('LncRNA',  ['dummy'], ['dummy'])
lnc_hmm.load_hmm('hmm_treinamento_lnc_sem_sorter_threshold.hmm')
lnc_hmm.print_hmm()  # Print it out
print math.log(10)
Exemplo n.º 16
0
import simplehmm
hmm_teste = simplehmm.hmm("mrna",['nada'],['nada'])
hmm_teste.load_hmm('mRNAsTRAIN.hmm')
hmm_teste.print_hmm()
hmm_teste.load_hmm('randomseq.hmm')
hmm_teste.print_hmm()
'''TRAINING

'''


#cirRNA_hmm= simplehmm.hmm('cirRNA_hmm_primeiro_teste',estados,emissoes)
#cirRNA_hmm.train(entrada_treinamento, smoothing='absdiscount')
#cirRNA_hmm.save_hmm("circ.hmm")
#print cirRNA_hmm.print_hmm()



'''VALIDATION'''

cirRNA_hmm = simplehmm.hmm('circ_rna primeiros testes', ['dummy'], ['dummy'])
cirRNA_hmm.load_hmm('circ.hmm')
#print query[1]
#print cirRNA_hmm.print_hmm()


print len(query[0]),'size'
for query_out in query:
    print query_out



print cirRNA_hmm.viterbi(query[0])[1]


print "------------------------------------------"
Exemplo n.º 18
0
                     'locql','pc','ter1','ter2','cntr1','cntr2','rubb']
geoloc_hmm_obser  = ['PC','N4','NU','AN','TR','CR','LN','ST','IN','IT', \
                     'LQ','WT','WN','UT','HY','SL','CO','VB','PA','UN', \
                     'RU']

# =============================================================================
# Dictionary of month name abbreviations (used in date.str2date() routine)

month_abbrev_dict = {'jan':1, 'feb':2, 'mar':3, 'apr':4, 'may':5, 'jun':6, \
                     'jul':7, 'aug':8, 'sep':9, 'oct':10, 'nov':11, 'dec': 12}
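
# Minimal usage sketch (assumption, not part of the original module): looking
# up the month number for an abbreviated month name found in a date string,
# with -1 returned for unknown abbreviations.

month_word = 'Sep'
month_num = month_abbrev_dict.get(month_word.lower()[:3], -1)
print 'Month number for "%s": %d' % (month_word, month_num)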

# =============================================================================
# If Hidden Markov Model standardisation methods are activated, load HMM(s)

if (project.name_standard_method == 'hmm'):
  name_hmm = simplehmm.hmm([],[])  # Create new empty HMM object
  name_hmm.load_hmm(project.name_hmm_file_name)

if (project.geoloc_standard_method == 'hmm'):
  geoloc_hmm = simplehmm.hmm([],[])  # Create new empty HMM object
  geoloc_hmm.load_hmm(project.geoloc_hmm_file_name)

# =============================================================================
# List of all supported data file types
#
# File type names must have a length of 3 characters, or 4 characters if the
# file type is quoted (in which case the last character must be a 'Q')
#
# Currently supported file types are:
#   CSV  - Comma separated values, fields separated by commas
#   CSVQ - Comma separated values, where each field starts and ends with
Exemplo n.º 19
0
def trainhmm():
    """Main routine, open file, read lines, train HMM and save it to file.

  USAGE:
    trainhmm()

  ARGUMENTS:
    None

  DESCRIPTION:
    Main routine, see description of module above.
  """

    # Process command line arguments and check for correctness  - - - - - - - - -
    #
    if (len(config.options) < 3):
        print '***** Error: %s needs at least four arguments:' % (sys.argv[0])
        print '*****        - Name of the project module'
        print '*****        - Tagging mode: "name" or "locality"'
        print '*****        - Input training file name'
        print '*****        - HMM output file name'
        print '*****          plus options'
        raise Exception()

    if (config.options[1] == config.options[2]):
        print '*** Error: Input and output files must differ'
        print '***        Input training file name:', config.options[1]
        print '***        HMM output file name:    ', config.options[2]
        raise Exception()

    in_file_name = config.options[1]
    hmm_file_name = config.options[2]

    # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - -
    #
    tag_mode = config.options[0]
    if (tag_mode in ['name', 'na', 'n']):
        tag_mode = 'name'
    elif (tag_mode in ['locality', 'localty', 'loc', 'l']):
        tag_mode = 'loc'
    else:
        print '***** Error: Illegal tagging mode:', tag_mode
        print '*****        Must be either "name" or "locality"'
        raise Exception()

    # Check for optional arguments and process if any - - - - - - - - - - - - - -
    #
    config.verbose = 0  # Default: No verbose output
    config.logging = 0  # Default: No logging into a file
    smoothing = None  # Default: No smoothing
    config.nowarn = 0  # Deactivate no warning flag (print/log warning
    # messages)

    if (len(config.options) > 3):
        options = config.options[3:]
        while (options != []):  # Do a loop processing all options

            if (options[0] == '-nowarn'):
                config.nowarn = 1  # Activate no warning flag
                options = options[1:]  # Remove processed '-nowarn' option

            elif (options[0] == '-v1'):
                config.verbose = 1  # Set to verbose output level 1
                options = options[1:]  # Remove processed '-v1' option

            elif (options[0] == '-v2'):
                config.verbose = 2  # Set to verbose output level 2
                options = options[1:]  # Remove processed '-v2' option

            elif (options[0] == '-l'):
                config.logging = 1
                if (len(options) > 1):
                    if (options[1][0] !=
                            '-'):  # Not another option, must be a file name
                        config.log_file = options[1]  # Get name of log file
                        options = options[1:]  # Remove file_name
                options = options[1:]  # Remove processed -'l' option only

                try:
                    f_log = open(config.log_file,
                                 'a')  # Test if file is appendable
                except:
                    print '***** Error ********************',
                    print '***** Cannot write to log file: ' + config.log_file
                    raise IOError()

                # Write (append) header to log file
                #
                f_log.write(os.linesep)
                f_log.write(
                    '##################################################')
                f_log.write("############" + os.linesep)
                f_log.write("#" + os.linesep)
                f_log.write(
                    "# 'pyTrainHMM.py - Version 0.1' process started at: ")
                f_log.write(time.ctime(time.time()) + os.linesep)
                f_log.write("#" + os.linesep)
                f_log.write("# Input file name: " + in_file_name + os.linesep)
                f_log.write("# HMM file name:   " + hmm_file_name + os.linesep)
                f_log.write(os.linesep)
                f_log.close()

            elif (options[0] == '-s'):
                smoothing = 1  # Set to do a HMM smoothing
                smoothing = options[1]
                if (smoothing in ['l', 'la', 'lap', 'laplac', 'laplace']):
                    smoothing = 'laplace'
                elif (smoothing in ['a','ad','abs','absd','absdis','absdisc',\
                       'absdiscount']):
                    smoothing = 'absdiscount'
                else:  # Illegal value
                    print "*** Error: Illegal value for 'smoothing' argument:", smoothing
                    print "***        Possible are: 'laplace' or 'absdiscount'"
                    raise Exception()

                options = options[2:]  # Remove processed option

            else:
                print '*** Error: Illegal option:', options[0]
                raise Exception()

    # Get HMM states and observations from configuration module - - - - - - - - -
    #
    if (tag_mode == 'name'):
        state_list = config.name_hmm_states
        obser_list = config.name_hmm_obser

    else:
        state_list = config.geoloc_hmm_states
        obser_list = config.geoloc_hmm_obser

    # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    try:
        f_in = open(in_file_name, 'r')
    except:
        inout.log_message('Cannot open input file: ' + in_file_name, 'err')
        raise IOError()

    line_count = 0  # Counter for lines read
    rec_count = 0  # Counter for training records read

    # Read lines, discard comment lines and process training data lines - - - - -
    #
    training_data = []  # List of training records

    train_list = []  # List of training sequences (one list of (state, tag)
                     # pairs per record), extracted from the training data

    for line in xreadlines.xreadlines(f_in):

        if (line[0] != '#') and (line.strip() != ''):
            # Line must contain a training record

            line = line.strip()  # Remove line separators
            config.curr_line = line  # Make a copy of the unprocessed current line

            line_list = line.split(',')  # Split into a list of elements
            line_data = []  # Training data list for one training record

            inout.log_message(['Record number: ' + str(rec_count)], 'v1')
            config.curr_line_no = line_count  # Store current line number

            for elem in line_list:
                [k, v] = elem.split(':')  # Split into key and value
                tag = k.strip()
                state = v.strip()
                line_data.append((state, tag))

                if (state not in state_list):
                    msg = ['Illegal state name in training record: '+state, \
                           'Line: '+str(line_count)+', record: '+str(rec_count), \
                           'Possible values: '+str(state_list)]
                    inout.log_message(msg, 'err')
                    raise Exception()

                if (tag not in obser_list):
                    msg = ['Illegal observation (tag) name in training record: '+tag, \
                           'Line: '+str(line_count)+', record: '+str(rec_count), \
                           'Possible values: '+str(obser_list)]
                    inout.log_message(msg, 'err')
                    raise Exception()

            inout.log_message('  Training record '+str(rec_count)+':'+ \
                              str(line_data),'v1')

            train_list.append(line_data)

            rec_count += 1
            inout.log_message('', 'v1')  # Print empty lines between records

        line_count += 1

    # Close input file  - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    f_in.close()

    inout.log_message('', 'v1')  # Print empty lines between records

    # Initialise HMM and train it with training data - - - - - - - - - - - - - -
    #
    myhmm = simplehmm.hmm(state_list, obser_list)

    myhmm.train(train_list, smoothing)
    myhmm.print_hmm()

    # Save trained HMM  - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    myhmm.save_hmm(hmm_file_name)

    inout.log_message(['Read '+str(line_count)+' lines, processed '+ \
                      str(rec_count)+' training records', 'End.'],'v1')
Exemplo n.º 21
0
def tagdata():
    """Main routine, open file, read lines, tag data records, write to out-file.

  USAGE:
    tagdata()

  ARGUMENTS:
    None

  DESCRIPTION:
    Main routine, see description of module above.
  """

    # Process command line arguments and check for correctness  - - - - - - - - -
    #
    if (len(config.options) < 5):
        print '***** Error: %s needs at least six arguments:' % (sys.argv[0])
        print '*****        - Name of the project module'
        print '*****        - Tagging mode: "name" or "locality"'
        print '*****        - Output training file name'
        print '*****        - Start of block with training records'
        print '*****        - End of block with training records'
        print '*****        - Number of training records'
        print '*****          plus options'
        raise Exception()

    if (config.in_file_name == config.options[1]):
        print '***** Error: Input and output files must differ'
        print '*****        Input file name:          ', config.in_file_name
        print '*****        Output training file name:', config.options[1]
        raise Exception()

    first_rec = int(config.options[2])
    last_rec = int(config.options[3])
    num_rec = int(config.options[4])
    in_file_name = config.in_file_name
    out_file_name = config.options[1]

    # Check record number values  - - - - - - - - - - - - - - - - - - - - - - - -
    #
    if (int(first_rec) >= int(last_rec)) or \
       ((int(num_rec)-1) > (int(last_rec)-int(first_rec))):
        print '***** Error: Illegal values for training records block:'
        print '*****        - Start of block with training records:', first_rec
        print '*****        - End of block with training records:  ', last_rec
        print '*****        - Number of training records:          ', num_rec
        raise Exception()

    rec_range = last_rec - first_rec - 1  # Range of records in input file

    # Open input file and check number of available records - - - - - - - - - - -
    #
    try:
        f_in = open(in_file_name, 'r')
    except:
        inout.log_message('Cannot open input file: ' + in_file_name, 'err')
        raise IOError()

    line_count = 0
    for line in f_in.xreadlines():
        line_count += 1
    f_in.close()

    if (last_rec > line_count):  # Illegal value for last record
        print '***** Error: Illegal values for last training records:', last_rec
        print '*****        File only contains', line_count, 'lines/records'
        raise Exception()

    # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - -
    #
    tag_mode = config.options[0]
    if (tag_mode in ['name', 'na', 'n']):
        tag_mode = 'name'
    elif (tag_mode in ['locality', 'localty', 'loc', 'l']):
        tag_mode = 'loc'
    else:
        print '***** Error: Illegal tagging mode:', tag_mode
        print '*****        Must be either "name" or "locality"'
        raise Exception()

    # Check for optional arguments and process if any - - - - - - - - - - - - - -
    #
    config.verbose = 0  # Default: No verbose output
    config.logging = 0  # Default: No logging into a file
    hmm_file_name = None  # Default: Do not use HMM to standardise training
    #          records
    retag_file_name = None  # Default: Do not retag an existing training file
    config.nowarn = 0  # Deactivate no warning flag (print/log warning
    # messages)
    freqs_file_name = None  # Default: Do not write frequencies, no -freqs option

    if (len(config.options) > 5):
        options = config.options[5:]
        while (options != []):  # Do a loop processing all options

            if (options[0] == '-nowarn'):
                config.nowarn = 1  # Activate no warning flag
                options = options[1:]  # Remove processed '-nowarn' option

            elif (options[0] == '-v1'):
                config.verbose = 1  # Set to verbose output level 1
                options = options[1:]  # Remove processed '-v1' option

            elif (options[0] == '-v2'):
                config.verbose = 2  # Set to verbose output level 2
                options = options[1:]  # Remove processed '-v2' option

            elif (options[0] == '-l'):
                config.logging = 1
                if (len(options) > 1):
                    if (options[1][0] !=
                            '-'):  # Not another option, must be a file name
                        config.log_file = options[1]  # Get name of log file
                        options = options[1:]  # Remove file_name
                options = options[1:]  # Remove processed -'l' option only

                try:
                    f_log = open(config.log_file,
                                 'a')  # Test if file is appendable
                except:
                    print '***** Error ********************',
                    print '***** Cannot write to log file: ' + config.log_file
                    raise IOError()

                # Write (append) header to log file
                #
                f_log.write(os.linesep)
                f_log.write(
                    '##################################################')
                f_log.write('############' + os.linesep)
                f_log.write('#' + os.linesep)
                f_log.write(
                    "# 'pyTagData.py - Version 0.1' process started at: ")
                f_log.write(time.ctime(time.time()) + os.linesep)
                f_log.write('#' + os.linesep)
                f_log.write("# Input file name:  " + in_file_name + os.linesep)
                f_log.write("# Output file name: " + out_file_name +
                            os.linesep)
                f_log.write("# Tagging mode:     " + tag_mode + os.linesep)
                f_log.write(os.linesep)
                f_log.close()

            elif (options[0] == '-hmm'):
                hmm_file_name = options[1]  # Get file name of the HMM to use
                if (hmm_file_name == out_file_name):
                    print '***** Error: HMM file name is the same as output file name!'
                    raise Exception()

                try:
                    f_in = open(hmm_file_name,
                                'r')  # Test if file is available
                except:
                    print '***** Error: Cannot open HMM file specified in "-hmm"',
                    print 'option:', hmm_file_name
                    raise IOError()
                f_in.close()
                options = options[
                    2:]  # Remove processed '-hmm' option and file name

            elif (options[0] == '-retag'):
                if (hmm_file_name == None) and ('-hmm' not in options):
                    print '***** Error: "-retag" option can only be used together with',
                    print '"-hmm" option (which is not given).'
                    raise Exception()

                retag_file_name = options[
                    1]  # Get file name of the already-tagged
                # file to re-process
                if (retag_file_name == out_file_name):
                    print '***** Error: Retag file name is the same as output file name!'
                    raise Exception()
                elif (retag_file_name == in_file_name):
                    print '***** Error: Retag file name is the same as input file name!'
                    raise Exception()
                elif (retag_file_name == hmm_file_name):
                    print '***** Error: Retag file name is the same as HMM file name!'
                    raise Exception()

                try:
                    f_in = open(retag_file_name,
                                'r')  # Test if file is available

                    # Now gather record numbers and previous tags/states, as well as the
                    # original header information. Use a simple state machine to do this.
                    #
                    tagged_recs = {}
                    cleaned_recs = {}
                    original_header_lines = []
                    state = -1  # Header lines state
                    prevline = ''

                    for line in f_in.xreadlines(
                    ):  # Read training file and process it
                        line = line.strip()

                        if (state == -1) and (len(line)
                                              == 0):  # End of header lines
                            state = 0
                            prevline = line
                            continue

                        if (state == -1) and (len(line) > 0) and (line[0]
                                                                  == "#"):
                            original_header_lines.append("# " + line)
                            prevline = line
                            continue
                        sline = line.split(' ')

                        if (len(sline) > 2) and (len(sline[2]) > 3) and (sline[0] == '#') \
                           and (sline[2][0] == '(') and (sline[2][-2:] == '):'):
                            try:
                                rec = int(sline[1])  # Original record number
                                tagged_recs[rec] = None
                                cleaned_recs[rec] = None
                                state = 1
                            except:
                                pass
                            prevline = line
                            continue

                        if (state
                                == 1) and (len(line) > 0) and (line[0] != '#'):
                            tagged_recs[rec] = line
                            cleaned_recs[rec] = prevline
                            state = 0
                            prevline = line
                            continue

                        if (state == 1) and (len(line) > 0):
                            prevline = line
                            continue

                    f_in.close()
                    tagged_recs_keys = tagged_recs.keys()

                    num_rec = len(
                        tagged_recs_keys)  # Override specified numbers
                    first_rec = 0
                    last_rec = line_count

                except:
                    print '***** Error: Cannot open tagged training file specified',
                    print 'in "-retag" option:', retag_file_name
                    raise IOError()

                options = options[
                    2:]  # Remove processed '-retag' option and file name

            elif (options[0][:5] == '-freq'):
                if (hmm_file_name == None) and ('-hmm' not in options):
                    print '***** Error: "-freqs" option can only be used together with',
                    print '"-hmm" option (which is not given).'
                    raise Exception()

                freqs_file_name = options[
                    1]  # File name to write the frequencies to
                if (freqs_file_name == out_file_name):
                    print '***** Error: Frequency file name is the same as output',
                    print 'file name!'
                    raise Exception()
                elif (freqs_file_name == in_file_name):
                    print '***** Error: Frequency file name is the same as input',
                    print 'file name!'
                    raise Exception()
                elif (freqs_file_name == hmm_file_name):
                    print '***** Error: Frequency file name is the same as HMM',
                    print 'file name!'
                    raise Exception()

                options = options[
                    2:]  # Remove processed '-freqs' option and file name
                try:  # Check if file writing is possible
                    freqs_out = open(freqs_file_name, 'w')
                    freqs_out.close()
                except:
                    print '***** Error: Cannot write to frequency output file specified',
                    print 'in "-freqs" option:', freqs_file_name
                    raise IOError()

            else:
                print '***** Error: Illegal option:', options[0]
                raise Exception()

    # If specified, initialise and load Hidden Markov Model (HMM) - - - - - - - -
    #
    if (hmm_file_name != None):
        myhmm = simplehmm.hmm([], [])  # Create new empty HMM object
        myhmm.load_hmm(hmm_file_name)
        myhmm.print_hmm()  # Print HMM (according to verbose and logging level)

    # Open output file and write header - - - - - - - - - - - - - - - - - - - - -
    #
    try:
        f_out = open(out_file_name, 'w')
    except:
        inout.log_message('Cannot open output file: ' + out_file_name, 'err')
        raise IOError()

    f_out.write("# Tagged training data written by 'pyTagData.py -"+ \
                " Version 0.1'"+os.linesep)
    f_out.write('#' + os.linesep)
    f_out.write('# Created ' + time.ctime(time.time()) + os.linesep)
    f_out.write('#' + os.linesep)
    f_out.write('# Input file name:  ' + in_file_name + os.linesep)
    f_out.write('# Output file name: ' + out_file_name + os.linesep)
    f_out.write('#' + os.linesep)
    f_out.write('# Parameters:' + os.linesep)
    f_out.write('# - Start of block with training records: '+str(first_rec)+ \
                os.linesep)
    f_out.write('# - End of block with training records:   '+str(last_rec)+ \
                os.linesep)
    f_out.write('# - Number of training records:           '+str(num_rec)+ \
                os.linesep)
    if (hmm_file_name != None):
        f_out.write('#' + os.linesep)
        f_out.write("# - Using HMM file '"+hmm_file_name+"' for standardisation"+ \
                    os.linesep)
    if (retag_file_name != None):
        f_out.write('#' + os.linesep)
        f_out.write("# - Reprocessing training file '"+retag_file_name+"'"+ \
                    os.linesep)
        f_out.write("#   Header lines from original training file follow:" + \
                    os.linesep)
        for header_line in original_header_lines:
            f_out.write(header_line + os.linesep)
    if (freqs_file_name != None):
        f_out.write('#' + os.linesep)
        f_out.write("# - Tag/state pattern frequencies written to file '" + \
                    freqs_file_name + "'" + os.linesep)
    f_out.write('#' + '-' * 70 + os.linesep)
    f_out.write(os.linesep)

    rec_count = 0  # Number of selected records
    num_rec_left = num_rec  # Number of records to be selected left
    rec_selected = {}  # Dictionary of all record numbers that were selected
    seq_freqs = {}  # Dict to hold examples of tag/state patterns

    unchanged_loop_cnt = 0  # Counter of how many loops have been done
    # without new records being selected
    prev_num_rec_left = num_rec  # Number of records left in the previous
    # iteration

    # Due to the random nature of selecting records, and because sometimes  - - -
    # a selected component can be empty (and is thus not used for training)
    # more than one iteration over the input data set is carried out. In each
    # iteration, records are selected randomly.
    #
    while (rec_count < num_rec):  # Loop until 'num_rec' records selected

        # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        #
        try:
            f_in = open(in_file_name, 'r')
        except:
            inout.log_message('Cannot open input file: ' + in_file_name, 'err')
            raise IOError()

        line_read = 0  # Number of read lines

        # Skip to start of training block - - - - - - - - - - - - - - - - - - - - -
        #
        if (first_rec > 0):
            for i in range(first_rec):
                f_in.readline()

        while (rec_count < num_rec) and (line_read <= (last_rec - first_rec)):
            line = f_in.readline()

            if ((retag_file_name != None) and (line_read in tagged_recs_keys)) or \
               ((retag_file_name == None) and \
                (num_rec_left >= random.randrange(0,rec_range,1))):

                line = line.strip()  # Remove line separators
                config.curr_line = line  # Make a copy of the unprocessed current line

                line = line.lower()  # Make all characters lower case

                inout.log_message(
                    ['Record number: ' + str(line_read + first_rec)], 'v1')
                config.curr_line_no = line_read + first_rec  # Store current line number

                # Process line and extract content into components (name, geocode, etc)
                #
                [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \
                   inout.process_line(line)

                # Select component and process it - - - - - - - - - - - - - - - - - - -
                #
                if (tag_mode == 'name'):
                    if (type(name_comp) == types.ListType):
                        component = name_comp[0].strip(
                        ) + ' ' + name_comp[1].strip()
                    else:
                        component = name_comp.strip()
                else:  # Locality component
                    component = geocode_comp.strip(
                    ) + ' ' + locality_comp.strip()

                if (component != '') and \
                   (not rec_selected.has_key((line_read+first_rec))):

                    if (tag_mode == 'name'):
                        inout.log_message(
                            '  Name component: |' + component + '|', 'v1')

                        component = name.clean_name_component(component)
                        [word_list,
                         tag_list] = name.tag_name_component(component)

                    else:  # Locality component
                        inout.log_message(
                            '  Locality component: |' + component + '|', 'v1')

                        component = locality.clean_geoloc_component(component)
                        [word_list,
                         tag_list] = locality.tag_geoloc_component(component)

                    if (tag_list != []):  # Only process non-empty tag lists

                        # Append record number into dictionary of processed records
                        #
                        rec_selected.update({
                            (line_read + first_rec): (line_read + first_rec)
                        })

                        # Create all permutation sequences of this tag list - - - - - - - -
                        #
                        tag_seq = mymath.perm_tag_sequence(tag_list)

                        inout.log_message(['  Word list: '+str(word_list), \
                                           '  Tag list: '+str(tag_list), \
                                           '  Tag sequences:'],'v2')

                        # Do HMM processing - - - - - - - - - - - - - - - - - - - - - - - -
                        #
                        if (hmm_file_name != None):

                            state_seq = [
                            ]  # List containing computed HMM state sequences
                            max_prob = -1.0  # maximal probability for a sequence
                            max_seq_no = -1  # Number of the seq. with the max. probability

                            # Now give tag sequences to the HMMs to compute state sequences
                            #
                            i = 0
                            for t in tag_seq:
                                [obs_seq, prob] = myhmm.viterbi(t)
                                state_seq.append(obs_seq)
                                if (prob > max_prob):
                                    max_prob = prob
                                    max_seq_no = i
                                i += 1

                        # Write original component and resulting tag sequences to output
                        #
                        f_out.write('# '+str(line_read+first_rec)+' ('+str(rec_count)+ \
                                    '): |'+component+'|'+os.linesep) # Commented original
                        num_len = len(str(line_read + first_rec)) + len(
                            str(rec_count)) + 6

                        f_out.write('#' + num_len * ' ' + '|' +
                                    ' '.join(word_list) + '|' + os.linesep)

                        for i in range(len(tag_seq)):
                            # Convert each tag sequence into a string for file output
                            #
                            seq_string = '  '

                            if (hmm_file_name != None) and (i != max_seq_no):
                                seq_string = '# '  # Comment sequences with not max. probability

                            for j in range(len(tag_seq[i])):

                                if (hmm_file_name != None):
                                    seq_string = seq_string+' '+tag_seq[i][j]+':'+ \
                                                 state_seq[i][j]+','
                                else:
                                    seq_string = seq_string + ' ' + tag_seq[i][
                                        j] + ':,'

                            f_out.write(seq_string[:-1] +
                                        os.linesep)  # Write without , at end
                            inout.log_message('    ' + seq_string[:-1], 'v2')

                        if (hmm_file_name != None):
                            f_out.write('# Maximum Viterbi probability: %0.5f'% \
                                        (max_prob) + os.linesep)
                            inout.log_message('Maximum Viterbi probability: %0.5f'% \
                                              (max_prob), 'v2')

                        if (retag_file_name !=
                                None) and (tagged_recs[line_read] != None):
                            if (tagged_recs[line_read].strip() !=
                                    seq_string[:-1].strip()):
                                f_out.write("# Note: ***** Changed *****" +
                                            os.linesep)
                                inout.log_message('                      Note:' + \
                                                  ' ***** Changed *****','v2')
                                f_out.write('# Was: ' +
                                            tagged_recs[line_read] +
                                            os.linesep)
                                # Write commented original tag sequence
                                inout.log_message('Original tag sequence: '+ \
                                                  tagged_recs[line_read],'v2')

                        f_out.write(os.linesep)  # Write an empty line
                        inout.log_message(
                            '', 'v1')  # Print empty lines between records

                        if (hmm_file_name != None):
                            seq_key = seq_string[:
                                                 -1]  # Add sequence to dictionary
                            if (seq_freqs.has_key(seq_key)):
                                seq_freqs[seq_key].append(['|'+' '.join(word_list)+'|', \
                                                          max_prob])
                            else:
                                seq_freqs[seq_key] = [['|'+' '.join(word_list)+'|', \
                                                      max_prob]]

                        rec_count += 1

                        # Print process indicator message
                        #
                        if (config.proc_ind >= 0) and (rec_count > 0):
                            if (rec_count % config.proc_ind == 0):
                                print 'Processed line', rec_count, 'of', num_rec

            line_read += 1

        f_in.close()

        num_rec_left = num_rec - rec_count

        if (prev_num_rec_left == num_rec_left):  # No new records selected
            unchanged_loop_cnt += 1
        prev_num_rec_left = num_rec_left  # Set to current value

        if (unchanged_loop_cnt > 5):  # Do at most five loops without selecting
            # any new records
            config.curr_line_no = -1  # Set to illegal/empty values, as warning is
            config.curr_line = ''  # not related to the current input line
            inout.log_message(['Can not select more than '+str(rec_count)+ \
                               ' records for training.', \
                               'This is probably due to empty input components.', \
                               'Please reduce value of "num_rec" or increase ' + \
                               'range','between "first_rec" and "last_rec".'],'warn')
            break

        if (num_rec_left < 10):  # Only 10 records left to select
            num_rec_left = num_rec + 1  # Set to more than 100% probability
        elif (num_rec_left < (num_rec / 100.0)):  # Less than 1% records left
            num_rec_left = int(num_rec / 100.0)  # Set to 1%

    f_out.close()

    # If specified, save Viterbi frequencies to a file  - - - - - - - - - - - - -
    #
    if (freqs_file_name != None):
        freqs_out = open(freqs_file_name,
                         'w')  # Open frequency file for writing
        freqs_out.write('# Frequency listing of tag/state patterns written by ')
        freqs_out.write('"pyTagData.py - Version 0.1"' + os.linesep)
        freqs_out.write('#' + os.linesep)
        freqs_out.write('# Created ' + time.ctime(time.time()) + os.linesep)
        freqs_out.write('#' + os.linesep)
        freqs_out.write("# Input file name:  " + in_file_name + os.linesep)
        freqs_out.write("# Output file name: " + out_file_name + os.linesep)
        freqs_out.write(os.linesep)
        freqs_out.write('# Parameters:' + os.linesep)
        freqs_out.write('# - Start of block with training records: '+ \
                        str(first_rec)+os.linesep)
        freqs_out.write('# - End of block with training records:   '+ \
                        str(last_rec)+os.linesep)
        freqs_out.write('# - Number of training records:           '+ \
                        str(num_rec)+os.linesep)
        if (hmm_file_name != None):
            freqs_out.write('#' + os.linesep)
            freqs_out.write("# - Using HMM file '"+hmm_file_name+ \
                            "' for standardisation"+os.linesep)
        if (retag_file_name != None):
            freqs_out.write('#' + os.linesep)
            freqs_out.write("# - Reprocessing training file '"+retag_file_name+ \
                            "'"+os.linesep)
        freqs_out.write('#' + '-' * 70 + os.linesep)
        freqs_out.write(os.linesep)

        sorted_seq_freqs = []  # Now sort sequences according to their frequencies
        for key in seq_freqs.keys():
            sorted_seq_freqs.append((len(seq_freqs[key]), key))
        sorted_seq_freqs.sort()

        for skey in sorted_seq_freqs:
            key = skey[1]
            freqs_out.write('# Pattern: ' + str(key) + os.linesep)
            freqs_out.write('# Frequency: ' + str(skey[0]) + os.linesep)
            examples = seq_freqs[key]
            freqs_out.write('# Maximum Viterbi probability: '+ \
                            str(examples[0][1])+os.linesep)
            freqs_out.write('# Examples: ' + os.linesep)
            for example in examples:
                freqs_out.write('#    ' + str(example[0]) + os.linesep)
            freqs_out.write(str(key) + os.linesep)
            freqs_out.write(os.linesep)
        freqs_out.close()

    inout.log_message(['Read '+str(line_read)+' lines, processed '+ \
                      str(rec_count)+' lines', 'End.'],'v1')
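
# Hedged sketch (assumption, not part of the original module): the core of the
# HMM step above in isolation - every candidate tag sequence is scored with
# viterbi() and the most probable one is kept; 'hmm_model' and 'tag_sequences'
# are hypothetical arguments.

def best_tag_sequence(hmm_model, tag_sequences):
    max_prob = -1.0  # Maximal probability seen so far
    max_seq_no = -1  # Index of the sequence with the maximum probability
    best_states = None
    for i in range(len(tag_sequences)):
        [state_seq, prob] = hmm_model.viterbi(tag_sequences[i])
        if (prob > max_prob):
            max_prob = prob
            max_seq_no = i
            best_states = state_seq
    return [max_seq_no, best_states, max_prob]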
Exemplo n.º 22
0
            treinar.append(train)

    print classificar_vetor
    print(teste.get_dir())

    #teste.set_pontas_porcentagem(15)
    #print(teste.get_pontas())


    #organizar
    #print (organizar[0])
    #print(sorted(organizar)) # this command must be done using the first object ZERO and not ONE
    print (sorted(train))
    test_hmm_states = ['1', '3', '2']
    #test_hmm_observ=[]
    test_hmm = simplehmm.hmm('15 porcento lncRNA', test_hmm_states, nomes)
    #print test_hmm_observ
    '''
    we have to put all the train files into the table with append inside a FOR loop
    THE TRAINING TABLE HAS TO COME OUT ORGANISED, AND IT IS NOT
    '''


    #treinar.append(train)
    print '---------------------', treinar, '---------------------------------'
    #print test_hmm.check_prob()
    test_hmm.train(treinar, smoothing='absdiscount')
    #print test_hmm.check_prob()
    print test_hmm.print_hmm()
    if nome_saida == '':
        print ('Give a name for the output files')
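    # Hedged sketch (assumption, not part of the original snippet): saving the
    # trained model with save_hmm() so it can later be reloaded via load_hmm(),
    # as the other examples do; only done when an output name was supplied.
    if nome_saida != '':
        test_hmm.save_hmm(nome_saida + '.hmm')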
Exemplo n.º 23
0
                treinamento.append(entrada_vetor)
            #print entrada_vetor
            contador += 1
    if len(treinamento):
        entrada_treinamento.append(treinamento)
'''TRAINING

'''

#cirRNA_hmm= simplehmm.hmm('cirRNA_hmm_primeiro_teste',estados,emissoes)
#cirRNA_hmm.train(entrada_treinamento, smoothing='absdiscount')
#cirRNA_hmm.save_hmm("circ.hmm")
#print cirRNA_hmm.print_hmm()
'''VALIDATION'''

cirRNA_hmm = simplehmm.hmm('circ_rna primeiros testes', ['dummy'], ['dummy'])
cirRNA_hmm.load_hmm('circ.hmm')
#print query[1]
#print cirRNA_hmm.print_hmm()

print len(query[0]), 'size'
for query_out in query:
    print query_out

print cirRNA_hmm.viterbi(query[0])[1]

print "------------------------------------------"
print cirRNA_hmm.viterbi(query[0])[0]
#print cirRNA_hmm.viterbi(query[0])[0].index('BRANCH'),'<-BRANCH->',query[0][cirRNA_hmm.viterbi(query[0])[0].index('BRANCH')]
for i in range(1, 10):
    shuffle(query[0])
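
# Sketch (assumption): the body of the shuffle loop above is truncated in this
# excerpt; a plausible continuation compares the Viterbi probability of the
# real query against shuffled copies as a rough permutation check. Only the
# names cirRNA_hmm and query from the example above are reused, plus the
# standard-library shuffle.
from random import shuffle

real_prob = cirRNA_hmm.viterbi(query[0])[1]
for i in range(1, 10):
    shuffled_query = list(query[0])       # shuffle a copy, keep the original intact
    shuffle(shuffled_query)
    print real_prob, cirRNA_hmm.viterbi(shuffled_query)[1]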
Exemplo n.º 24
0
    def testHMM(self):  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
        """Test basic HMM functionality"""

        hmm1 = simplehmm.hmm("Test HMM", self.states, self.observ)

        assert hmm1.N == len(
            self.states), ("Illegal number of states in HMM (" + str(hmm1.N) +
                           "), should be: " + str(len(self.states)))
        assert len(hmm1.S_ind) == len(self.states), (
            "Illegal number of states in HMM state dictionary (" +
            str(len(hmm1.S_ind)) + "), should be: " + str(len(self.states)))

        assert hmm1.M == len(
            self.observ), ("Illegal number of observations in HMM (" +
                           str(hmm1.M) + "), should be: " +
                           str(len(self.observ)))
        assert len(hmm1.O_ind) == len(self.observ), (
            "Illegal number of observations in HMM observation dictionary (" +
            str(len(hmm1.O_ind)) + "), should be: " + str(len(self.observ)))

        for i in range(hmm1.N):
            assert hmm1.pi[i] == 0.0, (
                "Initial probability in HMM 1 is not 0.0 at location [" +
                str(i) + "]: " + str(hmm1.pi[i]))

            for j in range(hmm1.N):
                assert hmm1.A[i][j] == 0.0, (
                    "Transition probability in HMM 1 is not 0.0 at location ["
                    + str(i) + "," + str(j) + "]: " + str(hmm1.A[i][j]))
            for j in range(hmm1.M):
                assert hmm1.B[i][j] == 0.0, (
                    "Observation probability in HMM 1 is not 0.0 at location ["
                    + str(i) + "," + str(j) + "]: " + str(hmm1.B[i][j]))

        hmm1.train(self.train_data)
        hmm1.check_prob()
        hmm1.print_hmm()

        for i in range(hmm1.N):
            assert (hmm1.pi[i] >= 0.0) and (hmm1.pi[i] <= 1.0), (
                "Initial probability in HMM 1 is not between 0.0 and 1.0 at " +
                "location [" + str(i) + "]: " + str(hmm1.pi[i]))

            for j in range(hmm1.N):
                assert (hmm1.A[i][j] >= 0.0) and (hmm1.A[i][j] <= 1.0), (
                    "Transition probability in HMM 1 is not between 0.0 and 1.0"
                    + " at location [" + str(i) + "," + str(j) + "]: " +
                    str(hmm1.A[i][j]))
            for j in range(hmm1.M):
                assert (hmm1.B[i][j] >= 0.0) and (hmm1.B[i][j] <= 1.0), (
                    "Observation probability in HMM 1 is not between 0.0 and "
                    + "1.0 at location [" + str(i) + "," + str(j) + "]: " +
                    str(hmm1.B[i][j]))

        for test_rec in self.test_data:
            [state_seq, seq_prob] = hmm1.viterbi(test_rec)

            for state in state_seq:
                assert state in self.states, ('Returned state "' + state +
                                              '" not in state list')
            assert (seq_prob >= 0.0) and (seq_prob <= 1.0), (
                "Sequence probability is not between 0.0 and 1.0: " +
                str(seq_prob))

        hmm1.train(self.train_data, smoothing="laplace")
        hmm1.check_prob()
        hmm1.print_hmm()

        for i in range(hmm1.N):
            assert (hmm1.pi[i] >= 0.0) and (hmm1.pi[i] <= 1.0), (
                "Initial probability in HMM 1 is not between 0.0 and 1.0 at " +
                "location [" + str(i) + "]: " + str(hmm1.pi[i]))

            for j in range(hmm1.N):
                assert (hmm1.A[i][j] >= 0.0) and (hmm1.A[i][j] <= 1.0), (
                    "Transition probability in HMM 1 is not between 0.0 and 1.0"
                    + " at location [" + str(i) + "," + str(j) + "]: " +
                    str(hmm1.A[i][j]))
            for j in range(hmm1.M):
                assert (hmm1.B[i][j] >= 0.0) and (hmm1.B[i][j] <= 1.0), (
                    "Observation probability in HMM 1 is not between 0.0 and "
                    + "1.0 at location [" + str(i) + "," + str(j) + "]: " +
                    str(hmm1.B[i][j]))

        for test_rec in self.test_data:
            [state_seq, seq_prob] = hmm1.viterbi(test_rec)

            for state in state_seq:
                assert state in self.states, ('Returned state "' + state +
                                              '" not in state list')
            assert (seq_prob >= 0.0) and (seq_prob <= 1.0), (
                "Sequence probability is not between 0.0 and 1.0: " +
                str(seq_prob))

        hmm1.train(self.train_data, smoothing="absdiscount")
        hmm1.check_prob()
        hmm1.print_hmm()

        for i in range(hmm1.N):
            assert (hmm1.pi[i] >= 0.0) and (hmm1.pi[i] <= 1.0), (
                "Initial probability in HMM 1 is not between 0.0 and 1.0 at " +
                "location [" + str(i) + "]: " + str(hmm1.pi[i]))

            for j in range(hmm1.N):
                assert (hmm1.A[i][j] >= 0.0) and (hmm1.A[i][j] <= 1.0), (
                    "Transition probability in HMM 1 is not between 0.0 and 1.0"
                    + " at location [" + str(i) + "," + str(j) + "]: " +
                    str(hmm1.A[i][j]))
            for j in range(hmm1.M):
                assert (hmm1.B[i][j] >= 0.0) and (hmm1.B[i][j] <= 1.0), (
                    "Observation probability in HMM 1 is not between 0.0 and "
                    + "1.0 at location [" + str(i) + "," + str(j) + "]: " +
                    str(hmm1.B[i][j]))

        for test_rec in self.test_data:
            [state_seq, seq_prob] = hmm1.viterbi(test_rec)

            for state in state_seq:
                assert state in self.states, ('Returned state "' + state +
                                              '" not in state list')
            assert (seq_prob >= 0.0) and (seq_prob <= 1.0), (
                "Sequence probability is not between 0.0 and 1.0: " +
                str(seq_prob))

        hmm1.save_hmm("testhmm.hmm")

        hmm2 = simplehmm.hmm("Test2 HMM", ["dummy"], ["dummy"])

        hmm2.load_hmm("testhmm.hmm")

        assert hmm1.N == hmm2.N, "Loaded HMM has different number of states"
        assert hmm1.M == hmm2.M, "Loaded HMM has different number of observations"

        for i in range(hmm1.N):
            assert abs(hmm1.pi[i] - hmm2.pi[i]) < self.delta, (
                "Initial probability in HMM 1 is different from HMM 2: " +
                str(hmm1.pi[i]) + " / " + str(hmm2.pi[i]))

            for j in range(hmm1.N):
                assert abs(hmm1.A[i][j] - hmm2.A[i][j]) < self.delta, (
                    "Transition probability in HMM 1 is different from HMM 2 "
                    + "at location [" + str(i) + "," + str(j) + "]: " +
                    str(hmm1.A[i][j]) + " / " + str(hmm2.A[i][j]))

            for j in range(hmm1.M):
                assert abs(hmm1.B[i][j] - hmm2.B[i][j]) < self.delta, (
                    "Observation probability in HMM 1 is different from HMM 2 "
                    + "at location [" + str(i) + "," + str(j) + "]: " +
                    str(hmm1.B[i][j]) + " / " + str(hmm2.B[i][j]))
Exemplo n.º 25
0
def standard():
    """Main routine, open file, read lines, standardise them and write into file.

  USAGE:
    standard()

  ARGUMENTS:
    None

  DESCRIPTION:
    Main routine, see description of module above.
  """

    # Process command line arguments and check for correctness  - - - - - - - - -
    #
    if (len(config.options) < 2):
        print '***** Error: %s needs at least three arguments:' % (sys.argv[0])
        print '*****        - Name of the project module'
        print '*****        - Number of the first record to be processed'
        print '*****        - Number of records to be processed'
        print '*****          plus options'
        raise Exception()
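
    # A typical invocation of this routine would therefore be (sketch; the
    # project module and HMM file names below are hypothetical, the options
    # are those parsed further down):
    #
    #   python pyStandard.py myproject 0 1000 -v1 -h -hmm-name name.hmm -hmm-loc loc.hmm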

    first_rec = int(config.options[0])
    num_rec = int(config.options[1])
    in_file_name = config.in_file_name
    out_file_name = config.out_file_name

    # Check for optional arguments and process if any - - - - - - - - - - - - - -
    #
    config.verbose = 0  # Default: No verbose output
    config.logging = 0  # Default: No logging into a file
    write_header = 0  # Write header (output field names) to output file
    # (default: Don't)
    config.nowarn = 0  # Deactivate no warning flag (print/log warning messages)

    if (len(config.options) > 2):
        options = config.options[2:]
        while (options != []):  # Do a loop processing all options

            if (options[0] == '-nowarn'):
                config.nowarn = 1  # Activate no warning flag
                options = options[1:]  # Remove processed '-nowarn' option

            elif (options[0] == '-v1'):
                config.verbose = 1  # Set to verbose output level 1
                options = options[1:]  # Remove processed '-v1' option

            elif (options[0] == '-v2'):
                config.verbose = 2  # Set to verbose output level 2
                options = options[1:]  # Remove processed '-v2' option

            elif (options[0] == '-l'):
                config.logging = 1
                if (len(options) > 1):
                    if (options[1][0] != '-'):  # Not another option, must be a file name
                        config.log_file = options[1]  # Get name of log file
                        options = options[1:]  # Remove file_name
                options = options[1:]  # Remove processed -'l' option only

                try:
                    f_log = open(config.log_file, 'a')  # Test if file is appendable
                except:
                    print '***** Error ********************',
                    print '***** Cannot write to log file:', config.log_file
                    raise IOError()

                # Write (append) header to log file
                #
                f_log.write(os.linesep)
                f_log.write(
                    '##################################################')
                f_log.write("############" + os.linesep)
                f_log.write("#" + os.linesep)
                f_log.write(
                    "# 'pyStandard.py - Version 0.1' process started at: ")
                f_log.write(time.ctime(time.time()) + os.linesep)
                f_log.write("#" + os.linesep)
                f_log.write("# Input file name:  " + in_file_name + os.linesep)
                f_log.write("# Output file name: " + out_file_name +
                            os.linesep)
                f_log.write(os.linesep)
                f_log.close()

            elif (options[0] == '-h'):
                write_header = 1
                options = options[1:]  # Remove processed -'h' option

            elif (options[0] == '-hmm-name'):
                hmm_name_file = options[1]  # Get file name of the name HMM to use
                try:
                    f_in = open(hmm_name_file, 'r')  # Test if file is available
                except:
                    print '***** Error ********************',
                    print '***** Cannot open HMM file in "-hmm-name" option:',
                    print hmm_name_file
                    raise IOError()

                f_in.close()
                options = options[2:]  # Remove processed option and file name
                config.name_standard_method = 'hmm'
                config.name_hmm_file_name = hmm_name_file
                config.name_hmm = simplehmm.hmm('Name HMM', [], [])  # Create new empty HMM object
                config.name_hmm.load_hmm(config.name_hmm_file_name)

            elif (options[0] == '-hmm-loc'):
                hmm_loc_file = options[1]  # Get file name of the locality HMM to use
                try:
                    f_in = open(hmm_loc_file, 'r')  # Test if file is available
                except:
                    print '***** Error ********************',
                    print '***** Cannot open HMM file in "-hmm-loc" option:',
                    print hmm_loc_file
                    raise IOError()
                f_in.close()
                options = options[2:]  # Remove processed option and file name
                config.geoloc_standard_method = 'hmm'
                config.geoloc_hmm_file_name = hmm_loc_file
                config.geoloc_hmm = simplehmm.hmm('Locality HMM', [], [])  # Create new HMM object
                config.geoloc_hmm.load_hmm(config.geoloc_hmm_file_name)

            else:
                print '***** Error: Illegal option:', options[0]
                raise Exception()

    # Open input file and check number of available records - - - - - - - - - - -
    #
    try:
        f_in = open(in_file_name, 'r')
    except:
        inout.log_message('Cannot open input file: ' + in_file_name, 'err')
        raise IOError()

    line_count = 0
    for line in f_in.xreadlines():
        line_count += 1
    f_in.close()

    if ((first_rec + num_rec) > line_count):  # Illegal value for last record
        print '***** Error: Illegal values for number of records to process:',
        print num_rec, ', with start record:', first_rec
        print '*****        File only contains', line_count, 'lines/records'
        raise Exception()

    # Open files  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    try:
        f_in = open(in_file_name, 'r')
    except:
        inout.log_message('Cannot open input file: ' + in_file_name, 'err')
        raise IOError()

    try:
        f_out = open(out_file_name, 'w')
    except:
        inout.log_message('Cannot open output file: ' + out_file_name, 'err')
        raise IOError()

    # Write header (name of output fields) into output file - - - - - - - - - - -
    #
    if (write_header == 1):
        header_dict = {}
        for n in config.output_field_names:
            header_dict.update({n: n})  # Dictionary where values are field names

        header_line = inout.compose_line(header_dict, header=1)
        f_out.write(header_line + os.linesep)

    # Skip over records - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    if (first_rec > 0):
        for i in range(first_rec):
            f_in.readline()

    # Read lines, process them and write into output files  - - - - - - - - - - -
    #
    line_read = 0  # Number of read lines

    while (line_read < num_rec):  # Loop until 'num_rec' records processed
        line = f_in.readline()

        # Print process indicator message
        #
        if (config.proc_ind >= 0) and (line_read > 0):  # Only print if activated
            if (line_read % config.proc_ind == 0):
                print 'Processed line', line_read, 'of', num_rec

        line = line.strip()  # Remove line separators
        config.curr_line = line  # Make a copy of the unprocessed current line

        line = line.lower()  # Make all characters lower case

        inout.log_message(['Record ' + str(line_read + first_rec)], 'v1')
        config.curr_line_no = line_read + first_rec  # Store current line number

        # Process line and extract content into components (name, geocode, etc.)
        #
        [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \
               inout.process_line(line)

        # Make a local empty working copy of the output field dictionary  - - - - -
        #
        output_fields = config.output_field.copy()
        output_fields_keys = output_fields.keys()
        for k in output_fields_keys:
            output_fields[k] = ''  # Set all fields to an empty string

        # Standardise name component  - - - - - - - - - - - - - - - - - - - - - - -
        #
        if (type(name_comp) == types.ListType):  # Givenname and surname separate

            givenname_comp = name_comp[0].strip()
            surname_comp = name_comp[1].strip()

            if (givenname_comp != ''):  # There is a givenname  - - - - - - - - -

                inout.log_message(
                    '  Givenname component: |' + givenname_comp + '|', 'v1')

                givenname_comp = name.clean_name_component(givenname_comp)
                [name_list, tag_list] = name.tag_name_component(givenname_comp)
                output_fields['gender_guess'] = name.get_gender_guess(name_list, \
                                                tag_list)
                [name_list, tag_list, output_fields['title']] = \
                                                 name.get_title(name_list, tag_list)

                [output_fields['givenname'], output_fields['alt_givenname']] = \
                               name.get_name_component(name_list, tag_list, 'gname')

            if (surname_comp != ''):  # There is a surname  - - - - - - - - - - -

                inout.log_message(
                    '  Surname component: |' + surname_comp + '|', 'v1')

                surname_comp = name.clean_name_component(surname_comp)
                [name_list, tag_list] = name.tag_name_component(surname_comp)
                [output_fields['surname'], output_fields['alt_surname']] = \
                                name.get_name_component(name_list, tag_list, 'sname')

        elif (name_comp.strip() != ''):  # Given- and surname both in one field -

            inout.log_message('  Name component: |' + name_comp + '|', 'v1')

            name_comp = name.clean_name_component(name_comp)
            [name_list, tag_list] = name.tag_name_component(name_comp)

            output_fields['gender_guess'] = name.get_gender_guess(
                name_list, tag_list)

            [name_list, tag_list, output_fields['title']] = \
                                              name.get_title(name_list, tag_list)

            if (config.name_standard_method == 'rules'):
                name_dict = name.get_names_rules(name_list, tag_list, 'gname')

            elif (config.name_standard_method == 'hmm'):
                name_dict = name.get_names_hmm(name_list, tag_list)

            else:
                inout.log_message('Illegal name standardisation method:'+ \
                                  config.name_standard_method,'err')
                raise Exception()

            for (field, value) in name_dict.items():  # Assign to output dictionary
                output_fields[field] = value

        # Standardise geocode and locality components using HMM - - - - - - - - - -
        #
        if (config.geoloc_standard_method == 'hmm') and \
           ((geocode_comp.strip() != '') or (locality_comp.strip() != '')):

            geoloc_comp = geocode_comp.strip() + ' ' + locality_comp.strip()
            inout.log_message('  Geocode and locality component: |'+geoloc_comp+'|',\
                              'v1')

            geoloc_comp = locality.clean_geoloc_component(geoloc_comp)
            [geoloc_words, geoloc_tags] = locality.tag_geoloc_component(geoloc_comp)

            if (geoloc_words != []):  # Component not empty, do HMM standardisation

                geoloc_dict = locality.get_geoloc_hmm(geoloc_words,
                                                      geoloc_tags)

                for (field, value) in geoloc_dict.items():  # Assign to output dictionary
                    output_fields[field] = value

        # Standardise geocode component using rules - - - - - - - - - - - - - - - -
        #
        elif (config.geoloc_standard_method == 'rules') and \
             (geocode_comp.strip() != ''):
            inout.log_message('  Geocode component: |' + geocode_comp + '|',
                              'v1')

            ### TO BE DONE
            inout.log_message('Rules based standardisation for geocode is' + \
                              'not implemented yet','err')
            raise Exception()

        # Standardise locality component using rules  - - - - - - - - - - - - - - -
        #
        elif (config.geoloc_standard_method == 'rules') and \
             (locality_comp.strip() != ''):
            inout.log_message('  Locality component: |' + locality_comp + '|',
                              'v1')

            ### TO BE FINALISED
            inout.log_message('Rules based standardisation for locality is' + \
                              'not implemented yet','err')
            raise Exception()

#      locality_comp = locality.clean_geoloc_component(locality_comp)
#      [loc_words, loc_tags] = locality.tag_geoloc_component(locality_comp)
#
#      [terr,loc_words2,loc_tags2] = locality.get_territory(loc_words,loc_tags)
#      if (terr != ''):
#        output_fields['territory'] = terr
#
#      [pc,loc_words3,loc_tags3] = locality.get_postcode(loc_words2,loc_tags2)
#      if (pc != ''):
#        output_fields['postcode'] = pc
#
#      [loc_name, loc_quali, loc_words4, loc_tags4] = \
#         locality.get_localityname_qualifier(loc_words3, loc_tags3)
#      if (loc_name != ''):
#        output_fields['locality_name'] = loc_name
#      if (loc_quali != ''):
#        output_fields['locality_quali'] = loc_quali
#
#      if (loc_words4 != []):  # Not all words are standardised yet
#        print '  # Remaining word list:', loc_words4  ###### TEST
#        print '  # Remaining tag list: ', loc_tags4   ###### TEST

        # Standardise date strings  - - - - - - - - - - - - - - - - - - - - - - -
        #
        if (date1_comp != ''):
            inout.log_message('  Date1 component: |' + date1_comp + '|', 'v1')

            [day1, month1, year1, status1] = date.parse_datestr(date1_comp)
            if (day1 != -1):
                output_fields['day1'] = str(day1)
            if (month1 != -1):
                output_fields['month1'] = str(month1)
            if (year1 != -1):
                output_fields['year1'] = str(year1)

        if (date2_comp != ''):
            inout.log_message('  Date2 component: |' + date2_comp + '|', 'v1')

            [day2, month2, year2, status2] = date.parse_datestr(date2_comp)
            if (day2 != -1):
                output_fields['day2'] = str(day2)
            if (month2 != -1):
                output_fields['month2'] = str(month2)
            if (year2 != -1):
                output_fields['year2'] = str(year2)
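
        # (Note, assumption about date.parse_datestr(): it appears to return a
        #  list [day, month, year, status], with -1 for any component it could
        #  not extract, which is why only values != -1 are copied into the
        #  output fields above.)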

        # Create log message of output fields - - - - - - - - - - - - - - - - - - -
        #
        msg = ['  Standardised record output fields:']
        for (field, value) in output_fields.items():
            if (value != '') and (value != []):
                msg.append('    ' + field + ':' + str(value))
        inout.log_message(msg, 'v1')

        # Save standardised record into output file
        #
        out_line = inout.compose_line(output_fields)
        f_out.write(out_line + os.linesep)

        # Increment line counter and go to beginning of loop  - - - - - - - - - - -
        #
        line_read += 1

        inout.log_message('', 'v1')  # Print empty lines between records

    # Close files - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    f_in.close()
    f_out.close()

    msg = ['', 'Number of warnings: '+str(config.num_warning), \
           'Number of corrected word spills: '+str(config.num_word_spills)]
    inout.log_message(msg, 'v1')

    print msg[1]
    print msg[2]

    inout.log_message('End.', 'v1')