# Example no. 1
    def viterbi_smoothing(self, observation, states, emission_matrix,
                          transition_matrix):
        """Decode a tag sequence for *observation* with a Viterbi-style pass.

        Builds a score matrix of shape (len(observation), len(states)) that
        combines HMM transition/emission probabilities with a perceptron
        feature score, then picks the best-scoring state independently at
        each position (per-column argmax, no backpointers).

        Parameters
        ----------
        observation : list[str]
            Word tokens to tag.
        states : list[str]
            Candidate tags; one score-matrix column per tag.
        emission_matrix : dict[str, dict[str, float]]
            word -> {tag: emission score}; words absent from this dict are
            treated as unknown and scored from transitions alone.
        transition_matrix : dict[str, dict[str, float]]
            tag -> {tag: transition score}; must contain the sentence-start
            marker '<S>' for every tag.

        Returns
        -------
        (list[str], list[str])
            The predicted tag sequence and the (unchanged) observation.
        """
        viterbi = numpy.zeros((len(observation), len(states)))

        # --- initialization against the sentence-start marker '<S>' ---
        first_word = observation[0]
        for col, ini_state in enumerate(states):
            if first_word in emission_matrix:
                if ini_state in emission_matrix[first_word]:
                    viterbi[0][col] = (emission_matrix[first_word][ini_state]
                                       * transition_matrix[ini_state]['<S>'])
            else:
                # Unknown first word: fall back to the transition score alone.
                viterbi[0][col] = transition_matrix[ini_state]['<S>']

        token = Token()

        # --- recursion ---
        # BUG FIX: the original indexed rows with observation.index(word),
        # which returns the FIRST occurrence of a word.  Repeated tokens
        # (e.g. 'the', ',') therefore overwrote the first occurrence's row
        # and left their own rows at zero; a repeat of the first word was
        # also wrongly skipped by the `index(word) == 0` test.  enumerate
        # gives every position its own row.
        for t, word in enumerate(observation):
            if t == 0:
                continue  # position 0 is handled by the initialization step
            known_word = word in emission_matrix
            for prev_state in states:
                # NOTE(review): each prev_state iteration overwrites the
                # previous one rather than maximizing over viterbi[t-1];
                # preserved as-is so scores are unchanged — confirm intent.
                for col, nex_state in enumerate(states):
                    feature_factor = token.perceptron_HMM_feature(
                        word, nex_state)
                    if known_word:
                        if nex_state in emission_matrix[word]:
                            viterbi[t][col] = (
                                transition_matrix[prev_state][nex_state]
                                * emission_matrix[word][nex_state]
                            ) + feature_factor
                        else:
                            # BUG FIX: the original zeroed the prev_state
                            # column here instead of the nex_state cell,
                            # clobbering unrelated scores.
                            viterbi[t][col] = 0.0
                    else:
                        # Unknown word: no emission term is available.
                        viterbi[t][col] = (
                            transition_matrix[prev_state][nex_state]
                            + feature_factor)

        # --- decoding: per-position argmax; '.' is always tagged as itself ---
        most_probable_tag_sequence = []
        for t, word in enumerate(observation):
            if word == ".":
                most_probable_tag_sequence.append(".")
            else:
                # argmax returns the first maximal index, matching the
                # original max(enumerate(...)) tie-breaking.
                best = int(numpy.argmax(viterbi[t]))
                most_probable_tag_sequence.append(states[best])

        return most_probable_tag_sequence, observation