示例#1
0
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        Each (word, tag) token becomes a (tag, lowercased word) pair, so the
        tag is the condition and the word is the observation. Words are
        lowercased so they match the test data, and no <s> or </s> markers
        are added to the sentences.

        The results are also stored on the instance as ``self.emission_PD``
        and ``self.states``.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        # Flatten all sentences into one stream of (condition, observation).
        data = [(tag, word.lower())
                for sent in train_data
                for (word, tag) in sent]

        emission_FD = ConditionalFreqDist(data)

        # Lidstone smoothing with gamma = 0.01 and one more bin than the
        # number of distinct observed samples (presumably reserving
        # probability mass for unseen words -- confirm against the spec).
        est = lambda freq_dist: LidstoneProbDist(freq_dist, 0.01,
                                                 freq_dist.B() + 1)

        self.emission_PD = ConditionalProbDist(emission_FD, est)
        # Materialise the keys: on Python 3 ``keys()`` is a view that cannot
        # be indexed, while the documented return type is a list.
        self.states = list(emission_FD.keys())

        return self.emission_PD, self.states
    def emission_model(self, train_data):
        """
        Build the emission distribution P(word | tag) from tagged sentences.

        Words are lowercased so they match the test data; no <s> or </s>
        markers are added. The distribution and the state list are stored on
        the instance (``self.emission_PD``, ``self.states``) and returned.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        # One (tag, lowercased word) pair per token: tag is the condition,
        # word is the observation.
        tagged_words = [(token[1], token[0].lower())
                        for sentence in train_data
                        for token in sentence]

        emission_FD = ConditionalFreqDist(tagged_words)

        def lidstone(freq_dist):
            # Estimator used for every per-tag distribution: Lidstone
            # smoothing with gamma 0.01 and B() + 1 bins.
            return LidstoneProbDist(freq_dist, 0.01, freq_dist.B() + 1)

        self.emission_PD = ConditionalProbDist(emission_FD, lidstone)
        self.states = list(emission_FD.keys())

        return self.emission_PD, self.states
示例#3
0
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        Every sentence contributes the tag bigrams (tag_i, tag_(i+1)),
        preceded by a ("<s>", first tag) pair and followed by a
        (last tag, "</s>") pair. Empty sentences are skipped because they
        have no first or last tag to attach the boundary symbols to.

        The result is also stored on the instance as ``self.transition_PD``.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        # The data object is a list of (condition, observation) tuples,
        # here (previous tag, next tag), including the start symbol <s>
        # and the end symbol </s>.
        data = []
        for sent in train_data:
            if not sent:
                # Indexing sent[0] on an empty sentence would raise
                # IndexError and abort training; skip it instead.
                continue
            tags = [token[1] for token in sent]
            # Pair each tag with its successor, padded with the boundary
            # symbols: (<s>, t1), (t1, t2), ..., (tn, </s>).
            data.extend(zip(["<s>"] + tags, tags + ["</s>"]))

        transition_FD = ConditionalFreqDist(data)
        # Same estimator as the emission model: Lidstone smoothing with
        # gamma 0.01 and B() + 1 bins.
        est = lambda freq_dist: LidstoneProbDist(freq_dist, 0.01,
                                                 freq_dist.B() + 1)
        self.transition_PD = ConditionalProbDist(transition_FD, est)

        return self.transition_PD
示例#4
0
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        Each (word, tag) token becomes a (tag, lowercased word) pair, so the
        tag is the condition and the word is the observation. The results
        are stored on the instance as ``self.emission_PD`` and
        ``self.states``.

        ``self.states`` is rebuilt from ``train_data`` on every call. It
        holds each distinct tag once, in order of first appearance, so the
        result is deterministic and does not depend on any earlier value of
        ``self.states``.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        data = [(tag, word.lower())
                for sent in train_data
                for (word, tag) in sent]

        emission_FD = ConditionalFreqDist(data)

        # Lidstone smoothing with gamma 0.01 and B() + 1 bins.
        lidstone_estimator = lambda freq_dist: LidstoneProbDist(
            freq_dist, 0.01,
            freq_dist.B() + 1)

        self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)
        # dict.fromkeys de-duplicates while keeping first-seen order, unlike
        # set(), whose iteration order for strings varies between runs.
        self.states = list(dict.fromkeys(tag for (tag, _) in data))

        return self.emission_PD, self.states