示例#1
0
    def wakati(self, text, lower=False):
        """Word segmentation function. Return the segmented words.

        args:
            - text (str): An input sentence.
            - lower (bool): If lower is True, all uppercase characters in a list \
                            of the words are converted into lowercase characters.

        return:
            - words (list): A list of the words.
        """
        text = utils.preprocess(text)
        lower_text = text.lower()
        feats = utils.feature_extraction(text=lower_text,
                                         uni2id=self._uni2id,
                                         bi2id=self._bi2id,
                                         dictionary=self._word2id,
                                         window_size=self._hp['WINDOW_SIZE'])
        obs  = self._model.encode_ws(feats)
        obs  = [ob.npvalue() for ob in obs]
        tags = utils.np_viterbi(self._model.trans_array, obs)

        # A word can be recognized as a single word forcibly.
        if self.pattern:
            for match in self.pattern.finditer(text):
                span = match.span()
                span_s = span[0]
                span_e = span[1]

                if (span_e - span_s) == 1:
                    tags[span_s:span_e] = [3]
                else:
                    tags[span_s:span_e] = [0]+[1]*((span_e-span_s)-2)+[2]

                if span_s != 0:
                    previous_tag = tags[span_s-1]
                    if previous_tag == 0:   # 0 is BEGIN tag
                        tags[span_s-1] = 3  # 3 is SINGLE tag
                    elif previous_tag == 1: # 1 is MIDDEL tag
                        tags[span_s-1] = 2  # 2 is END tag

                if span_e != len(text):
                    next_tag = tags[span_e]
                    if next_tag == 1:    # 1 is MIDDEL tag
                        tags[span_e] = 0 # 0 is BEGIN tag
                    elif next_tag == 2:  # 2 is END tag
                        tags[span_e] = 3 # 3 is SINGLE tag

        if lower is True:
            words = utils.segmenter_for_bmes(lower_text, tags)
        else:
            words = utils.segmenter_for_bmes(text, tags)
        return words
示例#2
0
    def wakati(self, text, lower=False):
        """
        Return the words of the given sentence.
        Input: str (a sentence)
        Output: the list of the words
        """
        text = utils.preprocess(text)
        lower_text = text.lower()
        feats = utils.feature_extraction(text=lower_text,
                                         uni2id=self._uni2id,
                                         bi2id=self._bi2id,
                                         dictionary=self._word2id,
                                         window_size=self._hp['WINDOW_SIZE'])
        obs = self._model.encode_ws(feats)
        obs = [ob.npvalue() for ob in obs]
        tags = utils.np_viterbi(self._model.trans_array, obs)

        # A word can be recognized as a single word forcibly.
        if self.pattern:
            for match in self.pattern.finditer(text):
                span = match.span()
                span_s = span[0]
                span_e = span[1]
                tags[span_s:span_e] = [0] + [1] * ((span_e - span_s) - 2) + [2]

                if span_s != 0:
                    previous_tag = tags[span_s - 1]
                    if previous_tag == 0:  # 0 is BEGIN tag
                        tags[span_s - 1] = 3  # 3 is SINGLE tag
                    elif previous_tag == 1:  # 1 is MIDDEL tag
                        tags[span_s - 1] = 2  # 2 is END tag

                if span_e != len(text):
                    next_tag = tags[span_e]
                    if next_tag == 1:  # 1 is MIDDEL tag
                        tags[span_e] = 0  # 0 is BEGIN tag
                    elif next_tag == 2:  # 2 is END tag
                        tags[span_e] = 3  # 3 is SINGLE tag

        if lower is True:
            words = utils.segmenter_for_bmes(lower_text, tags)
        else:
            words = utils.segmenter_for_bmes(text, tags)
        return words