def wakati(self, text, lower=False):
    """Word segmentation function. Return the segmented words.

    args:
        - text (str): An input sentence.
        - lower (bool): If lower is True, all uppercase characters in
          the returned words are converted into lowercase characters.

    return:
        - words (list): A list of the words.
    """
    text = utils.preprocess(text)
    # Features are always extracted from the lowercased text; the original
    # casing is only used for the returned words when `lower` is not True.
    lower_text = text.lower()
    feats = utils.feature_extraction(text=lower_text,
                                     uni2id=self._uni2id,
                                     bi2id=self._bi2id,
                                     dictionary=self._word2id,
                                     window_size=self._hp['WINDOW_SIZE'])
    obs = self._model.encode_ws(feats)
    obs = [ob.npvalue() for ob in obs]
    tags = utils.np_viterbi(self._model.trans_array, obs)

    # A word can be recognized as a single word forcibly.
    # Tag ids used below: 0 = BEGIN, 1 = MIDDLE, 2 = END, 3 = SINGLE.
    # NOTE(review): spans are computed on `text` while the tags come from
    # `lower_text`; this assumes both have the same length - confirm.
    if self.pattern:
        for match in self.pattern.finditer(text):
            span_s, span_e = match.span()
            span_len = span_e - span_s

            if span_len == 0:
                # An empty match covers no character. Assigning the
                # BEGIN/END pair to an empty slice would insert two extra
                # tags and misalign the sequence, so skip it.
                continue

            if span_len == 1:
                tags[span_s:span_e] = [3]
            else:
                tags[span_s:span_e] = [0] + [1] * (span_len - 2) + [2]

            # Close the word that was still open just before the match.
            if span_s != 0:
                previous_tag = tags[span_s - 1]
                if previous_tag == 0:    # BEGIN -> SINGLE
                    tags[span_s - 1] = 3
                elif previous_tag == 1:  # MIDDLE -> END
                    tags[span_s - 1] = 2

            # Re-open the word that continues right after the match.
            if span_e != len(text):
                next_tag = tags[span_e]
                if next_tag == 1:        # MIDDLE -> BEGIN
                    tags[span_e] = 0
                elif next_tag == 2:      # END -> SINGLE
                    tags[span_e] = 3

    if lower is True:
        words = utils.segmenter_for_bmes(lower_text, tags)
    else:
        words = utils.segmenter_for_bmes(text, tags)
    return words
def wakati(self, text, lower=False):
    """Return the words of the given sentence.

    args:
        - text (str): An input sentence.
        - lower (bool): If lower is True, the words are cut from the
          lowercased sentence instead of the original one.

    return:
        - words (list): A list of the words.
    """
    text = utils.preprocess(text)
    # Features are always extracted from the lowercased text; the original
    # casing is only used for the returned words when `lower` is not True.
    lower_text = text.lower()
    feats = utils.feature_extraction(text=lower_text,
                                     uni2id=self._uni2id,
                                     bi2id=self._bi2id,
                                     dictionary=self._word2id,
                                     window_size=self._hp['WINDOW_SIZE'])
    obs = self._model.encode_ws(feats)
    obs = [ob.npvalue() for ob in obs]
    tags = utils.np_viterbi(self._model.trans_array, obs)

    # A word can be recognized as a single word forcibly.
    # Tag ids used below: 0 = BEGIN, 1 = MIDDLE, 2 = END, 3 = SINGLE.
    # NOTE(review): spans are computed on `text` while the tags come from
    # `lower_text`; this assumes both have the same length - confirm.
    if self.pattern:
        for match in self.pattern.finditer(text):
            span_s, span_e = match.span()
            span_len = span_e - span_s

            if span_len == 0:
                # An empty match covers no character; there is nothing to
                # re-tag, and writing to the empty slice would insert tags.
                continue

            if span_len == 1:
                # A one-character match is a SINGLE word. The BEGIN/END
                # pair would put two tags into a one-element slice and
                # shift every tag after it.
                tags[span_s:span_e] = [3]
            else:
                tags[span_s:span_e] = [0] + [1] * (span_len - 2) + [2]

            # Close the word that was still open just before the match.
            if span_s != 0:
                previous_tag = tags[span_s - 1]
                if previous_tag == 0:    # BEGIN -> SINGLE
                    tags[span_s - 1] = 3
                elif previous_tag == 1:  # MIDDLE -> END
                    tags[span_s - 1] = 2

            # Re-open the word that continues right after the match.
            if span_e != len(text):
                next_tag = tags[span_e]
                if next_tag == 1:        # MIDDLE -> BEGIN
                    tags[span_e] = 0
                elif next_tag == 2:      # END -> SINGLE
                    tags[span_e] = 3

    if lower is True:
        words = utils.segmenter_for_bmes(lower_text, tags)
    else:
        words = utils.segmenter_for_bmes(text, tags)
    return words