def __extract_morphological_information(self, mrph_object, is_feature, is_surface):
    """Build a TokenizedResult from one pyknp morpheme object."""
    assert isinstance(mrph_object, pyknp.Morpheme)
    assert isinstance(is_feature, bool)
    assert isinstance(is_surface, bool)

    # Conjugation / semantic annotations that have no dedicated TokenizedResult field.
    extra_info = {
        'katuyou1': mrph_object.katuyou1,
        'katuyou2': mrph_object.katuyou2,
        'imis': mrph_object.imis,
        'repname': mrph_object.repname
    }
    return TokenizedResult(
        node_obj=None,
        tuple_pos=(mrph_object.hinsi, mrph_object.bunrui),
        word_stem=mrph_object.genkei,
        word_surface=mrph_object.midasi,
        is_feature=is_feature,
        is_surface=is_surface,
        misc_info=extra_info)
def __result_parser(self, analyzed_line, is_feature, is_surface):
    # type: (text_type,bool,bool)->TokenizedResult
    """Split one analyzer output line into surface form and feature string,
    then wrap the parsed pieces in a TokenizedResult."""
    assert isinstance(analyzed_line, str)
    assert isinstance(is_feature, bool)
    assert isinstance(is_surface, bool)

    # Analyzer output is "<surface>\t<features>"; split only on the first tab
    # because the feature part may itself contain tabs.
    surface_form, feature_string = analyzed_line.split('\t', 1)
    pos_tuple, stem = self.__feature_parser(feature_string, surface_form)
    return TokenizedResult(
        node_obj=None,
        analyzed_line=analyzed_line,
        tuple_pos=pos_tuple,
        word_stem=stem,
        word_surface=surface_form,
        is_feature=is_feature,
        is_surface=is_surface)
def __extract_morphological_information(self, kytea_tags_tuple, is_feature):
    # type: (Tuple[text_type,List[Any]], bool) -> TokenizedResult
    """Build a TokenizedResult from one kytea token tuple."""
    assert isinstance(kytea_tags_tuple, tuple)
    assert isinstance(is_feature, bool)

    surface = self.__check_char_set(kytea_tags_tuple[0])
    # NOTE: kytea does NOT show word stem. Put blank string instead.
    word_stem = ''.decode('utf-8') if six.PY2 else ''

    # kytea_tags_tuple[1] holds [pos candidates, reading candidates];
    # each candidate list entry is a (label, score) pair.
    pos_candidates = kytea_tags_tuple[1][0]
    pos = self.__check_char_set(pos_candidates[0][0])
    pos_score = float(pos_candidates[0][1])
    yomi_candidates = kytea_tags_tuple[1][1]
    yomi = self.__check_char_set(yomi_candidates[0][0])
    yomi_score = float(yomi_candidates[0][1])

    return TokenizedResult(
        node_obj=None,
        tuple_pos=(pos, ),
        word_stem=word_stem,
        word_surface=surface,
        is_feature=is_feature,
        is_surface=True,
        misc_info={
            'pos_score': pos_score,
            'pos': pos,
            'yomi': yomi,
            'yomi_score': yomi_score
        })
def __extract_morphological_information(self, kytea_tags_tuple, is_feature):
    """Build a TokenizedResult from one kytea token tuple."""
    assert isinstance(kytea_tags_tuple, tuple)
    assert isinstance(is_feature, bool)

    surface = self.__check_char_set(kytea_tags_tuple[0])
    # kytea provides no word stem; substitute an empty string.
    stem = ''

    # kytea_tags_tuple[1] holds [pos candidates, reading candidates];
    # each candidate entry is a (label, score) pair, best candidate first.
    pos_info = kytea_tags_tuple[1][0]
    pos_label = self.__check_char_set(pos_info[0][0])
    pos_confidence = float(pos_info[0][1])
    yomi_info = kytea_tags_tuple[1][1]
    reading = self.__check_char_set(yomi_info[0][0])
    reading_confidence = float(yomi_info[0][1])

    return TokenizedResult(
        analyzed_line=None,
        tuple_pos=(pos_label, ),
        word_stem=stem,
        word_surface=surface,
        is_feature=is_feature,
        is_surface=True,
        misc_info={
            'pos_score': pos_confidence,
            'pos': pos_label,
            'yomi': reading,
            'yomi_score': reading_confidence
        },
        node_obj=None)
def tokenize(self, sentence, normalized=True, is_feature=False, is_surface=False, return_list=False, func_normalizer=normalize_text):
    """* What you can do
    - Call mecab tokenizer, and return tokenized objects
    """
    # type: (text_type, bool, bool, bool, bool, Callable[[str], str])->Union[List[str], TokenizedSenetence]
    if six.PY2 and isinstance(sentence, str):
        sentence = sentence.decode(self.string_encoding)

    ### decide normalization function depending on dictType
    if func_normalizer is None:
        # No normalizer supplied: apply neologdn only for the neologd dictionary.
        if self._dictType == 'neologd':
            normalized_sentence = neologdn.normalize(sentence)
        else:
            normalized_sentence = sentence
    elif func_normalizer == normalize_text:
        normalized_sentence = normalize_text(sentence, dictionary_mode=self._dictType)
    else:
        normalized_sentence = func_normalizer(sentence)

    # don't delete this variable. The variable "encoded_text" protects sentence from deleting
    if six.PY2:
        encoded_text = normalized_sentence.encode(self.string_encoding)
    else:
        encoded_text = normalized_sentence

    if six.PY2:
        token_results = []
        node = self.mecabObj.parseToNode(encoded_text)
        node = node.next  # step past the leading BOS node
        while node.next is not None:
            surface = node.surface.decode(self.string_encoding)
            pos_tuple, stem = self.__feature_parser(node.feature.decode(self.string_encoding), surface)
            token_results.append(TokenizedResult(
                node_obj=node,
                tuple_pos=pos_tuple,
                word_stem=stem,
                word_surface=surface,
                is_feature=is_feature,
                is_surface=is_surface))
            node = node.next
        tokenized_sentence = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=token_results)
    else:
        parsed_result = self.mecabObj.parse(encoded_text)
        tokenized_sentence = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=self.__postprocess_analyzed_result(
                string_mecab_parsed_result=parsed_result,
                is_feature=is_feature,
                is_surface=is_surface))  # type: TokenizedSenetence

    return tokenized_sentence.convert_list_object() if return_list else tokenized_sentence