def replace_by_words(self, tokenizer, replacable):
    resolved = ''
    if tokenizer is None:
        words = replacable.split()
    else:
        # Tokenize with width conversion and punctuation handling temporarily disabled.
        tokenizer.is_convert = False
        tokenizer.is_punctuation = False
        words = tokenizer.texts_to_words(replacable)
        tokenizer.is_convert = True
        tokenizer.is_punctuation = True
    if len(words) == 0:
        return resolved

    match_count = 0
    word_no = 0
    is_match = False
    last_CJK = True
    for word in words:
        if match_count > 0:
            # Skip the remaining words of a phrase that has already been replaced.
            match_count -= 1
            word_no += 1
            continue
        word_CJK = JapaneseLanguage.is_CJKword(word)
        if word_CJK is True:
            # Japanese lookups are indexed by the first character of the word.
            pairs = self._pairs_jp
            matchs = self._match_jp
            keyword = word[0]
        else:
            pairs = self._pairs
            matchs = self._match
            keyword = word
        if keyword in matchs:
            phrases = matchs[keyword]
            match_count, key = self.match(word_CJK, words, word_no, phrases)
        if match_count > 0:
            resolved += pairs[key]
            match_count -= 1
            is_match = True
            word_CJK = JapaneseLanguage.is_CJKword(pairs[key])
        else:
            if is_match is False:
                # Insert a space before non-CJK words and at a non-CJK to CJK boundary.
                if word_CJK is False or (last_CJK is False and word_CJK is True):
                    resolved += ' '
            is_match = False
            resolved += word
        word_no += 1
        last_CJK = word_CJK
    return resolved.strip()
def add_to_lookup(self, org_key, org_value):
    key = org_key.strip()
    # Normalise the key for matching: full/half width conversion, then upper-case.
    target_key = JapaneseLanguage.zenhan_normalize(key)
    target_key = target_key.upper()
    value = org_value.strip()
    if JapaneseLanguage.is_CJKword(target_key) is True:
        if target_key in self._pairs_jp:
            YLogger.error(self, "%s = %s already exists in jp_collection", key, value)
            return
        else:
            # Japanese keys are stored whole and indexed by their first character.
            matchs = self._match_jp
            splits = target_key
            check_key = target_key[0]
            self._pairs_jp[target_key] = value
    else:
        if target_key in self._pairs:
            YLogger.error(self, "%s = %s already exists in en_collection", key, value)
            return
        else:
            # English keys are split into words and indexed by their first word.
            matchs = self._match
            splits = target_key.split()
            check_key = splits[0]
            self._pairs[target_key] = value

    if check_key not in matchs:
        matchs[check_key] = []
    matchs[check_key].append(splits)
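# Illustration only (not part of the source): the index shape add_to_lookup builds,
# shown for two assumed example entries.
#
#   add_to_lookup("new york", "NY")  gives  _pairs    == {"NEW YORK": "NY"}
#                                           _match    == {"NEW": [["NEW", "YORK"]]}
#   add_to_lookup("東京都", "東京")    gives  _pairs_jp == {"東京都": "東京"}
#                                           _match_jp == {"東": ["東京都"]}
#
# English keys are upper-cased, split into words and indexed by the first word;
# Japanese keys are kept whole and indexed by their first character.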
def replace_by_words(self, tokenizer, replacable):
    resolved = ''
    if tokenizer is None:
        words = replacable.split()
    else:
        tokenizer.is_convert = False
        tokenizer.is_punctuation = False
        words = tokenizer.texts_to_words(replacable)
        tokenizer.is_convert = True
        tokenizer.is_punctuation = True
    if len(words) == 0:
        return resolved

    match_count = 0
    word_no = 0
    new_words = []
    for word in words:
        if match_count > 0:
            # Skip the remaining words of a phrase that has already been replaced.
            match_count -= 1
            word_no += 1
            continue
        word_CJK = JapaneseLanguage.is_CJKword(word)
        if word_CJK is True:
            pairs = self._pairs_jp
            matchs = self._match_jp
            keyword = word[0]
        else:
            pairs = self._pairs
            matchs = self._match
            keyword = word
        if keyword in matchs:
            phrases = matchs[keyword]
            match_count, key = self.match(word_CJK, words, word_no, phrases)
        if match_count > 0:
            new_words.append(pairs[key])
            match_count -= 1
        else:
            new_words.append(word)
        word_no += 1

    if len(new_words) > 0:
        if tokenizer is None:
            to_join = [word.strip() for word in new_words if word and word != ' ']
            resolved = " ".join(to_join)
        else:
            resolved = tokenizer.words_to_texts(new_words)
    return resolved
def replace_by_words(self, tokenizer, replacable):
    resolved = ''
    if tokenizer is None:
        words = replacable.split()
    else:
        tokenizer.is_punctuation = False
        words = tokenizer.texts_to_words(replacable)
        tokenizer.is_punctuation = True
    if len(words) == 0:
        return resolved

    last_CJK = True
    match_count = 0
    word_no = 0
    for word in words:
        if match_count > 0:
            match_count -= 1
            word_no += 1
            continue
        # Normalise the word before the lookup: width conversion, then upper-case.
        target_word = JapaneseLanguage.zenhan_normalize(word)
        target_word = target_word.upper()
        is_CJK = JapaneseLanguage.is_CJKword(target_word)
        if is_CJK is True:
            pairs = self._pairs_jp
            matchs = self._match_jp
            keyword = target_word[0]
        else:
            pairs = self._pairs
            matchs = self._match
            keyword = target_word
        if keyword in matchs:
            phrases = matchs[keyword]
            match_count, key = self.match(is_CJK, words, word_no, phrases)
        if match_count > 0:
            # Add a space except between consecutive CJK words.
            if is_CJK is False or last_CJK != is_CJK:
                resolved += ' '
            resolved += pairs[key]
            match_count -= 1
        else:
            if is_CJK is False or last_CJK != is_CJK:
                resolved += ' '
            resolved += word
        last_CJK = is_CJK
        word_no += 1
    return resolved.strip()
def texts_to_words(self, texts):
    if not texts:
        return []

    if self._is_convert is True:
        # Width conversion: full-width ASCII/digits to half-width,
        # half-width kana to full-width.
        han_texts = mojimoji.zen_to_han(texts, kana=False)
        zen_texts = mojimoji.han_to_zen(han_texts, digit=False, ascii=False)
    else:
        han_texts = texts
        zen_texts = texts

    if JapaneseLanguage.is_CJKword(zen_texts) is True:
        if self._is_template is False:
            # Split the text into runs of CJK and non-CJK characters and
            # tokenize each run with the matching tokenizer.
            words = []
            target_text = ''
            words_CJK = JapaneseLanguage.is_CJKchar(zen_texts[0])
            for ch in zen_texts:
                char_CJK = JapaneseLanguage.is_CJKchar(ch)
                if words_CJK != char_CJK:
                    if words_CJK is True:
                        tmp_words = self._texts_to_words_jp(target_text)
                    else:
                        tmp_words = self._texts_to_words_en(target_text)
                    for word in tmp_words:
                        words.append(word)
                    words_CJK = char_CJK
                    target_text = ''
                target_text += ch
            if len(target_text) > 0:
                # Flush the final run.
                if words_CJK is True:
                    tmp_words = self._texts_to_words_jp(target_text)
                else:
                    tmp_words = self._texts_to_words_en(target_text)
                for word in tmp_words:
                    words.append(word)
        else:
            words = self._template_texts_to_words_jp(texts)
    else:
        if self._is_template is False:
            words = self._texts_to_words_en(han_texts)
        else:
            words = self._texts_to_words_en(texts)
    return words
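# Standalone sketch (illustration only, not part of the class above) of the
# mojimoji width conversion used in texts_to_words: full-width ASCII and digits
# are folded to half-width first, then half-width kana are widened to full-width.
# The sample string is an assumed example.
import mojimoji

sample = "Ｐｙｔｈｏｎ３ ｱｲｳ"
han_sample = mojimoji.zen_to_han(sample, kana=False)                    # "Python3 ｱｲｳ"
zen_sample = mojimoji.han_to_zen(han_sample, digit=False, ascii=False)  # "Python3 アイウ"
print(han_sample, zen_sample)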
def add_to_lookup(self, org_key, org_value):
    key = org_key
    value = org_value.strip()
    if JapaneseLanguage.is_CJKword(org_key) is True:
        key = key.strip()
        if key in self._pairs_jp:
            YLogger.error(self, "%s = %s already exists in jp_collection", key, value)
            return
        else:
            matchs = self._match_jp
            splits = key
            check_key = key[0]
            self._pairs_jp[key] = value
    else:
        if key[0] != ' ':
            # Keys without a leading space are stored as regular-expression replacements.
            key = key.strip()
            pattern_text = DoubleStringPatternSplitCollection.normalise_pattern(key)
            start = pattern_text.lstrip()
            middle = pattern_text
            end = pattern_text.rstrip()
            pattern = "(^%s|%s|%s$)" % (start, middle, end)
            replacement = value
            replaceInfo = [key, re.compile(pattern), replacement]
            self._replace.append(replaceInfo)
            return
        else:
            key = key.strip()
            if key in self._pairs:
                YLogger.error(self, "%s = %s already exists in en_collection", key, value)
                return
            else:
                matchs = self._match
                splits = key.split()
                check_key = splits[0]
                self._pairs[key] = value

    if check_key not in matchs:
        matchs[check_key] = []
    matchs[check_key].append(splits)
def __init__(self, attribs, text, userid='*', element=None):
    PatternNode.__init__(self, userid)
    self._words = {}
    self._values = {}
    if 'words' in attribs:
        words = attribs['words']
    elif text:
        words = text
    else:
        raise ParserException("No words specified as attribute or text",
                              xml_element=element, nodename='iset')

    check_words = JapaneseLanguage.zenhan_normalize(words)
    self._is_CJK = JapaneseLanguage.is_CJKword(check_words)

    if self._parse_words(words) is False:
        raise ParserException("empty element in words",
                              xml_element=element, nodename='iset')

    self._iset_name = "iset_%d" % (PatternISetNode.iset_count)
    PatternISetNode.iset_count += 1
def add_to_lookup(self, org_key, org_value, filename=None, line=0):
    key = org_key.strip()
    value = org_value
    if key == '':
        error_info = "key is empty"
        self.set_error_info(filename, line, error_info)
        return

    if JapaneseLanguage.is_CJKword(org_key) is True:
        if key in self._pairs_jp:
            YLogger.error(self, "%s = %s already exists in jp_collection", key, value)
            error_info = "duplicate key='%s' (value='%s' is invalid)" % (key, value)
            self.set_error_info(filename, line, error_info)
            return
        else:
            matchs = self._match_jp
            splits = key
            check_key = key[0]
            self._pairs_jp[key] = value
    else:
        if key in self._pairs:
            YLogger.error(self, "%s = %s already exists in en_collection", key, value)
            error_info = "duplicate key='%s' (value='%s' is invalid)" % (key, value)
            self.set_error_info(filename, line, error_info)
            return
        else:
            matchs = self._match
            splits = key.split()
            check_key = splits[0]
            self._pairs[key] = value

    if check_key not in matchs:
        matchs[check_key] = []
    matchs[check_key].append(splits)
def check_cjk(self, is_cjk, value):
    if is_cjk is False:
        check_words = JapaneseLanguage.zenhan_normalize(value)
        if JapaneseLanguage.is_CJKword(check_words) is True:
            is_cjk = True
    return is_cjk
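# Usage note (illustration only, not from the source): check_cjk acts as a sticky flag.
# Callers feed the flag back in while scanning a sequence of values; once any
# width-normalised value contains CJK text the flag stays True, e.g.
#
#   is_cjk = False
#   for value in values:
#       is_cjk = self.check_cjk(is_cjk, value)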
def add_to_lookup(self, org_key, org_value, filename=None, line=0):
    key = org_key
    value = org_value.strip()
    if key.strip() == '':
        error_info = "key is empty"
        self.set_error_info(filename, line, error_info)
        return

    if JapaneseLanguage.is_CJKword(org_key) is True:
        key = key.strip()
        if key in self._pairs_jp:
            YLogger.error(self, "%s = %s already exists in jp_collection", key, value)
            error_info = "duplicate key='%s' (value='%s' is invalid)" % (key, value)
            self.set_error_info(filename, line, error_info)
            return
        else:
            matchs = self._match_jp
            splits = key
            check_key = key[0]
            self._pairs_jp[key] = value
    else:
        if key[0] != ' ':
            # Keys without a leading space are stored as regular-expression replacements.
            key = key.strip()
            if key in self._replace_key:
                YLogger.error(self, "%s = %s already exists in replace_collection", key, value)
                error_info = "duplicate replace_chars='%s' (value='%s' is invalid)" % (key, value)
                self.set_error_info(filename, line, error_info)
                return
            pattern_text = DoubleStringPatternSplitCollection.normalise_pattern(key)
            start = pattern_text.lstrip()
            middle = pattern_text
            end = pattern_text.rstrip()
            pattern = "(^%s|%s|%s$)" % (start, middle, end)
            replacement = value
            replaceInfo = [key, re.compile(pattern), replacement]
            self._replace.append(replaceInfo)
            self._replace_key.append(key)
            return
        else:
            key = key.strip()
            if key in self._pairs:
                YLogger.error(self, "%s = %s already exists in en_collection", key, value)
                error_info = "duplicate key='%s' (value='%s' is invalid)" % (key, value)
                self.set_error_info(filename, line, error_info)
                return
            else:
                matchs = self._match
                splits = key.split()
                check_key = splits[0]
                self._pairs[key] = value

    if check_key not in matchs:
        matchs[check_key] = []
    matchs[check_key].append(splits)