示例#1
0
    def replace_by_words(self, tokenizer, replacable):
        """Replace registered words/phrases found in ``replacable``.

        The text is split into words: by whitespace when ``tokenizer`` is
        None, otherwise by ``tokenizer.texts_to_words`` with the tokenizer's
        ``is_convert`` and ``is_punctuation`` flags switched off for the
        duration of the call.

        Each word is looked up in the CJK tables (``_match_jp`` keyed by the
        word's first character, ``_pairs_jp``) or in the non-CJK tables
        (``_match`` keyed by the whole word, ``_pairs``).  ``self.match`` is
        expected to return ``(number_of_words_consumed, pairs_key)``.

        Returns the rebuilt text with leading/trailing whitespace removed, or
        an empty string when there are no words.
        """
        if tokenizer is None:
            words = replacable.split()
        else:
            tokenizer.is_convert = False
            tokenizer.is_punctuation = False
            try:
                words = tokenizer.texts_to_words(replacable)
            finally:
                # Always switch the flags back on, even when tokenizing
                # raises; otherwise the tokenizer object would stay in the
                # "no conversion / no punctuation" mode for later callers.
                tokenizer.is_convert = True
                tokenizer.is_punctuation = True
        if not words:
            return ''

        parts = []
        match_count = 0
        is_match = False
        last_CJK = True
        for word_no, word in enumerate(words):
            if match_count > 0:
                # This word was already consumed by a multi-word match.
                match_count -= 1
                continue

            word_CJK = JapaneseLanguage.is_CJKword(word)
            if word_CJK is True:
                pairs = self._pairs_jp
                matchs = self._match_jp
                keyword = word[0]
            else:
                pairs = self._pairs
                matchs = self._match
                keyword = word

            if keyword in matchs:
                match_count, key = self.match(word_CJK, words, word_no,
                                              matchs[keyword])
            if match_count > 0:
                # The replacement is appended without a separator.
                parts.append(pairs[key])
                match_count -= 1
                is_match = True
                # From here on the CJK state reflects the replacement text.
                word_CJK = JapaneseLanguage.is_CJKword(pairs[key])
            else:
                # No separator directly after a replacement; otherwise put a
                # space before non-CJK words and at a non-CJK -> CJK boundary.
                if is_match is False and (
                        word_CJK is False
                        or (last_CJK is False and word_CJK is True)):
                    parts.append(' ')
                is_match = False
                parts.append(word)
            last_CJK = word_CJK

        return ''.join(parts).strip()
示例#2
0
    def add_to_lookup(self, org_key, org_value):
        """Register a key/value replacement pair in the lookup tables.

        The key is stripped, passed through
        ``JapaneseLanguage.zenhan_normalize`` and upper-cased; the value is
        stripped.  Keys that ``JapaneseLanguage.is_CJKword`` reports as CJK
        are stored in ``_pairs_jp`` and indexed in ``_match_jp`` by their
        first character (the whole key string is recorded as the phrase).
        All other keys are stored in ``_pairs`` and indexed in ``_match`` by
        their first whitespace-separated word (the list of words is recorded
        as the phrase).

        An empty key or a key that already exists is logged through
        ``YLogger.error`` and ignored.
        """
        key = org_key.strip()
        value = org_value.strip()
        target_key = JapaneseLanguage.zenhan_normalize(key).upper()

        if not target_key.strip():
            # An empty key has no first character/word to index under; the
            # lookups below would otherwise fail with IndexError.
            YLogger.error(self, "key is empty")
            return

        if JapaneseLanguage.is_CJKword(target_key) is True:
            if target_key in self._pairs_jp:
                YLogger.error(self, "%s = %s already exists in jp_collection", key, value)
                return
            matchs = self._match_jp
            splits = target_key
            check_key = target_key[0]
            self._pairs_jp[target_key] = value
        else:
            if target_key in self._pairs:
                YLogger.error(self, "%s = %s already exists in en_collection", key, value)
                return
            matchs = self._match
            splits = target_key.split()
            check_key = splits[0]
            self._pairs[target_key] = value

        # Group every phrase under its index key.
        matchs.setdefault(check_key, []).append(splits)
示例#3
0
    def replace_by_words(self, tokenizer, replacable):
        """Replace registered words/phrases found in ``replacable``.

        The text is split into words: by whitespace when ``tokenizer`` is
        None, otherwise by ``tokenizer.texts_to_words`` with the tokenizer's
        ``is_convert`` and ``is_punctuation`` flags switched off for the
        duration of the call.

        Each word is looked up in the CJK tables (``_match_jp`` keyed by the
        word's first character, ``_pairs_jp``) or in the non-CJK tables
        (``_match`` keyed by the whole word, ``_pairs``).  ``self.match`` is
        expected to return ``(number_of_words_consumed, pairs_key)``.

        The resulting word list is joined with single spaces when there is
        no tokenizer, or handed to ``tokenizer.words_to_texts`` otherwise.
        Returns an empty string when there are no words.
        """
        if tokenizer is None:
            words = replacable.split()
        else:
            tokenizer.is_convert = False
            tokenizer.is_punctuation = False
            try:
                words = tokenizer.texts_to_words(replacable)
            finally:
                # Always switch the flags back on, even when tokenizing
                # raises; otherwise the tokenizer object would stay in the
                # "no conversion / no punctuation" mode for later callers.
                tokenizer.is_convert = True
                tokenizer.is_punctuation = True
        if not words:
            return ''

        new_words = []
        match_count = 0
        for word_no, word in enumerate(words):
            if match_count > 0:
                # This word was already consumed by a multi-word match.
                match_count -= 1
                continue

            word_CJK = JapaneseLanguage.is_CJKword(word)
            if word_CJK is True:
                pairs = self._pairs_jp
                matchs = self._match_jp
                keyword = word[0]
            else:
                pairs = self._pairs
                matchs = self._match
                keyword = word

            if keyword in matchs:
                match_count, key = self.match(word_CJK, words, word_no,
                                              matchs[keyword])
            if match_count > 0:
                new_words.append(pairs[key])
                match_count -= 1
            else:
                new_words.append(word)

        if not new_words:
            return ''
        if tokenizer is None:
            # Drop empty / single-space entries and trim the rest before
            # joining with single spaces.
            return " ".join(
                word.strip() for word in new_words if word and word != ' ')
        return tokenizer.words_to_texts(new_words)
示例#4
0
    def replace_by_words(self, tokenizer, replacable):
        """Replace registered words/phrases found in ``replacable``.

        The text is split into words: by whitespace when ``tokenizer`` is
        None, otherwise by ``tokenizer.texts_to_words`` with the tokenizer's
        ``is_punctuation`` flag switched off for the duration of the call.

        For the lookup each word is passed through
        ``JapaneseLanguage.zenhan_normalize`` and upper-cased.  CJK words use
        ``_match_jp`` (keyed by first character) and ``_pairs_jp``; other
        words use ``_match`` (keyed by the whole normalized word) and
        ``_pairs``.  ``self.match`` is expected to return
        ``(number_of_words_consumed, pairs_key)``.

        A space is emitted before every non-CJK item and whenever the CJK
        state changes between consecutive items.  Returns the rebuilt text
        with leading/trailing whitespace removed, or an empty string when
        there are no words.
        """
        if tokenizer is None:
            words = replacable.split()
        else:
            tokenizer.is_punctuation = False
            try:
                words = tokenizer.texts_to_words(replacable)
            finally:
                # Always switch the flag back on, even when tokenizing
                # raises; otherwise the tokenizer object would keep
                # punctuation handling disabled for later callers.
                tokenizer.is_punctuation = True
        if not words:
            return ''

        parts = []
        last_CJK = True
        match_count = 0
        for word_no, word in enumerate(words):
            if match_count > 0:
                # This word was already consumed by a multi-word match.
                match_count -= 1
                continue

            target_word = JapaneseLanguage.zenhan_normalize(word).upper()
            is_CJK = JapaneseLanguage.is_CJKword(target_word)
            if is_CJK is True:
                pairs = self._pairs_jp
                matchs = self._match_jp
                keyword = target_word[0]
            else:
                pairs = self._pairs
                matchs = self._match
                keyword = target_word

            if keyword in matchs:
                match_count, key = self.match(is_CJK, words, word_no,
                                              matchs[keyword])

            # The separator rule is the same for replacements and for words
            # that are copied through unchanged.
            if is_CJK is False or last_CJK != is_CJK:
                parts.append(' ')
            if match_count > 0:
                parts.append(pairs[key])
                match_count -= 1
            else:
                parts.append(word)
            last_CJK = is_CJK

        return ''.join(parts).strip()
示例#5
0
    def texts_to_words(self, texts):
        """Split ``texts`` into a list of words.

        Falsy input yields an empty list.  When ``_is_convert`` is True the
        text is first run through ``mojimoji.zen_to_han(kana=False)`` and the
        result through ``mojimoji.han_to_zen(digit=False, ascii=False)``;
        otherwise the text is used unchanged.

        If ``JapaneseLanguage.is_CJKword`` does not report the text as CJK,
        it is handed to ``_texts_to_words_en`` (the raw ``texts`` in template
        mode, the ``zen_to_han`` result otherwise).  CJK text in template
        mode goes to ``_template_texts_to_words_jp``.  Otherwise the text is
        cut into runs of consecutive CJK / non-CJK characters and each run is
        tokenized by ``_texts_to_words_jp`` or ``_texts_to_words_en``
        respectively, in order.
        """
        if not texts:
            return []

        if self._is_convert is True:
            han_texts = mojimoji.zen_to_han(texts, kana=False)
            zen_texts = mojimoji.han_to_zen(han_texts,
                                            digit=False,
                                            ascii=False)
        else:
            han_texts = zen_texts = texts

        if JapaneseLanguage.is_CJKword(zen_texts) is not True:
            source = han_texts if self._is_template is False else texts
            return self._texts_to_words_en(source)

        if self._is_template is not False:
            return self._template_texts_to_words_jp(texts)

        def split_run(run, run_is_cjk):
            # Tokenize one homogeneous run with the matching splitter.
            if run_is_cjk is True:
                return self._texts_to_words_jp(run)
            return self._texts_to_words_en(run)

        words = []
        run = ''
        run_is_cjk = JapaneseLanguage.is_CJKchar(zen_texts[0])
        for ch in zen_texts:
            ch_is_cjk = JapaneseLanguage.is_CJKchar(ch)
            if run_is_cjk != ch_is_cjk:
                # Character class changed: flush the finished run.
                words.extend(split_run(run, run_is_cjk))
                run_is_cjk = ch_is_cjk
                run = ''
            run += ch
        if len(run) > 0:
            words.extend(split_run(run, run_is_cjk))
        return words
示例#6
0
    def add_to_lookup(self, org_key, org_value):
        """Register a key/value replacement in the matching collection.

        The value is always stripped.  Three kinds of key are handled:

        * Keys that ``JapaneseLanguage.is_CJKword`` reports as CJK are
          stripped, stored in ``_pairs_jp`` and indexed in ``_match_jp`` by
          their first character.
        * Non-CJK keys that do NOT start with a space are turned into a
          compiled regular expression (built from
          ``DoubleStringPatternSplitCollection.normalise_pattern``) and
          appended to ``_replace`` as ``[key, compiled_pattern, value]``.
        * Non-CJK keys that start with a space are stripped, stored in
          ``_pairs`` and indexed in ``_match`` by their first word.

        An empty or whitespace-only key, or a key already present in
        ``_pairs_jp`` / ``_pairs``, is logged through ``YLogger.error`` and
        ignored.
        """
        key = org_key
        value = org_value.strip()

        if key.strip() == '':
            # Without this guard ``key[0]`` / ``splits[0]`` below would
            # raise IndexError for an empty or whitespace-only key.
            YLogger.error(self, "key is empty")
            return

        if JapaneseLanguage.is_CJKword(org_key) is True:
            key = key.strip()
            if key in self._pairs_jp:
                YLogger.error(self, "%s = %s already exists in jp_collection",
                              key, value)
                return
            matchs = self._match_jp
            splits = key
            check_key = key[0]
            self._pairs_jp[key] = value
        elif key[0] != ' ':
            # No leading space: register a regex based replacement.
            key = key.strip()
            pattern_text = DoubleStringPatternSplitCollection.normalise_pattern(
                key)
            # Alternatives: at the start of the text, anywhere, or at the
            # end of the text (with the respective side of the pattern
            # text trimmed).
            pattern = "(^%s|%s|%s$)" % (pattern_text.lstrip(), pattern_text,
                                        pattern_text.rstrip())
            self._replace.append([key, re.compile(pattern), value])
            return
        else:
            # Leading space: register a word based replacement.
            key = key.strip()
            if key in self._pairs:
                YLogger.error(self,
                              "%s = %s already exists in en_collection",
                              key, value)
                return
            matchs = self._match
            splits = key.split()
            check_key = splits[0]
            self._pairs[key] = value

        # Group every phrase under its index key.
        matchs.setdefault(check_key, []).append(splits)
示例#7
0
    def __init__(self, attribs, text, userid='*', element=None):
        """Build the node from a ``words`` attribute or from the element text.

        The ``words`` entry of ``attribs`` takes precedence over ``text``.
        A ``ParserException`` is raised when neither is available, or when
        ``_parse_words`` returns False for the chosen words.  ``_is_CJK``
        records whether the zenhan-normalized words are reported as CJK by
        ``JapaneseLanguage.is_CJKword``.  Each instance gets a name of the
        form ``iset_<n>`` taken from the class-level ``iset_count`` counter,
        which is then incremented.
        """
        PatternNode.__init__(self, userid)
        self._words = {}
        self._values = {}

        has_attribute = 'words' in attribs
        if not has_attribute and not text:
            raise ParserException("No words specified as attribute or text", xml_element=element, nodename='iset')
        words = attribs['words'] if has_attribute else text

        normalized = JapaneseLanguage.zenhan_normalize(words)
        self._is_CJK = JapaneseLanguage.is_CJKword(normalized)

        parsed = self._parse_words(words)
        if parsed is False:
            raise ParserException("empty element in words", xml_element=element, nodename='iset')

        # Take the current counter value as this node's id, then advance it.
        current = PatternISetNode.iset_count
        self._iset_name = "iset_%d" % current
        PatternISetNode.iset_count = current + 1
示例#8
0
    def add_to_lookup(self, org_key, org_value, filename=None, line=0):
        """Register a key/value replacement pair in the lookup tables.

        The key is stripped; the value is stored as given.  Keys that
        ``JapaneseLanguage.is_CJKword`` reports as CJK go into ``_pairs_jp``
        and are indexed in ``_match_jp`` by their first character; all other
        keys go into ``_pairs`` and are indexed in ``_match`` by their first
        whitespace-separated word.

        An empty key or a duplicate key is reported through
        ``set_error_info(filename, line, ...)`` (duplicates are also logged
        with ``YLogger.error``) and nothing is stored.
        """
        key = org_key.strip()
        value = org_value

        if not key:
            self.set_error_info(filename, line, "key is empty")
            return

        def report_duplicate(log_format):
            # Log the clash and record it against the source location.
            YLogger.error(self, log_format, key, value)
            self.set_error_info(
                filename, line,
                "duplicate key='%s' (value='%s' is invalid)" % (key, value))

        if JapaneseLanguage.is_CJKword(org_key) is True:
            if key in self._pairs_jp:
                report_duplicate("%s = %s already exists in jp_collection")
                return
            matchs = self._match_jp
            splits = key
            check_key = key[0]
            self._pairs_jp[key] = value
        else:
            if key in self._pairs:
                report_duplicate("%s = %s already exists in en_collection")
                return
            matchs = self._match
            splits = key.split()
            check_key = splits[0]
            self._pairs[key] = value

        # Group every phrase under its index key.
        matchs.setdefault(check_key, []).append(splits)
示例#9
0
 def check_cjk(self, is_cjk, value):
     """Return the CJK flag, upgrading it to True when ``value`` is CJK.

     ``value`` is only inspected when ``is_cjk`` is exactly False: it is
     passed through ``JapaneseLanguage.zenhan_normalize`` and the result is
     tested with ``JapaneseLanguage.is_CJKword``.  Any other ``is_cjk``
     is returned unchanged.
     """
     if is_cjk is not False:
         return is_cjk
     normalized = JapaneseLanguage.zenhan_normalize(value)
     if JapaneseLanguage.is_CJKword(normalized) is True:
         return True
     return is_cjk
示例#10
0
    def add_to_lookup(self, org_key, org_value, filename=None, line=0):
        """Register a key/value replacement in the matching collection.

        The value is always stripped.  Three kinds of key are handled:

        * Keys that ``JapaneseLanguage.is_CJKword`` reports as CJK are
          stripped, stored in ``_pairs_jp`` and indexed in ``_match_jp`` by
          their first character.
        * Non-CJK keys that do NOT start with a space are turned into a
          compiled regular expression (built from
          ``DoubleStringPatternSplitCollection.normalise_pattern``),
          appended to ``_replace`` as ``[key, compiled_pattern, value]`` and
          remembered in ``_replace_key``.
        * Non-CJK keys that start with a space are stripped, stored in
          ``_pairs`` and indexed in ``_match`` by their first word.

        An empty key or a duplicate key is reported through
        ``set_error_info(filename, line, ...)`` (duplicates are also logged
        with ``YLogger.error``) and nothing is stored.
        """
        value = org_value.strip()
        key = org_key.strip()

        if not key:
            self.set_error_info(filename, line, "key is empty")
            return

        def report_duplicate(log_format, info_format):
            # Log the clash and record it against the source location.
            YLogger.error(self, log_format, key, value)
            self.set_error_info(filename, line, info_format % (key, value))

        if JapaneseLanguage.is_CJKword(org_key) is True:
            if key in self._pairs_jp:
                report_duplicate(
                    "%s = %s already exists in jp_collection",
                    "duplicate key='%s' (value='%s' is invalid)")
                return
            matchs = self._match_jp
            splits = key
            check_key = key[0]
            self._pairs_jp[key] = value
        elif org_key[0] != ' ':
            # No leading space in the original key: regex based replacement.
            if key in self._replace_key:
                report_duplicate(
                    "%s = %s already exists in replace_collection",
                    "duplicate replace_chars='%s' (value='%s' is invalid)")
                return
            pattern_text = DoubleStringPatternSplitCollection.normalise_pattern(
                key)
            # Alternatives: at the start of the text, anywhere, or at the
            # end of the text (with the respective side of the pattern
            # text trimmed).
            pattern = "(^%s|%s|%s$)" % (pattern_text.lstrip(), pattern_text,
                                        pattern_text.rstrip())
            self._replace.append([key, re.compile(pattern), value])
            self._replace_key.append(key)
            return
        else:
            # Leading space in the original key: word based replacement.
            if key in self._pairs:
                report_duplicate(
                    "%s = %s already exists in en_collection",
                    "duplicate key='%s' (value='%s' is invalid)")
                return
            matchs = self._match
            splits = key.split()
            check_key = splits[0]
            self._pairs[key] = value

        # Group every phrase under its index key.
        matchs.setdefault(check_key, []).append(splits)