예제 #1
0
파일: locale.py 프로젝트: BlueWhale0-0/KYXL
 def translate_search(self, search_string, settings=None):
     """Translate the recognisable parts of ``search_string``.

     The string is split into sentences and each sentence into aligned
     original/simplified tokens. Consecutive tokens that can be handled
     (blank tokens, dictionary words, dictionary words wrapped in
     punctuation, tokens accepted by ``_token_with_digits_is_ok`` and
     timezone words following an already started chunk) are collected
     into a chunk; any other token closes the current chunk.

     :param search_string: text to scan.
     :param settings: passed through unchanged to the helper methods.
     :return: tuple ``(translated, original)`` of two parallel lists; each
         entry is one chunk joined by ``_join_chunk``, the first list
         holding the translated form and the second the source text.
     """
     dashes = ('-', '——', '—', '~')
     # Punctuation that may wrap a token and is ignored for the lookup.
     strip_chars = '()"\'{}[],.،'
     sentences = self._sentence_split(search_string, settings=settings)
     dictionary = self._get_dictionary(settings=settings)
     translated = []
     original = []
     for sentence in sentences:
         original_tokens, simplified_tokens = self._simplify_split_align(
             sentence, settings=settings)
         translated_chunk = []
         original_chunk = []
         for i, word in enumerate(simplified_tokens):
             is_dash = word in dashes
             stripped = word.strip(strip_chars)
             if word == '' or word == ' ':
                 translated_chunk.append(word)
                 original_chunk.append(original_tokens[i])
             elif word in dictionary and not is_dash:
                 translated_chunk.append(dictionary[word])
                 original_chunk.append(original_tokens[i])
             elif stripped in dictionary and not is_dash:
                 translation = dictionary[stripped]
                 # Re-attach only the trailing punctuation. Measuring
                 # against rstrip() keeps the slice correct when the token
                 # also starts with punctuation, e.g. '(word)'; slicing at
                 # len(stripped) would leak the word's last letters.
                 punct = word[len(word.rstrip(strip_chars)):]
                 if punct and translation:
                     translated_chunk.append(translation + punct)
                 else:
                     translated_chunk.append(translation)
                 original_chunk.append(original_tokens[i])
             elif self._token_with_digits_is_ok(word):
                 translated_chunk.append(word)
                 original_chunk.append(original_tokens[i])
             # Use original token because word_is_tz is case sensitive
             elif translated_chunk and word_is_tz(original_tokens[i]):
                 translated_chunk.append(word)
                 original_chunk.append(original_tokens[i])
             elif translated_chunk:
                 # Unrecognised token: close the chunk collected so far.
                 translated.append(translated_chunk)
                 translated_chunk = []
                 original.append(original_chunk)
                 original_chunk = []
         if translated_chunk:
             translated.append(translated_chunk)
             original.append(original_chunk)
     for i, chunk in enumerate(translated):
         if "in" in chunk:
             chunk = self._clear_future_words(chunk)
         # Empty tokens are dropped before joining.
         translated[i] = self._join_chunk([tok for tok in chunk if tok],
                                          settings=settings)
         original[i] = self._join_chunk([tok for tok in original[i] if tok],
                                        settings=settings)
     return translated, original
예제 #2
0
    def translate_search(self, search_string, settings=None):
        """Translate the recognisable parts of ``search_string``.

        The string is split into sentences and each sentence into aligned
        original/simplified tokens. Consecutive tokens that can be handled
        are collected into a chunk; any other token closes the current
        chunk. A token is handled when it is blank, forms a dictionary
        entry together with the following token, is a dictionary word
        (optionally wrapped in punctuation), is accepted by
        ``_token_with_digits_is_ok``, or is a timezone word following an
        already started chunk.

        :param search_string: text to scan.
        :param settings: passed through unchanged to the helper methods.
        :return: tuple ``(translated, original)`` of two parallel lists;
            each entry is one chunk joined by ``_join_chunk``, the first
            list holding the translated form and the second the source
            text.
        """
        dashes = ('-', '——', '—', '~')
        # The two-token lookup is disabled when self.shortname is listed.
        word_joint_unsupported_languages = ("zh", "ja")
        # Punctuation that may wrap a token and is ignored for the lookup.
        strip_chars = '()"\'{}[],.،'
        sentences = self._sentence_split(search_string, settings=settings)
        dictionary = self._get_dictionary(settings=settings)
        translated = []
        original = []
        for sentence in sentences:
            original_tokens, simplified_tokens = self._simplify_split_align(
                sentence, settings=settings)
            translated_chunk = []
            original_chunk = []
            last_token_index = len(simplified_tokens) - 1
            skip_next_token = False
            for i, word in enumerate(simplified_tokens):
                if skip_next_token:
                    # Already consumed as the second half of a pair.
                    skip_next_token = False
                    continue

                has_next = i < last_token_index
                next_word = simplified_tokens[i + 1] if has_next else ""
                current_and_next_joined = self._join_chunk(
                    [word, next_word], settings=settings)
                is_dash = word in dashes
                stripped = word.strip(strip_chars)

                if word == '' or word == ' ':
                    translated_chunk.append(word)
                    original_chunk.append(original_tokens[i])
                # has_next guards original_tokens[i + 1]: without it the
                # last token raised IndexError whenever joining it with ""
                # produced a dictionary key.
                elif (has_next
                      and current_and_next_joined in dictionary
                      and not is_dash and self.shortname
                      not in word_joint_unsupported_languages):
                    translated_chunk.append(
                        dictionary[current_and_next_joined])
                    original_chunk.append(
                        self._join_chunk(
                            [original_tokens[i], original_tokens[i + 1]],
                            settings=settings))
                    skip_next_token = True
                elif word in dictionary and not is_dash:
                    translated_chunk.append(dictionary[word])
                    original_chunk.append(original_tokens[i])
                elif stripped in dictionary and not is_dash:
                    translation = dictionary[stripped]
                    # Re-attach only the trailing punctuation. Measuring
                    # against rstrip() keeps the slice correct when the
                    # token also starts with punctuation, e.g. '(word)';
                    # slicing at len(stripped) would leak the word's last
                    # letters.
                    punct = word[len(word.rstrip(strip_chars)):]
                    if punct and translation:
                        translated_chunk.append(translation + punct)
                    else:
                        translated_chunk.append(translation)
                    original_chunk.append(original_tokens[i])
                elif self._token_with_digits_is_ok(word):
                    translated_chunk.append(word)
                    original_chunk.append(original_tokens[i])
                # Use original token because word_is_tz is case sensitive
                elif translated_chunk and word_is_tz(original_tokens[i]):
                    translated_chunk.append(word)
                    original_chunk.append(original_tokens[i])
                elif translated_chunk:
                    # Unrecognised token: close the chunk collected so far.
                    translated.append(translated_chunk)
                    translated_chunk = []
                    original.append(original_chunk)
                    original_chunk = []
            if translated_chunk:
                translated.append(translated_chunk)
                original.append(original_chunk)
        for i, chunk in enumerate(translated):
            if "in" in chunk:
                chunk = self._clear_future_words(chunk)
            # Empty tokens are dropped before joining.
            translated[i] = self._join_chunk([tok for tok in chunk if tok],
                                             settings=settings)
            original[i] = self._join_chunk(
                [tok for tok in original[i] if tok], settings=settings)
        return translated, original