import logging
import os
import re
from collections import Counter

from symspellpy import SymSpell, Verbosity

# noise_words: a module-level collection of names to skip, defined elsewhere
# in the project.


class SpellCheck:
    def __init__(self, progress, directory, countries_dict):
        self.progress = progress
        self.logger = logging.getLogger(__name__)
        self.spelling_update = Counter()
        self.directory = directory
        self.spell_path = os.path.join(self.directory, 'spelling.pkl')
        self.countries_dict = countries_dict
        self.sym_spell = SymSpell()

    def insert(self, name, iso):
        # Count every token longer than two characters, unless the name is noise.
        if 'gothland cemetery' not in name and name not in noise_words:
            for word in name.split(' '):
                if len(word) > 2:
                    self.spelling_update[word] += 1

    def write(self):
        # Create a blank spelling dictionary
        path = os.path.join(self.directory, 'spelling.tmp')
        with open(path, 'w') as fl:
            fl.write('the,1\n')
        success = self.sym_spell.create_dictionary(corpus=path)
        if not success:
            self.logger.error('error creating spelling dictionary')
        self.logger.info('Building Spelling Dictionary')
        # Add all words from geonames into the spelling dictionary
        for key in self.spelling_update:
            self.sym_spell.create_dictionary_entry(key=key, count=self.spelling_update[key])
        self.logger.info('Writing Spelling Dictionary')
        self.sym_spell.save_pickle(self.spell_path)

    def read(self):
        success = False
        if os.path.exists(self.spell_path):
            self.logger.info(f'Loading Spelling Dictionary from {self.spell_path}')
            success = self.sym_spell.load_pickle(self.spell_path)
        else:
            self.logger.error(f'spelling dictionary not found: {self.spell_path}')
        if not success:
            self.logger.error(f'error loading spelling dictionary from {self.spell_path}')
        else:
            self.sym_spell.delete_dictionary_entry(key='gothland')
            size = len(self.sym_spell.words)
            self.logger.info(f'Spelling Dictionary contains {size} words')

    def lookup(self, input_term):
        if '*' in input_term:
            return input_term
        if len(input_term) > 1:
            res = ''
            suggestions = self.sym_spell.lookup(input_term, Verbosity.CLOSEST,
                                                max_edit_distance=2,
                                                include_unknown=True)
            # Keep at most the first four suggestions.
            for idx, item in enumerate(suggestions):
                if idx > 3:
                    break
                if item.term[0] == input_term[0]:
                    # Only accept results where the first letter matches
                    res += item.term + ' '
            return res
        else:
            return input_term

    def lookup_compound(self, phrase):
        suggestions = self.sym_spell.lookup_compound(phrase=phrase,
                                                     max_edit_distance=2,
                                                     ignore_non_words=False)
        for item in suggestions:
            self.logger.debug(f'{item.term}')
        return suggestions[0].term

    def fix_spelling(self, text):
        new_text = text
        if re.search(r'\d', text):
            # Has digits; just return the text with no spellcheck
            pass
        elif 'st ' in text:
            # Spellcheck does not handle "St " properly
            pass
        elif len(text) > 0:
            new_text = self.lookup(text)
            self.logger.debug(f'Spell {text} -> {new_text}')
        return new_text.strip(' ')
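# Usage sketch (illustrative; not part of the original source). Assumes a
# writable working directory and that the module-level `noise_words`
# collection is defined; `progress` is unused here and passed as None.
def _spellcheck_demo(directory='.'):
    checker = SpellCheck(progress=None, directory=directory, countries_dict={})
    # Feed a few place names into the word-frequency counter.
    for place in ('springfield cemetery', 'greenfield cemetery'):
        checker.insert(place, iso='us')
    checker.write()   # build the SymSpell dictionary and pickle it
    checker.read()    # reload it from spelling.pkl
    # Digits and 'st ' phrases pass through unchanged; other text is looked up.
    return checker.fix_spelling('sprngfield')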
from copy import deepcopy

from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize import sent_tokenize, word_tokenize

# config (the project settings module) and NGRAM (the n-gram order) are
# assumed to be provided elsewhere in the project.


class Postprocessor:
    def __init__(self):
        self.sym_spell = None
        # Constants
        self.max_edit_distance_dictionary = config.max_edit_distance_dictionary
        self.prefix_length = 7
        self.term_index = 0
        self.count_index = 1
        self.dir_path_vocabulary = os.path.join(os.path.dirname(__file__),
                                                '../docs/vocabularies/')
        self.dir_path_output = os.path.join(os.path.dirname(__file__),
                                            '../docs/outputs/')
        self.suggestion_verbosity = config.suggestion_verbosity

    def build_symspell(self):
        self.sym_spell = SymSpell(self.max_edit_distance_dictionary,
                                  self.prefix_length)

    def save_symspell(self, filename):
        if not self.sym_spell:
            raise ValueError('SymSpell not created')
        print('saving symspell...')
        self.sym_spell.save_pickle(self.dir_path_vocabulary + filename)
        print('symspell saved\n')

    def load_vocabulary(self, vocabulary_name):
        if not self.sym_spell:
            self.build_symspell()
        if not vocabulary_name:
            raise ValueError('Vocabulary is required')
        print('loading vocabulary', config.vocabulary)
        extension = os.path.splitext(vocabulary_name)[1].lower()
        if extension == '.txt':
            if not self.sym_spell.load_dictionary(
                    self.dir_path_vocabulary + vocabulary_name,
                    self.term_index, self.count_index, encoding='utf8'):
                raise ValueError('Dictionary file not found')
        elif extension == '.pkl':
            if not self.sym_spell.load_pickle(self.dir_path_vocabulary + vocabulary_name):
                raise ValueError('Dictionary file not found')
        else:
            raise ValueError('Unsupported file extension')
        print('vocabulary loaded\n')

    def tokenize_text(self, text):
        tokenized_text = [list(word_tokenize(sent)) for sent in sent_tokenize(text)]
        train_data, padded_sents = padded_everygram_pipeline(NGRAM, tokenized_text)
        return padded_sents
    def tokenize_by_sent(self, text):
        # Tokenize the text by sentence and then by word; each word is lowercased.
        tokenized_text = [list(map(str.lower, sent_tokenize(text)))]
        return tokenized_text

    def have_errors(self, word, vocabulary, tokens, i):
        def is_non_alphanumeric(s):
            return all(not c.isalnum() for c in s)

        def is_first_word_paragraph(tokens, i):
            return i != 0 and tokens[i - 1] == '<s>'

        def has_format_like_date(text):
            pattern = re.compile(r"^(\d+)$|^((\w+/){2})\w+$")
            return bool(pattern.match(text))

        def has_orthography_error(word):
            return word not in ['<s>', '</s>'] and \
                not is_non_alphanumeric(word) and not word.isnumeric() and \
                word not in vocabulary

        word_lower = str.lower(word)
        if "//||" in word or "||//" in word:
            return False
        elif config.correct_upper_case_first_letter and config.correct_upper_case:
            return has_orthography_error(word_lower) and \
                not has_format_like_date(word_lower)
        elif config.correct_upper_case_first_letter and not config.correct_upper_case:
            return has_orthography_error(word_lower) and \
                not (word.isupper() and not any(char.isdigit() for char in word)) and \
                not has_format_like_date(word_lower)
        elif not config.correct_upper_case_first_letter and config.correct_upper_case:
            return has_orthography_error(word_lower) and \
                (not word[0].isupper() or is_first_word_paragraph(tokens, i)) and \
                not has_format_like_date(word_lower)
        else:
            return has_orthography_error(word_lower) and \
                (not word.isupper() and not any(char.isdigit() for char in word)) and \
                (not word[0].isupper() or is_first_word_paragraph(tokens, i)) and \
                not has_format_like_date(word_lower)

    def find_correction_candidates(self, error_word):
        error_word_low = str.lower(error_word)
        max_edit_distance_lookup = config.edit_distance_conf(len(error_word_low))
        suggestions = self.sym_spell.lookup(error_word_low,
                                            self.suggestion_verbosity,
                                            max_edit_distance_lookup)
        return [{"value": s.term, "distance": s.distance} for s in suggestions]

    def process_with_edit_distance_and_google_language_model(self, text, language_model):
        # result = {
        #     "corrected_text": 'corrected text',
        #     "tokens": [
        #         {"value": "Good", "isError": False},
        #         {"value": "Bad", "isError": True, "correction": "Badly", "suggestions": []}
        #     ]
        # }
        if not self.sym_spell:
            raise ValueError('SymSpell not initialized')
        print('processing with edit distance and the ' + config.language_model + ' language model')
        result = {"tokens": [], "corrected_text": ""}
        vocabulary = self.sym_spell.words
        tokenized_text = list(deepcopy(self.tokenize_text(text)))
        tokenized_corrected_text = list(deepcopy(self.tokenize_text(text)))
        for i, word in enumerate(tokenized_text):
            if self.have_errors(word, vocabulary, tokenized_text, i):
                # Find nearby candidates for correcting the misspelled word
                candidates = self.find_correction_candidates(word)
                # Collect the words preceding the misspelled word (the context),
                # skipping sentence start/end markers
                previous_words = [
                    w for w in tokenized_corrected_text[i - NGRAM + 1:i]
                    if w not in ['<s>', '</s>']
                ]
                # While no candidate is found, keep searching with a smaller and
                # smaller context. Note: searching this way gives priority to
                # larger n-grams.
                corrected_word = word
                while corrected_word == word:
                    corrected_word, scored_suggestions = language_model.get_language_model_correction(
                        word, previous_words, candidates, config.context_direction)
                    # Once every possible context (down to unigrams) has been
                    # tried, or a candidate was found, exit the loop.
                    if not previous_words or corrected_word != word:
                        break
                    previous_words.pop(0)
                # Update the tokenized text to account for the corrections
                tokenized_corrected_text[i] = corrected_word
                result["corrected_text"] += corrected_word + ' '
                result["tokens"].append({
                    "previous_words": previous_words,
                    "value": word,
                    "isError": True,
                    "suggestions": scored_suggestions,
                    "correction": corrected_word
                })
            elif word not in ['<s>', '</s>']:
                result["corrected_text"] += word + ' '
                result["tokens"].append({
                    "value": word,
                    "isError": False,
                    "correction": word
                })
            if i and i % 20 == 0:
                result["corrected_text"] += '\n'
        print('processing finished\n')
        return result

    def clean_context(self, context, vocabulary, iterate_backwards=False):
        def normalize(word):
            # Keep only alphanumeric characters, then lowercase
            return re.sub(r"[\W\d_]+", '', word).lower()

        new_context = []
        if iterate_backwards:
            context.reverse()
        for word in context:
            if config.language_model == '1_billion' and word in ['<s>', '</s>']:
                new_context.append(word)
            elif (word in vocabulary or
                  word.upper() in vocabulary or
                  word.lower() in vocabulary or
                  word.capitalize() in vocabulary):
                if word not in ['<s>', '</s>']:
                    new_context.append(word)
                else:
                    new_context.append(word.upper())
            elif normalize(word) in vocabulary:
                new_context.append(normalize(word))
            else:
                break
        if iterate_backwards:
            new_context.reverse()
        return new_context

    def get_contexts(self, tokenized_corrected_text, i, vocabulary):
        contexts = {}
        if config.context_direction in ("previous", "all"):
            # Collect the words preceding the misspelled word (the context),
            # skipping sentence start/end markers
            if config.language_model == '1_billion':
                previous_words = list(tokenized_corrected_text[i - NGRAM + 1:i])
            else:
                previous_words = [
                    w for w in tokenized_corrected_text[i - NGRAM + 1:i]
                    if w not in ['<s>', '</s>']
                ]
            previous_words = self.clean_context(previous_words, vocabulary,
                                                iterate_backwards=True)
            contexts["previous_words"] = previous_words
        if config.context_direction in ("forward", "all"):
            # Collect the words following the misspelled word, skipping
            # sentence start/end markers
            if config.language_model == '1_billion':
                forward_words = list(tokenized_corrected_text[i + 1:i + NGRAM])
            else:
                forward_words = [
                    w for w in tokenized_corrected_text[i + 1:i + NGRAM]
                    if w not in ['<s>', '</s>']
                ]
            forward_words = self.clean_context(forward_words, vocabulary)
            contexts["forward_words"] = forward_words
        if config.context_direction in ("middle", "all"):
            # Collect the words immediately before and after the misspelled
            # word, skipping sentence start/end markers
            if config.language_model == '1_billion':
                middle_previous_words = list(tokenized_corrected_text[i - NGRAM // 2:i])
                middle_forward_words = list(tokenized_corrected_text[i + 1:i + NGRAM // 2 + 1])
            else:
                middle_previous_words = [
                    w for w in tokenized_corrected_text[i - NGRAM // 2:i]
                    if w not in ['<s>', '</s>']
                ]
                middle_forward_words = [
                    w for w in tokenized_corrected_text[i + 1:i + NGRAM // 2 + 1]
                    if w not in ['<s>', '</s>']
                ]
            middle_previous_words = self.clean_context(middle_previous_words,
                                                       vocabulary,
                                                       iterate_backwards=True)
            middle_forward_words = self.clean_context(middle_forward_words, vocabulary)
            contexts["middle_previous_words"] = middle_previous_words
            contexts["middle_forward_words"] = middle_forward_words
        return contexts
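    # Worked example of the slicing above (illustrative, assuming NGRAM = 3):
    # for an error at i = 5 in
    #   tokens = ['<s>', 'the', 'cat', 'sat', 'on', 'teh', 'mat', '</s>'],
    # "previous" takes tokens[3:5] -> ['sat', 'on'];
    # "forward" takes tokens[6:8] -> ['mat', '</s>'] (markers are filtered out
    # for models other than 1_billion, leaving ['mat']);
    # "middle" takes tokens[4:5] -> ['on'] and tokens[6:7] -> ['mat'].
    # clean_context() then trims each context to words found in the vocabulary.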
    def get_correction_word(self, word, contexts, language_model, candidates):
        previous_words = []
        forward_words = []
        middle_previous_words = []
        middle_forward_words = []
        scored_suggestions_prev = []
        scored_suggestions_middle = []
        scored_suggestions_forw = []
        # Tallies of which context strategy won (kept for debugging)
        prev_win = 0
        middle_win = 0
        forw_win = 0
        prev_midd_win = 0
        prev_forw_win = 0
        midd_forw_win = 0
        equal_win = 0
        # PREVIOUS context
        if "previous_words" in contexts:
            previous_words = contexts["previous_words"]
            # While no candidate is found, keep searching with a smaller and
            # smaller context. Note: searching this way gives priority to
            # larger n-grams.
            corrected_word_prev = word
            while corrected_word_prev == word:
                corrected_word_prev, scored_suggestions_prev = language_model.get_language_model_correction(
                    word, previous_words, [], candidates, "previous")
                # Once every possible context (down to unigrams) has been
                # tried, or a candidate was found, exit the loop.
                if not previous_words or corrected_word_prev != word:
                    break
                previous_words.pop(0)
        # FORWARD context
        if "forward_words" in contexts:
            # The words following the misspelled word, with sentence markers
            # already skipped by get_contexts()
            forward_words = contexts["forward_words"]
            corrected_word_forw = word
            while corrected_word_forw == word:
                corrected_word_forw, scored_suggestions_forw = language_model.get_language_model_correction(
                    word, [], forward_words, candidates, "forward")
                # Once every possible context (down to unigrams) has been
                # tried, or a candidate was found, exit the loop.
                if not forward_words or corrected_word_forw != word:
                    break
                forward_words.pop()
        # MIDDLE context
        if "middle_previous_words" in contexts:
            # The words on each side of the misspelled word, with sentence
            # markers already skipped by get_contexts()
            middle_previous_words = contexts["middle_previous_words"]
            middle_forward_words = contexts["middle_forward_words"]
            corrected_word_middle = word
            while corrected_word_middle == word:
                corrected_word_middle, scored_suggestions_middle = language_model.get_language_model_correction(
                    word, middle_previous_words, middle_forward_words, candidates, "middle")
                # Once every possible context (down to unigrams) has been
                # tried, or a candidate was found, exit the loop.
                if not middle_forward_words or corrected_word_middle != word:
                    if not middle_previous_words or corrected_word_middle != word:
                        break
                if middle_forward_words:
                    middle_forward_words.pop()
                else:
                    middle_previous_words.pop()
        # The context that kept the most words wins; ties are broken by summing
        # the scores of the tied contexts and taking the maximum.
        corrected_word = word
        len_previous_words = len(previous_words)
        len_forward_words = len(forward_words)
        len_middle_words = len(middle_previous_words) + len(middle_forward_words)
        if len_previous_words > len_forward_words and len_previous_words > len_middle_words:
            # Previous context wins
            prev_win += 1
            corrected_word = corrected_word_prev
            scored_suggestions = scored_suggestions_prev
        elif len_forward_words > len_previous_words and len_forward_words > len_middle_words:
            # Forward context wins
            forw_win += 1
            corrected_word = corrected_word_forw
            scored_suggestions = scored_suggestions_forw
        elif len_middle_words > len_previous_words and len_middle_words > len_forward_words:
            # Middle context wins
            middle_win += 1
            corrected_word = corrected_word_middle
            scored_suggestions = scored_suggestions_middle
        elif len_previous_words == len_forward_words and len_previous_words > len_middle_words:
            # Previous and forward contexts tie
            prev_forw_win += 1
            corrected_word, scored_suggestions = self.sum_score_and_select_maximum(
                candidates, word, scored_suggestions_prev, scored_suggestions_forw)
        elif len_previous_words == len_middle_words and len_previous_words > len_forward_words:
            # Previous and middle contexts tie
            prev_midd_win += 1
            corrected_word, scored_suggestions = self.sum_score_and_select_maximum(
                candidates, word, scored_suggestions_prev, scored_suggestions_middle)
        elif len_middle_words == len_forward_words and len_middle_words > len_previous_words:
            # Middle and forward contexts tie
            midd_forw_win += 1
            corrected_word, scored_suggestions = self.sum_score_and_select_maximum(
                candidates, word, scored_suggestions_middle, scored_suggestions_forw)
        else:
            # Three-way tie: sum whichever context scores are available
            equal_win += 1
            scored_suggestions = []
            for n in range(len(candidates)):
                suggestion = {
                    "value": candidates[n]["value"],
                    "score": 0,  # in case no context produced a score
                    "distance": candidates[n]["distance"],
                }
                if scored_suggestions_prev:
                    suggestion["score"] += scored_suggestions_prev[n]['score']
                if scored_suggestions_middle:
                    suggestion["score"] += scored_suggestions_middle[n]['score']
                if scored_suggestions_forw:
                    suggestion["score"] += scored_suggestions_forw[n]['score']
                scored_suggestions.append(suggestion)
            # Take the candidate with the highest score
            if scored_suggestions:
                index_corrected_word = max(
                    range(len(scored_suggestions)),
                    key=lambda index: scored_suggestions[index]['score'])
                corrected_word = scored_suggestions[index_corrected_word]["value"]
            else:
                corrected_word = word
        return {
            "word": word,
            "corrected_word": corrected_word,
            "previous_words": previous_words,
            "forward_words": forward_words,
            "scored_suggestions": scored_suggestions
        }

    def process_with_edit_distance_and_language_model_with_direction(self, text, language_model):
        # result = {
        #     "corrected_text": 'corrected text',
        #     "tokens": [
        #         {"value": "Good", "isError": False},
        #         {"value": "Bad", "isError": True, "correction": "Badly", "suggestions": []}
        #     ]
        # }
        if not self.sym_spell:
            raise ValueError('SymSpell not initialized')
        print('processing with edit distance and the ' + config.language_model + ' language model')
        result = {"tokens": [], "corrected_text": ""}
        vocabulary = self.sym_spell.words
        tokenized_text = list(deepcopy(self.tokenize_text(text)))
        tokenized_corrected_text = list(deepcopy(self.tokenize_text(text)))
        for i, word in enumerate(tokenized_text):
            if self.have_errors(word, vocabulary, tokenized_text, i):
                # Find nearby candidates for correcting the misspelled word
                candidates = self.find_correction_candidates(word)
                contexts = self.get_contexts(tokenized_corrected_text, i, vocabulary)
                correction_and_parameters = self.get_correction_word(
                    word, contexts, language_model, candidates)
                word = correction_and_parameters["word"]
                corrected_word = correction_and_parameters["corrected_word"]
                previous_words = correction_and_parameters["previous_words"]
                forward_words = correction_and_parameters["forward_words"]
                scored_suggestions = correction_and_parameters["scored_suggestions"]
                # Update the tokenized text to account for the corrections
                tokenized_corrected_text[i] = corrected_word
                result["corrected_text"] += corrected_word + ' '
                result["tokens"].append({
                    "previous_words": previous_words,
                    "forwards_words": forward_words,
                    "value": word,
                    "isError": True,
                    "suggestions": scored_suggestions,
                    "correction": corrected_word
                })
            elif word not in ['<s>', '</s>']:
                if "//||" in word or "||//" in word:
                    clean_word = word.replace("//||", "").replace("||//", "")
                    result["corrected_text"] += clean_word + ' '
                    result["tokens"].append({
                        "value": word,
                        "isError": False,
                        "correction": clean_word
                    })
                else:
                    result["corrected_text"] += word + ' '
                    result["tokens"].append({
                        "value": word,
                        "isError": False,
                        "correction": word
                    })
            if i and i % 20 == 0:
                result["corrected_text"] += '\n'
        print('processing finished\n')
        return result

    def sum_score_and_select_maximum(self, candidates, word,
                                     score_suggestions_1, score_suggestions_2):
        scored_suggestions = []
        for n in range(len(candidates)):
            suggestion = {
                "value": candidates[n]["value"],
                "score": score_suggestions_1[n]['score'] + score_suggestions_2[n]['score'],
                "distance": candidates[n]["distance"],
            }
            scored_suggestions.append(suggestion)
        # Take the candidate with the highest score
        if scored_suggestions:
            index_corrected_word = max(
                range(len(scored_suggestions)),
                key=lambda index: scored_suggestions[index]['score'])
            corrected_word = scored_suggestions[index_corrected_word]["value"]
            return [corrected_word, scored_suggestions]
        else:
            return [word, scored_suggestions]

    def correct_errors_process(self, text, language_model):
        if config.language_model == 'elmo':
            return self.process_with_elmo(text, language_model)
        # 'google', '1_billion', and any other model use the directional
        # edit-distance pipeline.
        return self.process_with_edit_distance_and_language_model_with_direction(
            text, language_model)

    def process_with_elmo(self, text, language_model):
        print('processing with elmo...')
        print('context direction: ', config.context_direction)
        print()
        # Load the vocabulary
        vocabulary = []
        with open(os.path.dirname(os.path.realpath(__file__)) +
                  '/../docs/vocabularies/elmo_vocabulary.txt',
                  encoding='utf-8') as fin:
            for line in fin:
                vocabulary.append(line.strip())
        result = {"tokens": [], "corrected_text": ""}
        tokenized_text = list(deepcopy(self.tokenize_text(text)))
        tokenized_corrected_text = list(deepcopy(self.tokenize_text(text)))
        for i, word in enumerate(tokenized_text):
            if self.have_errors(word, vocabulary, tokenized_text, i):
                # Find nearby candidates for correcting the misspelled word
                candidates = self.find_correction_candidates(word)
                # Collect the words before or after the misspelled word (the context)
                if config.context_direction == 'forward':
                    context = list(tokenized_corrected_text[i + 1:i + NGRAM])
                    context = self.clean_context(context, vocabulary)
                else:
                    # 'previous' and any other direction use the preceding words
                    context = list(tokenized_corrected_text[i - NGRAM + 1:i])
                    context = self.clean_context(context, vocabulary,
                                                 iterate_backwards=True)
                # While no candidate is found, keep searching with a smaller and
                # smaller context. Note: searching this way gives priority to
                # larger n-grams.
                corrected_word = word
                while corrected_word == word:
                    corrected_word, scored_suggestions = language_model.get_elmo_correction(
                        word, context, candidates, vocabulary)
                    # Once every possible context (down to unigrams) has been
                    # tried, or a candidate was found, exit the loop.
                    if not context or corrected_word != word:
                        break
                    context.pop(0)
                # Update the tokenized text to account for the corrections
                tokenized_corrected_text[i] = corrected_word
                result["corrected_text"] += corrected_word + ' '
                result["tokens"].append({
                    "value": word,
                    "isError": True,
                    "suggestions": scored_suggestions,
                    "correction": corrected_word
                })
                if config.context_direction == 'forward':
                    result["tokens"][-1]["previous_words"] = ''
                    result["tokens"][-1]["forwards_words"] = context
                else:
                    result["tokens"][-1]["previous_words"] = context
                    result["tokens"][-1]["forwards_words"] = ''
            elif word not in ['<s>', '</s>']:
                result["corrected_text"] += word + ' '
                result["tokens"].append({
                    "value": word,
                    "isError": False,
                    "correction": word
                })
            if i and i % 20 == 0:
                result["corrected_text"] += '\n'
        print('processing finished\n')
        return result

    def process_with_regex(self, text):
        print('processing with regular expressions...')
        text = re.sub(r'\.-', '.', text)
        text = re.sub(r"''", '"', text)
        text = re.sub(r"``", '"', text)
        text = re.sub(r"’’", '"', text)
        # Replace characters that split tokens with one that does not
        # (and that does not appear with abby)
        text = re.sub(r"&", '½', text)
        text = re.sub(
            r"(\w)(\*\*)( |\n)",
            lambda match_obj: match_obj.group(1) + '-' + match_obj.group(3),
            text)
        # Remove empty lines
        lines = text.split("\n")
        non_empty_lines = [line for line in lines if line.strip() != ""]
        text = "".join(line + "\n" for line in non_empty_lines)
        print('regex processing finished\n')
        return text

    def align_documents(self, source, target):
        # Shell out to Bleualign to align the source and target documents.
        os.system('./bleualign.py -s ' + source + ' -t ' + target + ' -o ' + source)
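# Usage sketch (illustrative; not part of the original source): wires the
# pipeline together. The vocabulary filename is an assumption, and
# `language_model` must be an object exposing the get_language_model_correction
# (or get_elmo_correction) interface used above; `config` supplies
# language_model, context_direction, and the casing flags.
def _postprocessor_demo(raw_text, language_model):
    postprocessor = Postprocessor()
    postprocessor.load_vocabulary('vocabulary.txt')  # assumed term,count file in docs/vocabularies/
    cleaned_text = postprocessor.process_with_regex(raw_text)
    result = postprocessor.correct_errors_process(cleaned_text, language_model)
    return result["corrected_text"]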