def remove_house_token(string):
    """Blank out a house token when the next real word is a number.

    The text is split with ``get_words(string, True)``.  Every word except
    the last one is checked against ``HOUSE_TOKENS``.  For a matching word
    the following words are scanned, skipping everything ``is_separator``
    accepts; if the first word that is not a separator consists only of
    digits, the token is replaced with an empty string.  Finally the words
    are concatenated back together with no delimiter.

    :param string: text to process.
    :return: the re-joined text with qualifying house tokens removed.
    """
    # NOTE(review): presumably get_words(..., True) keeps separators as list
    # items, since the result is re-joined with u''.join -- confirm.
    words = get_words(string, True)

    # The last word has nothing after it, so it is never examined.  A plain
    # index range replaces the former ``index not in xrange(...)`` test,
    # which walked the xrange on every iteration under Python 2 and does not
    # exist under Python 3.
    for index in range(len(words) - 1):
        if words[index] not in HOUSE_TOKENS:
            continue
        for following in words[index + 1:]:
            if is_separator(following):
                continue
            if following.isdigit():
                words[index] = u''
            # NOTE(review): nesting was reconstructed from whitespace-collapsed
            # source.  The scan is taken to stop at the first non-separator
            # word whether or not it is a number -- confirm against the
            # original layout.
            break
    return u''.join(words)
def replace_house_number_tokens(string):
    """Normalise house-number tokens that are followed by a number-like word.

    The text is split with ``get_words(string, True)``.  Every word except
    the last one is checked against ``HOUSE_NUMBER_TOKENS``.  For a matching
    word the following words are scanned, skipping separators; when a word
    accepted by ``is_digit_or_one_alpha`` is found, the token is replaced
    with the canonical abbreviation literal and a comma is inserted directly
    after the nearest preceding non-separator word, unless a ``u','`` word is
    met first while walking backwards.

    Changes compared with the previous implementation:

    * a token at index 0 no longer triggers a backward scan; the old slice
      ``words[index-1::-1]`` wrapped around to the end of the list there;
    * the ``index not in xrange(...)`` bound check (a linear scan per
      iteration under Python 2) is replaced with a plain index range;
    * the token test is a direct membership check, so a token listed twice
      in ``HOUSE_NUMBER_TOKENS`` no longer runs the replacement twice.

    :param string: text to process.
    :return: the re-joined text.
    """
    words = get_words(string, True)

    # The last word has nothing after it, so it is never examined.
    for index in range(len(words) - 1):
        if words[index] not in HOUSE_NUMBER_TOKENS:
            continue

        # NOTE(review): nesting was reconstructed from whitespace-collapsed
        # source.  As reconstructed, this forward scan has no terminating
        # break, so every later number-like word repeats the replacement and
        # the comma insertion.  Confirm whether a break belongs here.
        for next_word in words[index + 1:]:
            if is_separator(next_word):
                continue
            if is_digit_or_one_alpha(next_word):
                words[index] = u'корп.'

                # Words before the token, nearest first.  Nothing precedes
                # index 0, and ``words[-1::-1]`` would be the whole list
                # reversed, so that case gets an empty scan.
                preceding = words[index - 1::-1] if index else []
                for offset, previous_word in enumerate(preceding):
                    if previous_word == u',':
                        # A comma is met before any real word: add nothing.
                        break
                    if is_separator(previous_word):
                        continue
                    # offset 0 is words[index - 1], so words[index - offset]
                    # is the item right after ``previous_word``; prefixing it
                    # puts the comma immediately behind that word.
                    words[index - offset] = u',%s' % words[index - offset]
                    break
    return u''.join(words)
def _get_number(self, words, word_index, *args):
    """Read a number made of a digit word and single letters.

    Scanning starts at ``words[word_index]`` and stops at the first word
    found in ``BREAK_WORDS`` or at the first word that is neither a
    separator, an all-digit word, nor a single alphabetic character.

    :param words: sequence of words to scan.
    :param word_index: position in ``words`` where scanning starts.
    :param args: accepted and ignored.
    :return: tuple ``(number, number_indexes)`` where ``number`` is the
        collected text and ``number_indexes`` lists the positions in
        ``words`` of every word consumed, separators included.
    """
    collected = u''
    consumed = []
    for offset, token in enumerate(words[word_index:]):
        if token in BREAK_WORDS:
            break
        if is_separator(token):
            # Separators add nothing to the number; only their position
            # is recorded below.
            pass
        elif token.isdigit():
            # A digit word replaces whatever has been collected so far.
            collected = token
        elif len(token) == 1 and token.isalpha():
            # A single letter is appended to the current number.
            collected += token
        else:
            break
        consumed.append(word_index + offset)
    return collected, consumed
def rearranged_words(string):
    """Swap listed words with a preceding capitalised word, then tidy up.

    The text is split with ``get_words(string, True)``.  For every word equal
    to an entry of ``REARRANGED_WORDS`` the list is walked backwards from that
    word: the walk ends at the start of the list or at a ``','`` word,
    separators are skipped, and the first other word is swapped with the
    current word if its first character is upper case.  The words are then
    joined with no delimiter and every key of
    ``REARRANGED_WORDS_REPLACEMENT_TOKENS`` is replaced by its value until it
    no longer occurs in the text.

    :param string: text to process.
    :return: the rearranged text after the replacement pass.
    """
    words = get_words(string, True)
    for index, word in enumerate(words):
        for token in REARRANGED_WORDS:
            if token == word:
                # Scan backwards from the matched word.
                previous_word_index = index
                while True:
                    previous_word_index -= 1
                    if 0 > previous_word_index:
                        # Ran off the start of the list.
                        break
                    previous_word = words[previous_word_index]
                    if previous_word == ',':
                        # Never look past a comma.
                        break
                    if is_separator(previous_word):
                        continue
                    if previous_word[0].isupper():
                        words[index], words[previous_word_index] = words[previous_word_index], words[index]
                    # NOTE(review): nesting was reconstructed from
                    # whitespace-collapsed source.  The walk is taken to stop
                    # at the first non-separator word whether or not a swap
                    # happened -- confirm against the original layout.
                    break
    string = u''.join(words)
    # Each key is replaced repeatedly until it no longer occurs.
    # NOTE(review): this inner loop cannot terminate if a replacement value
    # contains its own key -- verify the table contents.
    for token in REARRANGED_WORDS_REPLACEMENT_TOKENS:
        while token in string:
            string = string.replace(token, REARRANGED_WORDS_REPLACEMENT_TOKENS[token])
    return string