Пример #1
0
def remove_house_token(string):
    """Drop house tokens that are directly followed by a number.

    The string is split with ``get_words(string, True)`` and every word
    found in ``HOUSE_TOKENS`` is blanked out when the first following word
    that is not a separator consists only of digits.  The (possibly
    modified) words are then concatenated back into a single string.

    NOTE(review): presumably ``get_words(..., True)`` keeps separators as
    list items so that joining the words restores the original spacing --
    confirm against the helper's definition.

    :param string: text to process.
    :return: the text with the matching house tokens removed.
    """
    words = get_words(string, True)
    # The last word has nothing after it, so it can never qualify; bounding
    # the loop directly avoids a linear ``index in xrange(...)`` scan per word.
    for index in range(len(words) - 1):
        if words[index] not in HOUSE_TOKENS:
            continue
        for next_word in words[index + 1:]:
            if is_separator(next_word):
                continue
            if next_word.isdigit():
                words[index] = u''
            # Only the first non-separator word after the token is examined.
            break
    return u''.join(words)
Пример #2
0
def replace_house_number_tokens(string):
    """Rewrite house-number tokens followed by a number as ``u'корп.'``.

    The string is split with ``get_words(string, True)``.  For every word
    (except the last one) found in ``HOUSE_NUMBER_TOKENS`` the following
    words are scanned, skipping separators, for the first one accepted by
    ``is_digit_or_one_alpha``.  When such a word exists the token is replaced
    with ``u'корп.'`` and a comma is inserted right after the closest
    preceding non-separator word, unless a comma is met first while walking
    backwards.

    NOTE(review): the forward scan does not stop at the first non-separator
    word that fails ``is_digit_or_one_alpha``; it keeps looking further
    ahead.  Confirm this is intended.

    :param string: text to process.
    :return: the text with the tokens rewritten.
    """
    words = get_words(string, True)
    # The last word has nothing after it, so it is never inspected.
    for index in range(len(words) - 1):
        if words[index] not in HOUSE_NUMBER_TOKENS:
            continue
        for next_word in words[index + 1:]:
            if is_separator(next_word):
                continue
            if not is_digit_or_one_alpha(next_word):
                continue
            words[index] = u'корп.'
            # Walk backwards over the words that really precede the token.
            # For index == 0 the range is empty, so no comma is produced
            # (a negative slice start would wrap around to the list's end).
            for previous_index in range(index - 1, -1, -1):
                previous_word = words[previous_index]
                if previous_word == u',':
                    break
                if is_separator(previous_word):
                    continue
                # Prefix the item that follows the previous word, which
                # places the comma immediately after that word.
                following = previous_index + 1
                words[following] = u',%s' % words[following]
                break
            # One rewrite per token: continuing the scan would insert a
            # further comma for every later number-like word.
            break
    return u''.join(words)
Пример #3
0
 def _get_number(self, words, word_index, *args):
     """Collect a number starting at ``word_index`` and the indexes it spans.

     Words are read from ``words[word_index:]`` until one is found in
     ``BREAK_WORDS`` or is neither a separator, an all-digit word, nor a
     single alphabetic character.  An all-digit word replaces the number
     collected so far, a single letter is appended to it, and separators
     leave it untouched; the index of every accepted word (separators
     included) is recorded.

     :param words: sequence of words to read from.
     :param word_index: position in ``words`` where reading starts.
     :param args: accepted but not used.
     :return: tuple ``(number, number_indexes)``.
     """
     collected = u''
     consumed = []
     position = word_index
     for candidate in words[word_index:]:
         if candidate in BREAK_WORDS:
             break
         if not is_separator(candidate):
             if candidate.isdigit():
                 # A new digit group discards whatever was collected before.
                 collected = candidate
             elif len(candidate) == 1 and candidate.isalpha():
                 collected += candidate
             else:
                 break
         consumed.append(position)
         position += 1
     return collected, consumed
Пример #4
0
def rearranged_words(string):
    """Swap selected words with a preceding capitalised word, then clean up.

    The string is split with ``get_words(string, True)``.  For each word
    equal to an entry of ``REARRANGED_WORDS`` the preceding words are
    examined from right to left: a ``','`` stops the search, separators are
    skipped, and the first other word is swapped with the current one when
    its first character is upper case.  After joining the words, every key
    of ``REARRANGED_WORDS_REPLACEMENT_TOKENS`` is replaced by its mapped
    value until it no longer occurs in the string.

    :param string: text to process.
    :return: the rearranged text.
    """
    words = get_words(string, True)
    for index, word in enumerate(words):
        for token in REARRANGED_WORDS:
            if token != word:
                continue
            for candidate_index in range(index - 1, -1, -1):
                candidate = words[candidate_index]
                if candidate == ',':
                    break
                if is_separator(candidate):
                    continue
                if candidate[0].isupper():
                    moved = words[candidate_index]
                    words[candidate_index] = words[index]
                    words[index] = moved
                # Only the nearest non-separator word is considered.
                break

    result = u''.join(words)
    for token in REARRANGED_WORDS_REPLACEMENT_TOKENS:
        # NOTE(review): this repeats until the token is gone, so it assumes
        # no replacement value contains its own token -- confirm.
        while token in result:
            result = result.replace(token, REARRANGED_WORDS_REPLACEMENT_TOKENS[token])
    return result