Example #1
from nltk.tokenize import sent_tokenize

def normalize_content(document):
    # Split the document into sentences, normalize each non-empty one,
    # and join the normalized sentences back into a single string.
    sentences = sent_tokenize(document)
    sentences = [
        normalize_sentence(sentence) for sentence in sentences
        if StringUtils.is_not_empty(sentence)
    ]
    return ' '.join(sentences)
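StringUtils is not part of the Python standard library and is not defined in this listing; a minimal sketch of the helper the examples appear to assume, treating a string as non-empty when it is not None and contains at least one non-whitespace character:

# Assumed helper, not shown in the original listing.
class StringUtils:
    @staticmethod
    def is_not_empty(text):
        # True when text is a non-None string with non-whitespace content.
        return bool(text and text.strip())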
Example #2
from nltk.tokenize import word_tokenize

def normalize_sentence(sentence):
    # Lowercase the sentence and treat slashes as word separators.
    sentence = sentence.lower().replace('/', ' ')
    # Tokenize into words and normalize each non-empty token.
    words = word_tokenize(sentence)
    words = [
        normalize_word(word) for word in words
        if StringUtils.is_not_empty(word)
    ]
    sentence = ' '.join(words)
    # Trim surrounding whitespace and make sure the sentence ends with a dot.
    sentence = add_dot_at_end_of_line(sentence.strip(' \t\n\r'))
    return sentence
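The remaining helpers, normalize_word and add_dot_at_end_of_line, are also not shown here; a plausible sketch under the assumption that word normalization only strips surrounding punctuation and lowercases, which may differ from the original implementation:

import string

def normalize_word(word):
    # Hypothetical: strip surrounding punctuation and lowercase the token.
    return word.strip(string.punctuation).lower()

def add_dot_at_end_of_line(sentence):
    # Hypothetical: ensure the sentence ends with a period.
    return sentence if sentence.endswith('.') else sentence + '.'

# With these sketch helpers (and NLTK's 'punkt' tokenizer data installed):
#   normalize_content("Parse config/files first")  ->  'parse config files first.'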