Example #1
import re

from nltk.stem import RSLPStemmer


def pre_process(description):
    '''
    pre-processes the description
    '''

    # compile a regex matching every character that is NOT a plain or
    # accented Latin letter (Basic Latin, Latin-1, Latin Extended-A) or a space
    vanilla = (u'[^\u0041-\u005A'
               u'\u0061-\u007A'
               u'\u00C0-\u00D6'
               u'\u00D8-\u00F6'
               u'\u00F8-\u00FF'
               u'\u0100-\u017F'
               u'\u0020]')

    regex = re.compile(vanilla)

    # force a UTF-8 round-trip and lowercase everything
    description = description.encode('utf8').decode('utf8')
    lowercased = description.lower()

    # replace special characters and digits with spaces
    regexed = regex.sub(' ', lowercased)

    # split into words
    tokenized = regexed.split()

    # reduce plurals to singular (RSLP rule set 0 is the plural-reduction step)
    st = RSLPStemmer()
    singularized = [st.apply_rule(token, 0) for token in tokenized]

    # drop words with fewer than 2 characters
    # and merge the remaining words back into a single string
    remerged = ''
    for word in singularized:
        if len(word) > 1:
            remerged += word + ' '

    return remerged
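
A minimal usage sketch (not part of the original example): it assumes nltk is installed, that the 'rslp' stemmer data is available (fetched here via nltk.download, in case it is missing), and that pre_process is defined as above. The sample string and the expected output are illustrative only.

import nltk

nltk.download('rslp', quiet=True)  # assumption: RSLP rules may not be installed yet

sample = u'Sapatos vermelhos - promoção!'
print(pre_process(sample))
# expected to print roughly: 'sapato vermelho promoção '
# (lowercased, punctuation and digits stripped, plurals reduced,
#  one-character tokens dropped, trailing space kept)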