예제 #1
0
def data_preprocessing(text, stopwords=None):
    """Normalize a pandas Series of strings for text analysis.

    Each entry is lowercased, has its whitespace collapsed to single
    spaces, is stripped of punctuation and digit runs, and finally has its
    stop words removed.

    Args:
        text: pandas Series whose values are strings.
        stopwords: Optional iterable of words to drop. When omitted, the
            module-level ``stop`` collection is used, as before.

    Returns:
        A pandas Series of cleaned strings.
    """
    if stopwords is None:
        stopwords = stop
    # Build the lookup once so every membership test below is O(1).
    stopwords = frozenset(stopwords)

    # Lowercase every token and collapse whitespace runs. split() already
    # yields tokens without surrounding whitespace, so no extra strip pass
    # is required.
    text = text.apply(
        lambda doc: " ".join(token.lower() for token in doc.split()))

    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would leave punctuation and digits untouched.
    text = text.str.replace(r'[^\w\s]', '', regex=True)
    text = text.str.replace(r'\d+', '', regex=True)

    # Drop stop words; split()/join also re-normalizes the spacing left
    # behind by the removals above.
    return text.apply(
        lambda doc: ' '.join(
            word for word in doc.split() if word not in stopwords))
예제 #2
0
def remove_noise(text, stopwords=None):
    """Clean a pandas Series of strings by removing common textual noise.

    The steps run in order: lowercase, collapse whitespace, blank out
    characters outside printable ASCII (code points 32-126), remove
    punctuation, remove digit runs, remove stop words, and cast to ``str``.

    Args:
        text: pandas Series whose values are strings.
        stopwords: Optional iterable of words to drop. When omitted, the
            module-level ``stop`` collection is used, as before.

    Returns:
        A pandas Series of cleaned strings.
    """
    if stopwords is None:
        stopwords = stop
    # Build the lookup once so every membership test below is O(1).
    stopwords = frozenset(stopwords)

    # Make lowercase and collapse whitespace. split() already yields tokens
    # without surrounding whitespace, so no separate strip pass is required.
    text = text.apply(
        lambda doc: " ".join(token.lower() for token in doc.split()))

    # Replace control and non-ASCII characters with a space
    text = text.apply(lambda doc: "".join(
        " " if ord(char) < 32 or ord(char) > 126 else char for char in doc))

    # Remove punctuation. regex=True must be explicit: since pandas 2.0 the
    # default is a literal match, which would leave the text untouched.
    text = text.str.replace(r'[^\w\s]', '', regex=True)

    # Remove numbers
    text = text.str.replace(r'\d+', '', regex=True)

    # Remove stop words; split()/join also re-normalizes the spacing left
    # behind by the replacements above.
    text = text.apply(
        lambda doc: ' '.join(
            word for word in doc.split() if word not in stopwords))

    # Convert to string
    return text.astype(str)
def preprocess(data, column):
    """Turn one text column of ``data`` into a vectorized representation.

    The column is lowercased, passed through ``combine_words`` together with
    ``blm_dict``, then through ``lem``; the items ``lem`` returns are joined
    with single spaces and the resulting array of strings is handed to
    ``vectorize``.

    NOTE(review): ``combine_words``, ``blm_dict``, ``lem`` and ``vectorize``
    are defined outside this block; their exact contracts are not visible
    here -- confirm against their definitions.

    Args:
        data: Object indexable by ``column`` that yields a pandas Series of
            strings (presumably a DataFrame -- confirm against callers).
        column: Key of the text column to process.

    Returns:
        The two values produced by ``vectorize``, as a
        ``(vector, features)`` tuple.
    """
    # Each stage runs over the whole column before the next one starts.
    lowered = data[column].apply(str.lower)
    merged = lowered.apply(lambda doc: combine_words(doc, blm_dict))
    lemmas = merged.apply(lem)
    documents = lemmas.apply(' '.join).values

    vector, features = vectorize(documents)
    return vector, features
예제 #4
0
def sentiment_analyser(text):
    """Compute the TextBlob sentiment polarity of every entry in ``text``.

    Args:
        text: pandas Series of strings to score.

    Returns:
        The result of ``text.apply``. Because the applied function returns a
        one-element ``pd.Series`` per entry, pandas is expected to expand
        the result into a single-column DataFrame rather than a flat Series
        -- NOTE(review): confirm callers rely on that shape.
    """
    def _polarity(document):
        # Wrap the scalar polarity in a Series, exactly as callers receive
        # it today.
        score = TextBlob(document).sentiment.polarity
        return pd.Series(score)

    return text.apply(_polarity)