def data_preprocessing(text, stopwords=None):
    """Clean a pandas Series of strings for NLP work.

    Lowercases every token, collapses whitespace, strips punctuation and
    digits, and drops stopwords.

    Parameters
    ----------
    text : pd.Series of str
    stopwords : collection of str, optional
        Words to remove. Defaults to the module-level ``stop`` collection.

    Returns
    -------
    pd.Series of str
    """
    if stopwords is None:
        stopwords = stop  # module-level stopword list defined elsewhere in the file
    # Lowercase each token and rejoin with single spaces.
    text = text.apply(lambda x: " ".join(w.lower() for w in x.split()))
    # Re-split/rejoin normalizes any remaining whitespace runs.
    text = text.apply(lambda x: " ".join(w.strip() for w in x.split()))
    # regex=True is required: pandas >= 2.0 treats patterns literally by default,
    # which would make these calls silent no-ops.
    text = text.str.replace(r'[^\w\s]', '', regex=True)   # punctuation
    text = text.str.replace(r'\d+', '', regex=True)       # numbers
    # Drop stopwords.
    text = text.apply(
        lambda x: ' '.join(word for word in x.split() if word not in stopwords))
    return text
def remove_noise(text, stopwords=None):
    """Remove noise from a pandas Series of strings.

    Lowercases tokens, normalizes whitespace, blanks out non-printable and
    non-ASCII characters, strips punctuation and digits, removes stopwords,
    and coerces the result to string dtype.

    Parameters
    ----------
    text : pd.Series of str
    stopwords : collection of str, optional
        Words to remove. Defaults to the module-level ``stop`` collection.

    Returns
    -------
    pd.Series of str
    """
    if stopwords is None:
        stopwords = stop  # module-level stopword list defined elsewhere in the file
    # Make lowercase
    text = text.apply(lambda x: " ".join(w.lower() for w in x.split()))
    # Remove whitespaces
    text = text.apply(lambda x: " ".join(w.strip() for w in x.split()))
    # Remove special characters: anything outside printable ASCII becomes a space
    text = text.apply(
        lambda x: "".join(" " if ord(c) < 32 or ord(c) > 126 else c for c in x))
    # regex=True is required: pandas >= 2.0 treats patterns literally by default,
    # which would make these calls silent no-ops.
    text = text.str.replace(r'[^\w\s]', '', regex=True)   # punctuation
    text = text.str.replace(r'\d+', '', regex=True)       # numbers
    # Remove stopwords
    text = text.apply(
        lambda x: ' '.join(word for word in x.split() if word not in stopwords))
    # Convert to string dtype
    return text.astype(str)
def preprocess(data, column):
    """Vectorize one text column of *data*.

    Lowercases each document, merges multi-word terms via ``combine_words``
    with ``blm_dict``, lemmatizes with ``lem``, then joins tokens back into
    strings and feeds them to ``vectorize``.

    Returns the (vector, features) pair produced by ``vectorize``.
    """
    docs = data[column].apply(str.lower)
    docs = docs.apply(lambda doc: combine_words(doc, blm_dict))
    docs = docs.apply(lem)
    joined = docs.apply(' '.join)
    # vectorize expects a plain array of document strings
    return vectorize(joined.values)
def sentiment_analyser(text):
    """Score each entry of a Series with TextBlob sentiment polarity.

    Returns a DataFrame-shaped result: one ``pd.Series`` of the polarity
    value per input string, exactly as ``Series.apply`` expands it.
    """
    def _polarity(entry):
        return pd.Series(TextBlob(entry).sentiment.polarity)

    return text.apply(_polarity)