def preprocessing_two_class(tweet):
    # Map emoticon/escape strings to emoji tokens, then run the ekphrasis pipeline.
    tweet = ' '.join(emoji.str2emoji(tweet.split()))
    tweets = text_processor.pre_process_doc(tweet)
    tweets = emoji.str2emoji(tweets)
    # Lemmatize with the POS tag when it maps to a WordNet tag (adjective, noun, verb).
    tweets = [
        lemmatizer.lemmatize(word, grammar[0].lower())
        if grammar[0].lower() in ['a', 'n', 'v']
        else lemmatizer.lemmatize(word)
        for word, grammar in pos_tag(tweets)
    ]
    # Drop punctuation tokens and stopwords, then rebuild the string.
    tweets = [
        token for token in tweets
        if (token not in punctuation) and (token not in stopwords)
    ]
    return ' '.join(tweets)
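# `preprocessing_two_class` depends on two objects defined elsewhere in the
# project: `text_processor` (an ekphrasis TextPreProcessor whose
# `pre_process_doc` returns a token list) and `emoji`, a project-local helper
# whose `str2emoji` maps emoticon/escape strings in a token list to emoji
# tokens (not the PyPI `emoji` package). The configuration below is only a
# plausible sketch of how `text_processor` might be built; the original
# settings are not shown here.
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer

text_processor = TextPreProcessor(
    normalize=['url', 'email', 'user', 'number'],   # assumed normalization set
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=False,
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
)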
def standardization2(tweet):
    # Undo common JSON unicode escapes: \u2019 is a right single quote, \u002c a comma.
    tweet = re.sub(r"\\u2019", "'", tweet)
    tweet = re.sub(r"\\u002c", ",", tweet)
    # Strip standalone numbers and the retweet marker.
    tweet = re.sub(r" [0-9]+ ", " ", tweet)
    tweet = re.sub(r"RT ", "", tweet)
    tweets = T.tokenize(tweet)
    tweets = emoji.str2emoji(tweets)
    # Lemmatize with the POS tag when it maps to a WordNet tag.
    tweets = [
        lemmatizer.lemmatize(word, grammar[0].lower())
        if grammar[0].lower() in ['a', 'n', 'v']
        else lemmatizer.lemmatize(word)
        for word, grammar in pos_tag(tweets)
    ]
    # Remove punctuation tokens, stopwords, and long ellipsis-like tokens.
    tweets = [
        token for token in tweets
        if (token not in punctuation) and (token not in stopwords)
    ]
    tweets = list(filter(lambda x: x.count('.') < 4, tweets))
    return tweets
def standardization(tweet):
    # Undo common JSON unicode escapes (\u2019 right single quote, \u002c comma).
    tweet = re.sub(r"\\u2019", "'", tweet)
    tweet = re.sub(r"\\u002c", ",", tweet)
    # Transliterate to ASCII, lowercase, and map emoticon/escape strings to emoji tokens.
    tweet = ' '.join(emoji.str2emoji(unidecode(tweet).lower().split()))
    # Remove URLs.
    tweet = re.sub(r"(http|https)?:\/\/[a-zA-Z0-9\.-]+\.[a-zA-Z]{2,4}(/\S*)?", " ", tweet)
    # Expand common contractions.
    tweet = re.sub(r"\'ve", " have", tweet)
    tweet = re.sub(r" can\'t", " cannot", tweet)
    tweet = re.sub(r"n\'t", " not", tweet)
    tweet = re.sub(r"\'re", " are", tweet)
    tweet = re.sub(r"\'d", " would", tweet)
    tweet = re.sub(r"\'ll", " will", tweet)
    tweet = re.sub(r"\'s", "", tweet)
    tweet = re.sub(r"\'n", "", tweet)
    tweet = re.sub(r"\'m", " am", tweet)
    # Strip user mentions, hashtags, and standalone numbers.
    tweet = re.sub(r"@\w+", ' ', tweet)
    tweet = re.sub(r"#\w+", ' ', tweet)
    tweet = re.sub(r" [0-9]+ ", " ", tweet)
    # Tokenize, lemmatize with the WordNet POS tag where available,
    # then drop stopwords and punctuation tokens.
    tokens = [
        lemmatizer.lemmatize(word, tag[0].lower())
        if tag[0].lower() in ['a', 'n', 'v']
        else lemmatizer.lemmatize(word)
        for word, tag in pos_tag(tknzr.tokenize(tweet))
    ]
    tokens = [t for t in tokens if (t not in stopwords) and (t not in punctuation)]
    return ' '.join(tokens)
def standardization_teacher(tweet):
    # Undo common JSON unicode escapes (\u2019 right single quote, \u002c comma).
    tweet = re.sub(r"\\u2019", "'", tweet)
    tweet = re.sub(r"\\u002c", ",", tweet)
    tweet = emoji.str2emoji(tweet)
    # Remove URLs.
    tweet = re.sub(r"(http|https)?:\/\/[a-zA-Z0-9\.-]+\.[a-zA-Z]{2,4}(/\S*)?", " ", tweet)
    # Expand chat abbreviations and common contractions.
    tweet = re.sub(r"u r ", " you are ", tweet)
    tweet = re.sub(r"U r ", " you are ", tweet)
    tweet = re.sub(r" u(\s|$)", " you ", tweet)
    tweet = re.sub(r"didnt", "did not", tweet)
    tweet = re.sub(r"\'ve", " have", tweet)
    tweet = re.sub(r" can\'t", " cannot", tweet)
    tweet = re.sub(r"n\'t", " not", tweet)
    tweet = re.sub(r"\'re", " are", tweet)
    tweet = re.sub(r"\'d", " would", tweet)
    tweet = re.sub(r"\'ll", " will", tweet)
    tweet = re.sub(r"\'s", "", tweet)
    tweet = re.sub(r"\'n", "", tweet)
    tweet = re.sub(r"\'m", " am", tweet)
    # Strip user mentions, hashtags, and standalone numbers.
    tweet = re.sub(r"@\w+", ' ', tweet)
    tweet = re.sub(r"#\w+", ' ', tweet)
    tweet = re.sub(r" [0-9]+ ", " ", tweet)
    tweet = re.sub(r" plz(\s|$)", " please ", tweet)
    # Strip a leading dd.mm.yyyy / dd-mm-yyyy style date.
    tweet = re.sub(
        r"^([1-9] |1[0-9]| 2[0-9]|3[0-1])(.|-)([1-9] |1[0-2])(.|-|)20[0-9][0-9]",
        " ", tweet)
    # Tokenize, lemmatize with the WordNet POS tag where available,
    # then drop stopwords and punctuation tokens.
    tokens = [
        lemmatizer.lemmatize(word, tag[0].lower())
        if tag[0].lower() in ['a', 'n', 'v']
        else lemmatizer.lemmatize(word)
        for word, tag in pos_tag(tknzr.tokenize(tweet))
    ]
    tokens = [t for t in tokens if (t not in stopwords) and (t not in punctuation)]
    return ' '.join(tokens).lower()
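# Shared setup assumed by all four functions above: a WordNet lemmatizer, NLTK
# POS tagging, English stopwords, punctuation characters, and tweet-aware
# tokenizers. This is a minimal sketch, not the original configuration; in
# particular, `T` is assumed to be the same tokenizer as `tknzr`, and the NLTK
# data packages 'wordnet', 'stopwords', and 'averaged_perceptron_tagger' must
# already be downloaded.
import re
from string import punctuation
from unidecode import unidecode
from nltk import pos_tag
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

lemmatizer = WordNetLemmatizer()
stopwords = set(nltk_stopwords.words('english'))
tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
T = tknzr  # assumption: `standardization2` uses the same tweet tokenizer

# Illustrative usage on a raw tweet:
# sample = "RT @user: I can't believe it's 2019!! http://example.com #wow"
# print(standardization(sample))
# print(standardization_teacher(sample))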