/
preparation.py
36 lines (27 loc) · 1.09 KB
/
preparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# encoding: utf-8
import pandas as pd
from pymongo import MongoClient
import nltk
from nltk.stem import RSLPStemmer
from gensim.utils import simple_preprocess
#nltk.download()
# Create the client for the local MongoDB server and print the client object.
conn = MongoClient('mongodb://localhost:27017')
print(conn)

# Select the `twitter_clean` collection of the `tcc` database.
db = conn['tcc']
twitter_clean = db['twitter_clean']

# Query every document of the collection and load the results into a
# pandas DataFrame (one row per document).
to_pandas = twitter_clean.find({})
df = pd.DataFrame(list(to_pandas))

# Remove the `_id` column from the DataFrame in place.
df.drop(columns='_id', inplace=True)
def Stemming(sentence, stemmer=None):
    """Reduce every word of a tokenized sentence to its stem.

    Args:
        sentence: Iterable of word strings (one tokenized sentence).
        stemmer: Optional object exposing ``stem(word)``. When omitted, a new
            ``RSLPStemmer`` is created for this call. Pass a shared instance
            when stemming many sentences so it is built only once.

    Returns:
        A list holding the stem of each lower-cased word, in input order.
        An empty sentence yields an empty list.
    """
    if stemmer is None:
        stemmer = RSLPStemmer()
    return [stemmer.stem(word.lower()) for word in sentence]


# Tokenize words in sentences and keep in a new column
df['tokenized_text'] = [
    simple_preprocess(line, deacc=True) for line in df['tweet_text']
]

# Stem every tokenized sentence. The function is defined above so that it
# exists when this code runs, and the whole column is replaced in a single
# assignment instead of writing through chained indexing
# (df[col][idx] = ...), which pandas does not guarantee to update the frame.
stemmer = RSLPStemmer()  # one shared instance for all sentences
df['tokenized_text'] = [
    Stemming(tokens, stemmer) for tokens in df['tokenized_text']
]