def preprocessing(conversations):
    """Tokenize, stem, and strip stop words from each conversation.

    Each conversation string is reduced to ASCII, word-tokenized,
    scrubbed of the characters ``?!.,( )`` (note: includes space and
    parentheses), lower-cased, Snowball-stemmed, and filtered against an
    English stop-word list extended with common contraction fragments.

    Parameters
    ----------
    conversations : iterable of str
        Raw conversation texts.

    Returns
    -------
    list of list of str
        One list of processed tokens per input conversation.
    """
    removal = set("?!.,( )")
    # Hoisted out of the loops: the original rebuilt the stemmer for
    # every token and the stop-word set for every conversation, which
    # dominated runtime. Behavior is unchanged.
    stemmer = SnowballStemmer("english")
    stop_words = set(stopwords.words('english'))
    stop_words.update(("'s", "n't", "'m", "'ve", "'re", "'d", "'"))

    pred_conversations = []
    for conversation in conversations:
        # Drop any non-ASCII characters before tokenizing.
        ascii_text = ''.join(ch for ch in conversation if ord(ch) < 128)
        sentence = []
        for word in word_tokenize(ascii_text):
            # Strip the removal characters from the token.
            pred_word = ''.join(ch for ch in word if ch not in removal)
            if not pred_word:
                continue
            pred_word = stemmer.stem(pred_word.lower())
            if pred_word in stop_words:
                continue
            sentence.append(pred_word)
        pred_conversations.append(sentence)
    return pred_conversations
def morphy_stem(word):
    """Stem *word* with the English Snowball stemmer.

    Returns the lower-cased stem; if stemming yields an empty/falsy
    result, returns the lower-cased original word instead.
    """
    stemmed = SnowballStemmer('english').stem(word)
    return stemmed.lower() if stemmed else word.lower()
def get_data(name_train, name_test_closed, name_test_open):
    """Load train/test CSVs and normalize their 'text' columns in place.

    Both files are read with columns ['autor', 'gender', 'age', 'text'];
    each text is split on sentence enders, stripped of punctuation
    (except ``-`` and ``'``), lower-cased, Snowball-stemmed, and numeric
    tokens are replaced with the literal "NUMBER".

    Parameters
    ----------
    name_train : str
        Path to the training CSV.
    name_test_closed : str
        Accepted but never read.
        NOTE(review): only `name_test_open` is loaded as the test set —
        confirm whether the closed test file was meant to be used.
    name_test_open : str
        Path to the (open) test CSV.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The cleaned train and test frames.
    """
    csv_kwargs = dict(header=0, sep=',', quotechar='"',
                      names=['autor', 'gender', 'age', 'text'])
    train_set = pd.read_csv(name_train, **csv_kwargs)
    test_set = pd.read_csv(name_test_open, **csv_kwargs)

    # Every punctuation character except hyphen and apostrophe.
    punct_cleaned = set(punctuation.replace("-", "").replace("'", ""))
    # Hoisted: the original built a new SnowballStemmer per word per row.
    stemmer = SnowballStemmer('english')

    # One pass per frame; assigning the whole column avoids the pandas
    # chained-assignment pitfall of `df["text"].iloc[i] = ...`, which can
    # silently write to a temporary copy.
    for frame in (train_set, test_set):
        frame["text"] = [_normalize_text(text, punct_cleaned, stemmer)
                         for text in frame["text"]]
    return train_set, test_set


def _normalize_text(text, punct_chars, stemmer):
    """Clean one raw text: sentence-split, de-punctuate, stem, map numbers.

    Mirrors the original per-row loop exactly; extracted because the
    train and test loops were copy-pasted duplicates.
    """
    tokens = []
    for sentence in re.split('[.!?]', text):
        for word in sentence.split():
            # Single-character punctuation tokens pass through unchanged
            # (they are members of punct_chars); everything else is
            # stripped of punctuation characters.
            if word not in punct_chars:
                word = ''.join(ch for ch in word if ch not in punct_chars)
            word = stemmer.stem(word.lower())
            if word.isnumeric():
                word = "NUMBER"
            tokens.append(word)
    return ' '.join(tokens)