import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WhitespaceTokenizer

import contractions  # local helper module expected to provide expandContractions()


def clean_text(text):
    """Remove punctuation, capitalization, numbers and stop words, then stem words."""
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    text = text.lower()
    text = contractions.expandContractions(text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r'\W', ' ', text)   # remove punctuation
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace
    text = re.sub(r'\d+', ' ', text)  # remove numbers
    # letters repeated 3 or more times in a row are reduced to two
    text = re.sub(r'(.)\1\1+', r'\1\1', text)
    text = re.sub(r'(ha)\1\1+', r'haha', text)
    text = re.sub(r'(lo)\1\1+', r'lol', text)
    text = text.strip(' ')

    # remove stop words and stem the remaining tokens
    tokenizer = WhitespaceTokenizer()
    tokenized_comment = tokenizer.tokenize(text)
    filtered_sentence = [w for w in tokenized_comment if w not in stop_words]
    stemmed_comment = [ps.stem(word) for word in filtered_sentence]
    text = " ".join(stemmed_comment)
    return text
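
# A minimal usage sketch for clean_text(), assuming the NLTK 'stopwords'
# corpus has already been downloaded (e.g. nltk.download('stopwords')) and
# that the local contractions helper is importable. The sample sentence is
# illustrative only.
if __name__ == '__main__':
    sample = "I'm sooo haaappy, he can't believe it's 100% real!!!"
    print(clean_text(sample))
    # output is lowercase, stemmed, with stop words, digits and punctuation
    # removed, and repeated letters collapsed (e.g. "sooo" -> "soo")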
def general_clean_comment(self, comment):
    """Lower-case, expand contractions, strip digits and punctuation, and remove stop words."""
    comment = comment.lower()
    comment = expandContractions(comment)
    comment = self.split_integer_digit_string(comment)
    comment = self.remove_digits(comment)
    comment = self.remove_punctuation(comment)
    tokenized = word_tokenize(comment)
    stop_word_removed = [word for word in tokenized
                         if word not in self.general_stop_words]
    return ' '.join(stop_word_removed)
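
# The helpers used above (split_integer_digit_string, remove_digits,
# remove_punctuation, general_stop_words) and word_tokenize/expandContractions
# are not defined in this snippet. A purely illustrative sketch of how the
# enclosing class might provide them (word_tokenize needs the NLTK 'punkt'
# resource); the actual project code may differ.
import re
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


class CommentCleaner:
    def __init__(self):
        self.general_stop_words = set(stopwords.words('english'))

    def split_integer_digit_string(self, text):
        # insert a space between digits and letters, e.g. "2fast" -> "2 fast"
        text = re.sub(r'([a-z])(\d+)', r'\1 \2', text)
        return re.sub(r'(\d+)([a-z])', r'\1 \2', text)

    def remove_digits(self, text):
        return re.sub(r'\d+', ' ', text)

    def remove_punctuation(self, text):
        # replace every punctuation character with a space
        return text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))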
def classify(document):
    """
    Classify a document with the Hierarchical Attention Network (HAN).

    :param document: a document in text form
    :return: pre-processed tokenized document, class scores,
             attention weights for words, attention weights for sentences,
             sentence lengths
    """
    # A list to store the document tokenized into words
    doc = list()

    # Tokenize document into sentences
    sentences = list()
    for paragraph in preprocess(document).splitlines():
        sentences.extend([s for s in sent_tokenizer.tokenize(paragraph)])

    # Tokenize sentences into words
    for s in sentences[:sentence_limit]:
        s1 = expandContractions(s)
        s2 = ''.join([i for i in s1 if i.isalpha() or i.isspace()])
        wakati = mecab.parse(s2)
        w = word_tokenizer.tokenize(wakati)[:word_limit]
        if len(w) == 0:
            continue
        doc.append(w)

    # Number of sentences in the document
    sentences_in_doc = len(doc)
    sentences_in_doc = torch.LongTensor([sentences_in_doc]).to(device)  # (1)

    # Number of words in each sentence
    words_in_each_sentence = list(map(lambda s: len(s), doc))
    words_in_each_sentence = torch.LongTensor(words_in_each_sentence).unsqueeze(0).to(device)  # (1, n_sentences)

    # Encode document with indices from the word map, padding sentences and the document
    encoded_doc = list(
        map(lambda s: list(map(lambda w: word_map.get(w, word_map['<unk>']), s)) + [0] * (word_limit - len(s)),
            doc)) + [[0] * word_limit] * (sentence_limit - len(doc))
    encoded_doc = torch.LongTensor(encoded_doc).unsqueeze(0).to(device)

    # Apply the HAN model
    scores, word_alphas, sentence_alphas = model(encoded_doc, sentences_in_doc, words_in_each_sentence)
    # (1, n_classes), (1, n_sentences, max_sent_len_in_document), (1, n_sentences)

    scores = scores.squeeze(0)  # (n_classes)
    scores = nn.functional.softmax(scores, dim=0)  # (n_classes)
    word_alphas = word_alphas.squeeze(0)  # (n_sentences, max_sent_len_in_document)
    sentence_alphas = sentence_alphas.squeeze(0)  # (n_sentences)
    words_in_each_sentence = words_in_each_sentence.squeeze(0)  # (n_sentences)

    return doc, scores, word_alphas, sentence_alphas, words_in_each_sentence
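
# classify() relies on module-level objects that are not defined in this
# snippet: preprocess and expandContractions (text-cleaning helpers),
# sent_tokenizer, word_tokenizer, mecab, word_map, sentence_limit,
# word_limit, model and device. A rough sketch of one plausible setup,
# stated only as an assumption; file names and limits are hypothetical.
import json

import MeCab
import torch
from torch import nn
from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

sent_tokenizer = PunktSentenceTokenizer()   # sentence splitter
word_tokenizer = TreebankWordTokenizer()    # word splitter
mecab = MeCab.Tagger('-Owakati')            # whitespace-segmented MeCab output

sentence_limit = 15                         # hypothetical caps used during training
word_limit = 20

with open('word_map.json') as f:            # hypothetical vocabulary file
    word_map = json.load(f)

# hypothetical checkpoint layout with the trained HAN stored under 'model'
model = torch.load('han_checkpoint.pth.tar', map_location=device)['model'].to(device).eval()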
def process_text(text):
    """Normalise a post: replace hashtags, mentions and URLs with placeholder tokens,
    strip noise characters, expand contractions, remove stop words and lemmatise."""
    text = hashtags.sub(' hashtag', text)
    text = mentions.sub(' entity', text)
    text = urls.sub(' website', text)
    text = re.sub(r"[^A-Za-z0-9(),!.?\'\`]", " ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r":", " ", text)
    text = re.sub(r"-", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    text = re.sub(r"\(", " ( ", text)
    text = re.sub(r"\)", " ) ", text)
    text = re.sub(r"\?", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = text.split()
    text = [contractions.expandContractions(x) for x in text]
    text = sp(' '.join([word for word in text if word not in stop_words]))
    text = ' '.join([word.lemma_ for word in text])
    return text.strip().lower()
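
# process_text() assumes compiled regex patterns (hashtags, mentions, urls),
# a spaCy pipeline (sp) and a stop-word set (stop_words) at module level.
# One plausible setup, shown only as an assumption; the model name is hypothetical.
import re

import spacy
from nltk.corpus import stopwords

hashtags = re.compile(r'#\w+')                  # e.g. "#nlp"
mentions = re.compile(r'@\w+')                  # e.g. "@user"
urls = re.compile(r'https?://\S+|www\.\S+')     # bare links

sp = spacy.load('en_core_web_sm')               # any English spaCy pipeline with a lemmatizer
stop_words = set(stopwords.words('english'))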
# In[2]:

text = pd.read_csv('train.csv').drop(['Complaint-Status'], axis=1)
text = pd.concat([text, pd.read_csv('test.csv')], ignore_index=True)

'''text['word count'] = text['Consumer-complaint-summary'].apply(lambda x: len(str(x).split(' ')))
text['char_len'] = text['Consumer-complaint-summary'].apply(lambda x: len(str(x)))

def avg_word(sentence):
    words = sentence.split()
    return sum(len(word) for word in words) / len(words)

text['avg_word'] = text['Consumer-complaint-summary'].apply(lambda x: avg_word(str(x)))
word_features = text.iloc[:, -3:].values
np.savetxt('word_features.txt', word_features)'''

text['Consumer-complaint-summary'] = text['Consumer-complaint-summary'].apply(lambda x: expandContractions(x))
text['Consumer-complaint-summary'] = text['Consumer-complaint-summary'].apply(
    lambda x: re.sub(r'[~`!@#$%^&*():;"{}_/?><\|.,`0-9]', '', x.replace('-', ' ')))
#text['Consumer-complaint-summary'] = text['Consumer-complaint-summary'].apply(lambda x: unidecode.unidecode(x))
text = text['Consumer-complaint-summary'].iloc[:].values


# In[3]:

# detecting the corresponding languages of summary
"""!pip install langdetect
from langdetect import detect

languages = []