def text_preprocessing(sentences: list[str]):
    """Clean and lemmatize a batch of sentences.

    Pipeline (via ``tc.document``): remove numbers, stopwords and symbols,
    lower-case everything, then lemmatize with ``new_lemmatization``.

    Args:
        sentences: raw input sentences.

    Returns:
        Whatever ``new_lemmatization`` returns for the cleaned input
        (presumably a list of lemmatized sentences — TODO confirm).
    """
    # FIX: the annotation was ``[str]`` (a literal list object), which is not
    # a valid type hint; ``list[str]`` is the intended generic.
    cleaned = list(
        tc.document(sentences)
        .remove_numbers()
        .remove_stpwrds()
        .remove_symbols()
        .lower_all()
    )
    return new_lemmatization(sentences=cleaned)
def strip_text(text):
    """Normalize a raw (social-media style) message into a cleaned string.

    Removes URLs, "…"-truncated words, ASCII smileys, kaomoji, emoji
    (via ``strip_emoji``), bracketed numbers, parenthesized fragments,
    mentions, possessive ``'s``, stray punctuation/digits, English
    stopwords and single-character words, then lemmatizes each word with
    ``tc.document(...).lemming()``.

    Args:
        text: raw input string.

    Returns:
        The cleaned, lower-cased, space-separated string.
    """
    smileys = """:-) :) :o) :] :3 :c) :> =] 8) =) :} :^) :D 8-D 8D x-D xD X-D XD =-D =D =-3 =3 B^D""".split()
    smiley_pattern = "|".join(map(re.escape, smileys))
    # Japanese-style emoticons (kaomoji). Raw strings avoid the
    # invalid-escape-sequence warnings the original non-raw '\(' / '\)' emit.
    kaomojis = (r'[^0-9A-Za-zぁ-んァ-ン一-龥ovっつ゜ニノ三二]'
                + r'[\(∩ (]'
                + r'[^0-9A-Za-zぁ-んァ-ン一-龥ヲ-゚\)∩ )]'
                + r'[\)∩ )]'
                + r'[^0-9A-Za-zぁ-んァ-ン一-龥ovっつ゜ニノ三二]*')
    text = text.lower()
    # Remove links.
    text = re.sub(r'((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', '', text)
    # Remove words truncated with an ellipsis ("ss…").
    text = ' '.join(word for word in text.split() if not word.endswith("…"))
    text = re.sub(smiley_pattern, "", text)
    text = re.sub(kaomojis, "", text)
    text = strip_emoji(text)
    # Replace numbers wrapped in brackets/slashes/dots with a space.
    # BUGFIX: the dash is now escaped; the original class contained ".-\]",
    # i.e. the character range 0x2E-0x5D, which also swallowed digits,
    # uppercase letters and more.
    text = re.sub(r'[\]\[\(\) /\.\-][0-9]+[ \)\(/\.\-]', ' ', text)
    # Drop fully parenthesized fragments, e.g. "(vishal)" -> "".
    text = re.sub(r'\([^()]*\)', '', text)
    # BUGFIX: escape the dash. The original "[.,-_]" contained the range
    # ",-_" (0x2C-0x5F), which also matched digits, ":;<=>?@", uppercase
    # letters and brackets instead of just ". , - _".
    text = re.sub(r'[.,\-_]', ' ', text)
    # Remove @mentions.
    text = re.sub(r'@\w+ ?', ' ', text)
    # "vishal's" -> "vishal".
    text = text.replace("'s", "")
    # Collapse remaining non-word characters: "vishal123@@@" -> "vishal123".
    text = re.sub(r'\W+', ' ', text)
    # Remove free-standing numbers.
    text = re.sub(r'[ ][0-9]+ ', '', text)
    # Remove English stopwords. PERF: build the set once — the original
    # re-read the whole corpus with stopwords.words('english') per word.
    stop_set = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_set)
    # Lemmatize each remaining word.
    text = ' '.join(word for word in tc.document(text).lemming().data)
    # Drop single-character words ("a" -> "").
    text = ' '.join(w for w in text.split() if len(w) > 1)
    return text
def btn_click(self, value):
    """Qt slot for the main "process" button.

    Applies every checked cleaning option from the ``gb_detail`` group box to
    the raw text, then always builds the "auto details" panel: most common
    five words, most common five verbs, and Stanford-NER named entities.

    NOTE(review): the source was recovered from a flattened file; the
    indentation of the post-loop section (from the ``if ... == ''`` guard
    down) is reconstructed and should be confirmed against the original.

    Args:
        value: signal payload from Qt (unused here).
    """
    # Reset all three output panes before re-processing.
    self.ui.txt_cleantext.setText('')
    self.ui.txt_autodetails.setText('')
    self.ui.txt_manualdetails.setText('')
    # All checkboxes from the detail group box.
    checkboxes = self.ui.gb_detail.findChildren(QtWidgets.QCheckBox)
    self.ui.txt_autodetails.show()
    self.ui.txt_cleantext.show()
    # NOTE(review): result is discarded — presumably a warm-up/pre-load of
    # the NLTK stopwords corpus; confirm it is intentional.
    stopwords.words('english')
    # Assigned all of the checkboxes in the group box to items.
    text = str(self.ui.txt_rawtext.toPlainText())
    for cb in checkboxes:
        if cb.isChecked():
            # Dispatch on the checkbox label; each branch overwrites
            # txt_cleantext (so later checked options win).
            if cb.text() == "Lower Case":
                self.ui.txt_cleantext.setText(text.lower())
            elif cb.text() == "Upper Case":
                self.ui.txt_cleantext.setText(text.upper())
            elif cb.text() == "Extra Space Remover":
                self.ui.txt_cleantext.setText(" ".join(text.split()))
            elif cb.text() == "Remove Punctuations":
                self.ui.txt_cleantext.setText(
                    text.translate(
                        str.maketrans('', '', string.punctuation)))
            elif cb.text() == "Number Remover":
                self.ui.txt_cleantext.setText(
                    text.translate(str.maketrans('', '', string.digits)))
            elif cb.text() == "Stop Words Remover":
                # Delegate stopword removal to the tc document helper.
                data = tc.document(text)
                a = data.remove_stpwrds()
                self.ui.txt_cleantext.setText(str(a))
            elif cb.text() == "Take Dates":
                # Extract every date-like substring and append them to the
                # manual-details pane under a "Dates" header.
                self.ui.txt_manualdetails.show()
                dates = self.ui.txt_rawtext.toPlainText()
                matches = datefinder.find_dates(dates)
                dates_times = ""
                for match in matches:
                    dates_times = dates_times + str(match) + "\n"
                if self.ui.txt_manualdetails.toPlainText() != "":
                    self.ui.txt_manualdetails.setText(
                        self.ui.txt_manualdetails.toPlainText() + "\n" +
                        "Dates" + "\n" + dates_times)
                else:
                    self.ui.txt_manualdetails.setText("Dates" + "\n" + dates_times)
            elif cb.text() == "Emotion Analysis":
                # VADER sentiment on the raw text; bucket the compound
                # score into a five-level label.
                self.ui.txt_manualdetails.show()
                sid = SentimentIntensityAnalyzer()
                message_text = self.ui.txt_rawtext.toPlainText()
                scores = sid.polarity_scores(message_text)
                emotion = (scores.get('compound'))
                state = ''
                # NOTE(review): '&' is bitwise-and on bools — works here
                # because each side is parenthesized, but 'and' is the
                # idiomatic operator.
                if (emotion < (-0.1)) & (emotion >= (-0.5)):
                    state = 'Negative'
                elif (emotion < (-0.5)):
                    state = 'So negative'
                elif (emotion >= (-0.1)) & (emotion <= (0.1)):
                    state = 'Neutral'
                elif (emotion > (0.1)) & (emotion < (0.5)):
                    state = 'Positive'
                elif (emotion >= (0.5)):
                    state = 'So positive'
                self.ui.txt_manualdetails.setText(
                    self.ui.txt_manualdetails.toPlainText() + "\n" +
                    "Emotion \n" + state)
            elif cb.text() == "Topic Analysis":
                self.ui.txt_manualdetails.show()
                take_subject(self)
    # If no cleaning option produced output, fall back to the raw text.
    if self.ui.txt_cleantext.toPlainText() == '':
        self.ui.txt_cleantext.setText(text)
    text = self.ui.txt_cleantext.toPlainText()
    # Most common 5 words.
    self.ui.txt_cleantext.setText(text)
    b = self.ui.txt_rawtext.toPlainText()
    b = b.lower()
    data = tc.document(b)
    none_stop = data.remove_stpwrds()
    none_stop = str(none_stop)
    clean = (none_stop.translate(str.maketrans('', '', string.punctuation)))
    clean = tc.document(clean)
    # Frequency table of the five most common words (stopwords and
    # punctuation already stripped above).
    most_common_five = pd.Series(
        " ".join(clean).split()).value_counts().nlargest(5)
    most_common_five_vector = most_common_five[0:len(most_common_five)]
    df = pd.DataFrame(most_common_five, columns=[""])
    self.ui.txt_autodetails.setText("Most Common Five Words" + str(df) + "\n")
    lines = self.ui.txt_rawtext.toPlainText()
    sentences = nltk.sent_tokenize(lines)
    verbs = []  # empty array to hold all verbs
    # Collect every token whose POS tag is a verb form (VB/VBD/VBG/VBN/VBZ/VBP).
    for sentence in sentences:
        for verb, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
            if (pos == 'VB' or pos == 'VBD' or pos == 'VBG'
                    or pos == 'VBN' or pos == 'VBZ' or pos == 'VBP'):
                verbs.append(verb)
    most_common_five_verb = (pd.Series(
        " ".join(verbs).split()).value_counts().nlargest(5))
    most_common_five_verb_vector = most_common_five_verb[
        0:len(most_common_five_verb)]
    df_verb = pd.DataFrame(most_common_five_verb, columns=[""])
    text_verb = self.ui.txt_autodetails.toPlainText()
    text_verb = text_verb + '\n' + "Most Common Five Verbs" + str(
        df_verb) + "\n"
    # Add the jar and model via their path (instead of setting environment variables):
    # NOTE(review): machine-specific absolute Windows paths — the NER step
    # only works on this one machine; consider making these configurable.
    jar = 'C:/Users/zeyne/OneDrive/Masaüstü/stanford-ner-2018-10-16/stanford-ner.jar'
    model = 'C:/Users/zeyne/OneDrive/Masaüstü/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz'
    java_path = "C:/Program Files/Java/jre1.8.0_251/bin/java.exe"
    os.environ['JAVAHOME'] = java_path
    st = StanfordNERTagger(model_filename=model,
                           path_to_jar=jar,
                           encoding='utf-8')
    text = self.ui.txt_rawtext.toPlainText()
    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)
    s = ''
    # Group consecutive tokens that share a NER tag; skip the "O"
    # (outside-any-entity) groups.
    for tag, chunk in groupby(classified_text, lambda x: x[1]):
        if tag != "O":
            # olcak is a (padded-tag, joined-words) tuple rendered via str().
            olcak = "%-12s" % tag, " ".join(w for w, t in chunk)
            s = s + '\n' + str(olcak)
    a = (str(s).translate(str.maketrans('', '', string.punctuation)))
    text_verb = text_verb + '\n' + "Persons/Locations/Entities" + a
    self.ui.txt_autodetails.setText(text_verb)