import re

import textcleaner as tc
from nltk.corpus import stopwords


def text_preprocessing(sentences: list[str]):
    # Chain textcleaner operations: drop numbers, stopwords and symbols, then lowercase.
    input_text = list(
        tc.document(sentences).remove_numbers().remove_stpwrds()
        .remove_symbols().lower_all())
    # new_lemmatization is a project-specific helper; a sketch follows below.
    lema = new_lemmatization(sentences=input_text)
    return lema
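
# The new_lemmatization helper is not defined in this snippet. Below is a
# minimal, hypothetical sketch assuming NLTK's WordNetLemmatizer; the helper
# name and signature come from the call above, the body is an assumption.
from nltk.stem import WordNetLemmatizer

def new_lemmatization(sentences):
    # Lemmatize each token of each sentence; requires the 'wordnet' corpus.
    lemmatizer = WordNetLemmatizer()
    return [' '.join(lemmatizer.lemmatize(word) for word in sentence.split())
            for sentence in sentences]
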
def strip_text(text):
    smileys = """:-) :) :o) :] :3 :c) :> =] 8) =) :} :^)
                :D 8-D 8D x-D xD X-D XD =-D =D =-3 =3 B^D""".split()
    pattern = "|".join(map(re.escape, smileys))
    # Rough kaomoji pattern: a non-alphanumeric "face" wrapped in bracket-like characters.
    kaomojis = (r'[^0-9A-Za-zぁ-んァ-ン一-龥ovっつ゜ニノ三二]'
                r'[\(∩ (]'
                r'[^0-9A-Za-zぁ-んァ-ン一-龥ヲ-゚\)∩ )]'
                r'[\)∩ )]'
                r'[^0-9A-Za-zぁ-んァ-ン一-龥ovっつ゜ニノ三二]*')
    text = text.lower()
    text = re.sub(r'((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', '', text)  # remove links
    text = ' '.join(word for word in text.split() if not word.endswith("…"))  # drop truncated words like "ss…"
    text = re.sub(pattern, "", text)  # remove ASCII smileys
    text = re.sub(kaomojis, "", text)  # remove kaomojis
    text = strip_emoji(text)  # helper defined elsewhere; a sketch follows below
    text = re.sub(r'[()/. -][0-9]+[()/. -]', ' ', text)  # remove delimited numbers, e.g. "(123)"
    text = re.sub(r'\([^()]*\)', '', text)  # remove parenthesized text, e.g. "(vishal)"
    text = re.sub(r'[.,_-]', ' ', text)  # turn . , _ - into spaces
    text = re.sub(r'@\w+ ?', ' ', text)  # remove @mentions
    text = text.replace("'s", "")  # vishal's -> vishal
    text = re.sub(r'\W+', ' ', text)  # collapse non-word characters: vishal123@@@ -> vishal123
    text = re.sub(r' [0-9]+ ', ' ', text)  # remove standalone numbers
    stop_words = set(stopwords.words('english'))  # build the set once instead of per word
    text = ' '.join(word for word in text.split() if word not in stop_words)  # remove stopwords
    text = ' '.join(word for word in tc.document(text).lemming().data)  # lemmatize via textcleaner
    text = ' '.join(w for w in text.split() if len(w) > 1)  # drop single-character tokens
    return text
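
# strip_text calls strip_emoji, which is not shown in this snippet. A minimal
# sketch, assuming emoji should simply be deleted; the codepoint ranges below
# cover the main emoji blocks and are only an approximation.
EMOJI_PATTERN = re.compile(
    '[\U0001F000-\U0001FAFF\U00002600-\U000027BF\U0000FE00-\U0000FE0F]+')

def strip_emoji(text):
    return EMOJI_PATTERN.sub('', text)

# Example (the exact output also depends on textcleaner's lemmatizer):
# strip_text("Check https://t.co/x :) thanks @user (123)") should drop the
# link, the smiley, the mention and the bracketed number.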
Example #3
    def btn_click(self, value):
        self.ui.txt_cleantext.setText('')
        self.ui.txt_autodetails.setText('')
        self.ui.txt_manualdetails.setText('')
        # Collect every checkbox in the detail group box.
        checkboxes = self.ui.gb_detail.findChildren(QtWidgets.QCheckBox)
        self.ui.txt_autodetails.show()
        self.ui.txt_cleantext.show()
        text = str(self.ui.txt_rawtext.toPlainText())

        for cb in checkboxes:
            if cb.isChecked():
                if cb.text() == "Lower Case":
                    self.ui.txt_cleantext.setText(text.lower())
                elif cb.text() == "Upper Case":
                    self.ui.txt_cleantext.setText(text.upper())
                elif cb.text() == "Extra Space Remover":
                    self.ui.txt_cleantext.setText(" ".join(text.split()))
                elif cb.text() == "Remove Punctuations":
                    self.ui.txt_cleantext.setText(
                        text.translate(
                            str.maketrans('', '', string.punctuation)))
                elif cb.text() == "Number Remover":
                    self.ui.txt_cleantext.setText(
                        text.translate(str.maketrans('', '', string.digits)))
                elif cb.text() == "Stop Words Remover":
                    data = tc.document(text)
                    a = data.remove_stpwrds()
                    self.ui.txt_cleantext.setText(str(a))
                elif cb.text() == "Take Dates":
                    self.ui.txt_manualdetails.show()
                    dates = self.ui.txt_rawtext.toPlainText()
                    matches = datefinder.find_dates(dates)
                    dates_times = ""
                    for match in matches:
                        dates_times = dates_times + str(match) + "\n"
                    if self.ui.txt_manualdetails.toPlainText() != "":
                        self.ui.txt_manualdetails.setText(
                            self.ui.txt_manualdetails.toPlainText() + "\n" +
                            "Dates" + "\n" + dates_times)
                    else:
                        self.ui.txt_manualdetails.setText("Dates" + "\n" +
                                                          dates_times)
                elif cb.text() == "Emotion Analysis":
                    self.ui.txt_manualdetails.show()
                    sid = SentimentIntensityAnalyzer()
                    message_text = self.ui.txt_rawtext.toPlainText()
                    scores = sid.polarity_scores(message_text)
                    emotion = scores.get('compound')

                    # Bucket the compound score into five sentiment labels.
                    state = ''
                    if -0.5 <= emotion < -0.1:
                        state = 'Negative'
                    elif emotion < -0.5:
                        state = 'Very negative'
                    elif -0.1 <= emotion <= 0.1:
                        state = 'Neutral'
                    elif 0.1 < emotion < 0.5:
                        state = 'Positive'
                    elif emotion >= 0.5:
                        state = 'Very positive'
                    self.ui.txt_manualdetails.setText(
                        self.ui.txt_manualdetails.toPlainText() + "\n" +
                        "Emotion \n" + state)
                elif cb.text() == "Topic Analysis":
                    self.ui.txt_manualdetails.show()
                    take_subject(self)

        if self.ui.txt_cleantext.toPlainText() == '':
            self.ui.txt_cleantext.setText(text)

        # Most common five words
        b = self.ui.txt_rawtext.toPlainText().lower()
        data = tc.document(b)
        none_stop = str(data.remove_stpwrds())
        clean = none_stop.translate(str.maketrans('', '', string.punctuation))
        most_common_five = pd.Series(clean.split()).value_counts().nlargest(5)
        df = pd.DataFrame(most_common_five, columns=[""])
        self.ui.txt_autodetails.setText("Most Common Five Words\n" + str(df) +
                                        "\n")

        lines = self.ui.txt_rawtext.toPlainText()
        sentences = nltk.sent_tokenize(lines)
        verbs = []  # empty list to hold all verbs

        for sentence in sentences:
            for verb, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
                # Keep any verb form (base, past, gerund, participle, present).
                if pos in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'):
                    verbs.append(verb)
        most_common_five_verb = pd.Series(verbs).value_counts().nlargest(5)
        df_verb = pd.DataFrame(most_common_five_verb, columns=[""])
        text_verb = self.ui.txt_autodetails.toPlainText()
        text_verb = text_verb + '\n' + "Most Common Five Verbs\n" + str(
            df_verb) + "\n"

        # Add the jar and model via their path (instead of setting environment variables):
        jar = 'C:/Users/zeyne/OneDrive/Masaüstü/stanford-ner-2018-10-16/stanford-ner.jar'
        model = 'C:/Users/zeyne/OneDrive/Masaüstü/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz'

        java_path = "C:/Program Files/Java/jre1.8.0_251/bin/java.exe"
        os.environ['JAVAHOME'] = java_path
        st = StanfordNERTagger(model_filename=model,
                               path_to_jar=jar,
                               encoding='utf-8')
        text = self.ui.txt_rawtext.toPlainText()
        tokenized_text = word_tokenize(text)
        classified_text = st.tag(tokenized_text)
        s = ''
        # Group consecutive tokens that share the same NER tag and skip "O" (other).
        for tag, chunk in groupby(classified_text, lambda x: x[1]):
            if tag != "O":
                s = s + '\n' + "%-12s %s" % (tag, " ".join(w for w, t in chunk))
        text_verb = text_verb + '\n' + "Persons/Locations/Entities" + s
        self.ui.txt_autodetails.setText(text_verb)
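
# Minimal standalone sketch of the same VADER bucketing used in btn_click,
# outside the GUI. Assumes the 'vader_lexicon' resource has been downloaded
# via nltk.download('vader_lexicon'); the function name is hypothetical.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def sentiment_label(text):
    compound = SentimentIntensityAnalyzer().polarity_scores(text)['compound']
    if compound < -0.5:
        return 'Very negative'
    if compound < -0.1:
        return 'Negative'
    if compound <= 0.1:
        return 'Neutral'
    if compound < 0.5:
        return 'Positive'
    return 'Very positive'

# Example: sentiment_label("I love this!") is expected to return 'Very positive'.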