Пример #1
0
    def test_process_thai_sparse(self):
        """Check the tokens ``process_thai`` yields when called with defaults."""
        sample = "👍👍👍 #AnA มากกกก น้อยน้อย ().1146"

        # Stage-by-stage expectation for the default rules:
        #
        # 1. after pre_rules_th_sparse
        #    >>> "👍👍👍 # Ana มาก xxrep  น้อยน้อย .1146"
        #
        # 2. after tokenizing with word_tokenize(engine="newmm")
        #    >>> ["👍👍👍", " ", "#", " ", "Ana", " ", "มาก", "xxrep",
        #         "  ", "น้อย", "น้อย", " ", ".", "1146"]
        #
        # 3. after post_rules_th, which also drops whitespace tokens (" ")
        #    >>> ["xxwrep", "👍", "#", "ana", "มาก",
        #         "xxrep", "xxwrep", "น้อย", ".", "1146"]
        expected = [
            "xxwrep", "👍", "#", "ana", "มาก",
            "xxrep", "xxwrep", "น้อย", ".", "1146",
        ]

        tokens = process_thai(sample)

        self.assertEqual(tokens, expected)
Пример #2
0
def predictbyid():
    """Serve the "predict by user ID" page.

    Any request other than POST renders the input form
    (``predictById.html``).  A POST reads ``userID`` from the submitted form
    and fetches that user's tweets with ``getTweetFromUser``.  When no rows
    come back, ``output.html`` is rendered with ``length=0``.  Otherwise the
    model input is assembled (scaled word counts followed by the densified
    output of ``tfidf_fit.transform``) and ``output.html`` is rendered with
    whatever ``predict`` returns, plus its length.
    """
    if request.method != 'POST':
        return render_template('predictById.html')

    # The user id comes from the form's text box.
    user_id = request.form['userID']
    tweets = getTweetFromUser(user_id)
    if tweets.shape[0] == 0:
        return render_template('output.html', length=0)

    def joined_tokens(raw):
        # Tokens are stored as one "|"-separated string per tweet.
        return "|".join(process_thai(raw))

    def token_count(joined):
        return len(joined.split("|"))

    def distinct_token_count(joined):
        return len(set(joined.split("|")))

    tweets["processed"] = tweets["Tweets"].map(joined_tokens)
    tweets["wc"] = tweets["processed"].map(token_count)
    tweets["uwc"] = tweets["processed"].map(distinct_token_count)

    # NOTE(review): the vectorizer is given the raw "Tweets" column, not
    # "processed" -- presumably it tokenizes internally; confirm.
    text_features = tfidf_fit.transform(tweets["Tweets"])
    count_columns = tweets[["wc", "uwc"]].astype(float)
    count_features = scaler_fit.transform(count_columns)
    model_input = np.concatenate(
        [count_features, text_features.toarray()], axis=1)

    # Predict and hand the result to the template.
    predictions = predict(model_input, tweets)
    return render_template('output.html',
                           output_result=predictions,
                           length=len(predictions))
Пример #3
0
    def test_process_thai_2(self):
        """Check ``process_thai`` with the rules for dense features."""
        sample = "👍👍👍 #AnA มากกกก น้อยน้อย ().1146"

        # Stage-by-stage expectation for the dense-feature rules:
        #
        # 1. after pre_rules_th
        #    >>> "👍👍👍 # Ana มากxxrep4 น้อยน้อย .1146"
        #
        # 2. after tokenizing with word_tokenize(engine="newmm")
        #    >>> ["👍👍👍", " ", "#", " ", "Ana", " ", "มาก", "xxrep", "4",
        #         " ", "น้อย", "น้อย", " ", ".", "1146"]
        #
        # 3. after post_rules_th
        #    `replace_wrep_post` runs before `ungroup_emoji`, so the three
        #    repeated emoji are NOT collapsed into the special "xxwrep num"
        #    marker; they come out as three separate tokens.
        expected = [
            "👍", "👍", "👍", " ", "#", " ", "ana", " ",
            "มาก", "xxrep", "4", " ",
            "xxwrep", "1", "น้อย", " ", ".", "1146",
        ]

        dense_options = {
            "pre_rules": pre_rules_th,
            "post_rules": post_rules_th,
            "tok_func": _pythainlp_tokenizer.word_tokenize,
        }
        tokens = process_thai(sample, **dense_options)

        self.assertEqual(tokens, expected)
Пример #4
0
def predictbysentence():
    """Serve the "predict by sentence" page.

    Any request other than POST renders the input form
    (``predictBySentence.html``).  A POST reads ``texts`` from the submitted
    form, cleans it with ``cleanText`` and classifies the single sentence.

    Returns:
        The rendered ``outputBySentence.html``: with ``length=0`` when
        nothing is left after cleaning, otherwise with ``result``
        ("Negative", "Positive", "Neutral", or "" for an unrecognised label)
        and ``text`` (the cleaned sentence).
    """
    if request.method != 'POST':
        return render_template('predictBySentence.html')

    # Sentence from the text box, cleaned before any processing.
    texts = cleanText(request.form['texts'])

    # A one-row DataFrame is never empty, so emptiness has to be checked on
    # the cleaned text itself (assumes cleanText returns a str -- confirm).
    if not texts:
        return render_template('outputBySentence.html', length=0)

    posts = pd.DataFrame({"texts": [texts]})

    # Tokens are stored as one "|"-separated string, then counted.
    posts["processed"] = posts["texts"].map(
        lambda x: "|".join(process_thai(x)))
    posts["wc"] = posts["processed"].map(lambda x: len(x.split("|")))
    posts["uwc"] = posts["processed"].map(lambda x: len(set(x.split("|"))))

    # NOTE(review): the vectorizer is given the cleaned text, not the
    # "processed" column -- presumably it tokenizes internally; confirm.
    tf_input = tfidf_fit.transform(posts["texts"])
    num_input = scaler_fit.transform(posts[["wc", "uwc"]].astype(float))
    t_input = np.concatenate([num_input, tf_input.toarray()], axis=1)

    # Exactly one row was submitted, so take its single predicted label
    # rather than comparing the whole prediction array to a string.
    label = model.predict(t_input)[0]

    # Map the model's label to the word shown to the user.
    display_names = {
        "neg": "Negative",
        "pos": "Positive",
        "neu": "Neutral",
    }
    result = display_names.get(label, "")

    return render_template('outputBySentence.html',
                           result=result,
                           text=texts)