示例#1
0
 def test_purified_file(self):
     """Check that the dataset loaded with ``raw=False`` is already purified.

     A ``TextPurifier`` is built over the ``message`` column and every
     entry produced by ``show_iter()`` must carry an empty match list.
     """
     purified_df = get_sms_dataset(raw=False)
     assert purified_df is not None

     purifier = TextPurifier(texts=purified_df['message'])
     for _rule_name, hits in purifier.show_iter():
         # Printed so a failing run shows which matches were left over.
         print(hits)
         assert hits == []
示例#2
0
def kf_test_with_datas(begin,
                       end,
                       test_num,
                       name_scoring="neg_mean_squared_error"):
    """Run ``kf_test_and_draw`` on the vectorized training split of the SMS data.

    The dataset is loaded with ``noStopwords=True, overwrite=True``, split
    80/20 with ``random_state=0``, and only the training part is turned into
    token counts and evaluated.

    Args:
        begin, end, test_num: forwarded unchanged to ``kf_test_and_draw``.
        name_scoring: scoring name forwarded to ``kf_test_and_draw``.
    """
    sms_df = get_sms_dataset(noStopwords=True, overwrite=True)
    messages = sms_df['message']
    labels = sms_df['target']

    # The held-out 20% is not used by this helper, so it is discarded.
    train_messages, _, train_labels, _ = train_test_split(messages,
                                                          labels,
                                                          random_state=0,
                                                          test_size=0.2)

    vectorizer = CountVectorizer()
    train_counts = vectorizer.fit_transform(train_messages)
    kf_test_and_draw(train_counts, train_labels, begin, end, test_num,
                     name_scoring)
示例#3
0
def auto_test_for_other_model(model, param_grid, name_scoring):
    """Search ``param_grid`` for ``model`` on the vectorized SMS training split.

    Args:
        model: estimator handed to ``gsc``.
        param_grid: parameter grid handed to ``gsc``.
        name_scoring: value passed as ``scoring`` to ``gsc``.

    Returns:
        The pair ``(best_params_, best_score_)`` of the fitted search object.
    """
    sms_df = get_sms_dataset(noStopwords=True, overwrite=True)
    messages = sms_df['message']
    labels = sms_df['target']

    # Only the 80% training part takes part in the search.
    train_messages, _, train_labels, _ = train_test_split(messages,
                                                          labels,
                                                          random_state=0,
                                                          test_size=0.2)

    vectorizer = CountVectorizer()
    train_counts = vectorizer.fit_transform(train_messages)

    # NOTE(review): ``gsc`` is presumably an alias of GridSearchCV - confirm
    # against the file's imports.
    search = gsc(model, param_grid, scoring=name_scoring)
    search.fit(train_counts, train_labels)

    print(search.best_params_, search.best_score_)
    return search.best_params_, search.best_score_
示例#4
0
def kf_test_with_datas_and_draw_alot(begin, end, test_num):
    """Run ``kf_test_and_draw`` once per scoring name on the SMS training split.

    The scorings are evaluated in the order accuracy, precision, f1, recall,
    all on the same vectorized 80% training part of the dataset.

    Args:
        begin, end, test_num: forwarded unchanged to ``kf_test_and_draw``.
    """
    sms_df = get_sms_dataset(noStopwords=True, overwrite=True)
    messages = sms_df['message']
    labels = sms_df['target']

    train_messages, _, train_labels, _ = train_test_split(messages,
                                                          labels,
                                                          random_state=0,
                                                          test_size=0.2)

    vectorizer = CountVectorizer()
    train_counts = vectorizer.fit_transform(train_messages)

    for scoring_name in ('accuracy', 'precision', 'f1', 'recall'):
        kf_test_and_draw(train_counts, train_labels, begin, end, test_num,
                         scoring_name)
示例#5
0
    def __init__(self,
                 sequence_length,
                 transform,
                 cate_type,
                 file_path='data/spam.csv'):
        """Load the SMS data, apply ``transform`` and index the word stream.

        Args:
            sequence_length: stored as ``self.sequence_length``.
            transform: callable invoked as ``transform(texts, targets)``; it
                must return four values, stored as ``texts``, ``targets``,
                ``word_dict`` and ``index_dict``.
            cate_type: passed to ``get_sms_dataset`` as ``type``.
            file_path: passed to ``get_sms_dataset`` as ``SMS_DATASET``.
        """
        frame = get_sms_dataset(SMS_DATASET=file_path,
                                type=cate_type,
                                noStopwords=False)
        texts = frame.message.to_list()
        targets = frame.target.to_list()

        transformed = transform(texts, targets)
        self.texts, self.targets, self.word_dict, self.index_dict = transformed

        self.sequence_length = sequence_length

        # The word stream is built from the local ``texts`` list, not from
        # ``self.texts``: join with spaces, then split on whitespace.
        self.words = " ".join(texts).split()

        # Every word must be a key of ``word_dict``; a missing one raises.
        lookup = self.word_dict
        self.words_indexes = [lookup[token] for token in self.words]
示例#6
0
    def generate_model_no_length(scoring="accuracy"):
        """Train and save a model on token counts only (no length feature).

        The vectorizer is fitted on the whole ``message`` column before the
        80/20 split (``random_state=0``). The result of
        ``test_tool.auto_test`` on the training part is passed to
        ``MultiNB_Wrapper.train_once``; the model and vectorizer are saved
        under the names ``old_model`` / ``old_cv``, and the confusion matrix
        on the held-out part is printed.

        Args:
            scoring: scoring name forwarded to ``test_tool.auto_test``.

        Returns:
            The trained model returned by ``MultiNB_Wrapper.train_once``.
        """
        sms_df = get_sms_dataset(noStopwords=True, overwrite=True)
        messages = sms_df['message']
        labels = sms_df['target']

        vectorizer = CountVectorizer()
        counts = vectorizer.fit_transform(messages)

        train_X, test_X, train_y, test_y = train_test_split(counts,
                                                            labels,
                                                            random_state=0,
                                                            test_size=0.2)

        # NOTE(review): presumably the best parameter found by the search
        # over the 0.1..1.5 range in 200 steps - confirm in test_tool.
        best_param = test_tool.auto_test(train_X, train_y, 0.1, 1.5, 200,
                                         scoring)

        model = MultiNB_Wrapper.train_once(train_X, train_y, best_param)
        MultiNB_Wrapper.save_model(model,
                                   vectorizer,
                                   model_name='old_model',
                                   cv_name="old_cv")

        predictions = model.predict(test_X)
        print(metrics.confusion_matrix(test_y, predictions))
        return model
示例#7
0
    def generate_model(scoring="precision"):
        """Train and save a model on token counts plus the ``length`` column.

        The vectorizer is fitted on the whole ``message`` column, the
        ``length`` column is appended as one extra sparse feature column, and
        the combined matrix is split 80/20 (``random_state=0``). The result
        of ``test_tool.auto_test`` on the training part is passed to
        ``MultiNB_Wrapper.train_once``; the model and vectorizer are saved
        with ``save_model``'s default names, and the confusion matrix on the
        held-out part is printed.

        Args:
            scoring: scoring name forwarded to ``test_tool.auto_test``.

        Returns:
            The trained model returned by ``MultiNB_Wrapper.train_once``.
        """
        sms_df = get_sms_dataset(noStopwords=True, overwrite=True)
        messages = sms_df['message']
        labels = sms_df['target']

        vectorizer = CountVectorizer()
        counts = vectorizer.fit_transform(messages)

        # Reshape the sparse lengths into a single column so they can be
        # stacked to the right of the count matrix.
        length_sparse = csr_matrix(sms_df['length'])
        length_column = length_sparse.reshape(-1, 1)
        features = hstack((counts, length_column))

        train_X, test_X, train_y, test_y = train_test_split(features,
                                                            labels,
                                                            random_state=0,
                                                            test_size=0.2)

        # NOTE(review): presumably the best parameter found by the search
        # over the 0.1..1.5 range in 200 steps - confirm in test_tool.
        best_param = test_tool.auto_test(train_X, train_y, 0.1, 1.5, 200,
                                         scoring)

        model = MultiNB_Wrapper.train_once(train_X, train_y, best_param)

        MultiNB_Wrapper.save_model(model, vectorizer)

        predictions = model.predict(test_X)
        print(metrics.confusion_matrix(test_y, predictions))
        return model
示例#8
0
"""
@Author: peviroy
@Date: 2020-09-03 20:54
@Last Modified by: peviroy
@Last Modified time: 2020-09-06 21:07
"""

import os
os.chdir(os.path.split(os.path.realpath(__file__))[0])
import sys
sys.path.append(os.path.abspath(".."))

from dataset import get_sms_dataset
from utils.preprocessing import TextPurifier

# Module-level fixtures: the raw dataset is loaded once at import time and a
# single TextPurifier built over it is shared by the tests below.
# NOTE(review): 'v2' is presumably the message-text column of the raw file -
# confirm against the dataset loader.
raw_texts = get_sms_dataset(raw=True)['v2'].tolist()
tp = TextPurifier(raw_texts)


class TestTextPurifierInstance:
    def test_str(self):
        """The string form of the shared purifier must begin with 'Name:'."""
        rendered = tp.__str__()
        assert rendered.startswith('Name:')

    def test_iter(self):
        """No entry yielded by the shared purifier's show_iter() may equal []."""
        for _rule_name, hits in tp.show_iter():
            assert hits != []

    def test_purify(self):
        purifed_texts = tp.purify()
        tp.set_texts(purifed_texts)
        for rule_name, matched_strings in tp.show_iter():
示例#9
0
 def __init__(self, transform=SMSTransform(), file_path='data/spam.csv'):
     """Load the SMS data from ``file_path`` and run it through ``transform``.

     Args:
         transform: callable invoked as ``transform(texts, targets)``; it
             must return four values, stored as ``texts``, ``targets``,
             ``word_dict`` and ``index_dict``. The default instance is
             created once, when this function is defined, and is shared by
             every call that does not pass its own transform.
         file_path: passed to ``get_sms_dataset`` as ``SMS_DATASET``.
     """
     frame = get_sms_dataset(SMS_DATASET=file_path)
     texts = frame.message.to_list()
     targets = frame.target.to_list()
     (self.texts, self.targets,
      self.word_dict, self.index_dict) = transform(texts, targets)
示例#10
0
 def test_raw_file(self):
     """Loading the dataset with ``raw=True`` must return something other than None."""
     raw_df = get_sms_dataset(raw=True)
     assert raw_df is not None
示例#11
0
 def test_null_raw_file(self):
     """Loading a raw dataset from the path 'null' must raise FileNotFoundError.

     Only ``FileNotFoundError`` is caught, so any other exception propagates
     and fails the test. If the call returns without raising, the ``else``
     branch fails the test explicitly; previously that case passed silently
     because the only assertion was inside the ``except`` handler.
     """
     try:
         get_sms_dataset(raw=True, SMS_DATASET='null')
     except FileNotFoundError:
         pass
     else:
         raise AssertionError(
             "get_sms_dataset(raw=True, SMS_DATASET='null') did not raise "
             "FileNotFoundError")