예제 #1
0
class TrainModel:
    """Train a ``LinearSVC`` on preprocessed conversations and pickle it.

    The heavy lifting is delegated to two collaborators created in
    ``__init__``: ``DataPreprocess`` (loading/cleaning the data) and
    ``BuildFeatures`` (oversampling, vectorization, dimension reduction).
    """

    def __init__(self):
        # Destination of the pickled classifier written at the end of run().
        self.model_path = r'models/svc_model.pkl'
        self.data_preprocessing = DataPreprocess()
        self.build_features = BuildFeatures()
        # Populated by run() from DataPreprocess.get_X_y().
        self.X = None
        self.y = None

    def run(self):
        """Execute the full training pipeline and save the fitted model.

        Steps, in order: preprocess the data, split it into train/test
        sets, oversample the training set, fit and apply the word-vector
        and dimension-reduction transforms, fit a ``LinearSVC``, print the
        train/test accuracy, and pickle the model to ``self.model_path``.

        Returns:
            None. The accuracy figures are printed and the model is
            written to disk.
        """
        # Data preprocessing pipeline. The call order is kept exactly as-is:
        # each step presumably relies on state left by the previous one
        # inside DataPreprocess -- TODO confirm against that class.
        self.data_preprocessing.load_csv()
        self.data_preprocessing.clean_conversation()
        self.data_preprocessing.extract_meaning_phrases()
        self.data_preprocessing.group_convs_by_file_id()
        self.data_preprocessing.rm_dups_phrases_in_same_conv()
        self.X, self.y = self.data_preprocessing.get_X_y()

        # Train and test set; the split is stratified on the labels.
        # NOTE(review): no random_state is passed, so (assuming this is
        # scikit-learn's train_test_split) the reported accuracies will
        # vary between runs.
        X_train, X_test, y_train, y_test = train_test_split(self.X,
                                                            self.y,
                                                            test_size=0.1,
                                                            stratify=self.y)

        # Build features.
        # Oversampling is applied to the training data only, so the test
        # set is left as produced by the split.
        X_train, y_train = self.build_features.oversampling_on_training_data(
            X_train, y_train)

        # Word to vectors: the model is built from the training data and
        # then applied to both sets.
        self.build_features.word_to_vectors_model(X_train)
        X_train = self.build_features.word_to_vectors_transformed(X_train)
        X_test = self.build_features.word_to_vectors_transformed(X_test)

        # Dimension reduction: likewise built from the training data only.
        self.build_features.dimension_reduction_model(X_train)
        X_train = self.build_features.dimension_reduction_transformed(X_train)
        X_test = self.build_features.dimension_reduction_transformed(X_test)

        # Train model.
        model = LinearSVC(random_state=25)
        model.fit(X_train, y_train)
        print('\n\n')
        print('-*-' * 20)
        print('Training accuracy: ', model.score(X_train, y_train) * 100)
        print('Accuracy on unseen documents: ',
              model.score(X_test, y_test) * 100)
        print('-*-' * 20)

        # Save the model. The context manager guarantees the file is flushed
        # and closed even if pickling raises part-way through.
        with open(self.model_path, 'wb') as fp:
            pickle.dump(model, fp)
예제 #2
0
class Classifier:
    """Classify the conversations in a text file with the saved SVC model.

    The input file path is taken from the ``--text_file`` command-line
    option, which is parsed when the object is constructed.
    """

    def __init__(self):
        # NOTE: parse_args() reads the process command line, so constructing
        # this class has a dependency on how the program was invoked.
        parser = argparse.ArgumentParser(
            description='Classify the conversations in a text file')
        parser.add_argument('--text_file', type=str, help='File path to classify')
        args = parser.parse_args()
        # None when --text_file is not supplied on the command line.
        self.text_file_path = args.text_file
        self.df = None
        self.X = None

        # Artifact locations. Only model_path is read in this class; the
        # other two are presumably consumed by BuildFeatures -- TODO confirm.
        self.word_to_vector_model_path = r'models/w2v.pkl'
        self.dim_reduction_path = r'models/dim_reduction.pkl'
        self.model_path = r'models/svc_model.pkl'
        self.data_preprocessing = DataPreprocess()
        self.build_features = BuildFeatures()

    def read_text_file(self):
        """Return the lines of ``self.text_file_path`` without line endings.

        Returns:
            list[str]: one entry per line of the file, with leading and
            trailing carriage-return / newline characters stripped.
        """
        with open(self.text_file_path) as fp:
            # Iterate the file object directly instead of materializing the
            # raw lines with readlines() first.
            return [line.strip('\r\n') for line in fp]

    def create_dataframe(self, text):
        """Store ``text`` in ``self.df`` as a one-column DataFrame.

        Args:
            text: sequence of strings; each becomes one row of the
                ``conversation`` column.
        """
        self.df = pd.DataFrame(text, columns=['conversation'])

    def load_model(self):
        """Unpickle and return the classifier stored at ``self.model_path``.

        NOTE: pickle can execute arbitrary code while loading; only load
        model files from a trusted source.
        """
        # Use a context manager so the file handle is closed deterministically
        # rather than whenever the garbage collector gets to it.
        with open(self.model_path, 'rb') as fp:
            return pickle.load(fp)

    def run(self):
        """Run the inference pipeline on the input file and print the result.

        Returns:
            The value returned by ``model.predict`` for the preprocessed
            input (also printed to stdout).
        """
        text = self.read_text_file()
        self.create_dataframe(text)

        # Data preprocessing pipeline; uses the test_* entry points of
        # DataPreprocess in place of the CSV-based ones.
        self.data_preprocessing.test_fill_df(self.df)
        self.data_preprocessing.clean_conversation()
        self.data_preprocessing.extract_meaning_phrases()
        self.data_preprocessing.test_group_convs()
        self.data_preprocessing.rm_dups_phrases_in_same_conv()
        X_test = self.data_preprocessing.test_get_X()
        print(len(X_test))

        # Word to vectors. Only the transform step is called here;
        # presumably BuildFeatures loads the previously fitted model
        # itself -- TODO confirm.
        X_test = self.build_features.word_to_vectors_transformed(X_test)
        # Dimension reduction technique (same remark as above).
        X_test = self.build_features.dimension_reduction_transformed(X_test)

        model = self.load_model()
        print('-*-' * 20)
        predicted_class = model.predict(X_test)
        print('Result: ', predicted_class)
        return predicted_class