class TrainModel:
    """Train a ``LinearSVC`` on preprocessed conversations and pickle it.

    The heavy lifting is delegated to two collaborators created in
    ``__init__``: ``DataPreprocess`` (loading/cleaning the raw data) and
    ``BuildFeatures`` (oversampling, vectorisation, dimension reduction).

    Attributes:
        model_path: Where the fitted classifier is pickled by ``save_model``.
        data_preprocessing: ``DataPreprocess`` instance driving the data
            preparation steps.
        build_features: ``BuildFeatures`` instance driving feature building.
        X, y: Samples and labels returned by ``get_X_y()``; ``None`` until
            ``run`` has been called.
    """

    def __init__(self):
        self.model_path = r'models/svc_model.pkl'
        self.data_preprocessing = DataPreprocess()
        self.build_features = BuildFeatures()
        self.X = None
        self.y = None

    def save_model(self, model):
        """Pickle *model* to ``self.model_path``.

        The parent directory is created when missing so that a finished
        training run is not lost to a ``FileNotFoundError`` at the very last
        step, and the file is written through a context manager so the handle
        is flushed and closed deterministically.
        """
        import os  # local import: keeps this block independent of file-level imports

        directory = os.path.dirname(self.model_path)
        if directory:  # a bare file name has no directory to create
            os.makedirs(directory, exist_ok=True)
        with open(self.model_path, 'wb') as fp:
            pickle.dump(model, fp)

    def run(self):
        """Run the full pipeline: preprocess, split, featurise, fit, report, save.

        Prints training and held-out accuracy (as percentages) and writes the
        fitted model to ``self.model_path``.
        """
        # Data preprocessing pipeline; the steps are executed in this fixed order.
        preprocessing = self.data_preprocessing
        preprocessing.load_csv()
        preprocessing.clean_conversation()
        preprocessing.extract_meaning_phrases()
        preprocessing.group_convs_by_file_id()
        preprocessing.rm_dups_phrases_in_same_conv()
        self.X, self.y = preprocessing.get_X_y()

        # Train and test set: 10% held out, stratified on the labels.
        # NOTE(review): no random_state is passed, so the split differs between
        # runs even though the classifier itself is seeded - confirm intended.
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.1, stratify=self.y)

        # Build features.
        features = self.build_features

        # Oversampling on training data only, so the test set stays untouched.
        X_train, y_train = features.oversampling_on_training_data(
            X_train, y_train)

        # Word to vectors: the model is built from the training data, then
        # applied to both splits.
        features.word_to_vectors_model(X_train)
        X_train = features.word_to_vectors_transformed(X_train)
        X_test = features.word_to_vectors_transformed(X_test)

        # Dimension reduction technique: same build-on-train / apply-to-both
        # pattern as above.
        features.dimension_reduction_model(X_train)
        X_train = features.dimension_reduction_transformed(X_train)
        X_test = features.dimension_reduction_transformed(X_test)

        # Train model.
        model = LinearSVC(random_state=25)
        model.fit(X_train, y_train)

        print('\n\n')
        print('-*-' * 20)
        print('Training accuracy: ', model.score(X_train, y_train) * 100)
        print('Accuracy on unseen documents: ',
              model.score(X_test, y_test) * 100)
        print('-*-' * 20)

        # Save.
        self.save_model(model)
class Classifier:
    """Classify the conversation stored in a text file with the saved model.

    The file to classify is taken from the ``--text_file`` command-line
    option, which is parsed when the instance is constructed.

    Attributes:
        text_file_path: Value of ``--text_file``; ``None`` when the option
            was not supplied.
        df: DataFrame built by ``create_dataframe``; ``None`` until then.
        X: Initialised to ``None``; not assigned anywhere else in this class.
        word_to_vector_model_path, dim_reduction_path: Artefact paths kept on
            the instance; this class does not read them itself.
        model_path: Location of the pickled classifier read by ``load_model``.
        data_preprocessing: ``DataPreprocess`` instance used by ``run``.
        build_features: ``BuildFeatures`` instance used by ``run``.
    """

    def __init__(self):
        # The previous description ("List the content of a folder") was a
        # copy-paste leftover that misdescribed the tool in --help output.
        parser = argparse.ArgumentParser(
            description='Classify the conversation in a text file')
        parser.add_argument('--text_file', type=str,
                            help='File path to classify')
        args = parser.parse_args()

        self.text_file_path = args.text_file
        self.df = None
        self.X = None
        self.word_to_vector_model_path = r'models/w2v.pkl'
        self.dim_reduction_path = r'models/dim_reduction.pkl'
        self.model_path = r'models/svc_model.pkl'
        self.data_preprocessing = DataPreprocess()
        self.build_features = BuildFeatures()

    def read_text_file(self):
        """Return the lines of ``self.text_file_path`` as a list of strings.

        Carriage-return and newline characters are stripped from both ends of
        every line; blank lines are kept as empty strings.
        """
        with open(self.text_file_path) as fp:
            # Iterating the handle yields the same lines as readlines().
            return [line.strip('\r\n') for line in fp]

    def create_dataframe(self, text):
        """Store *text* in ``self.df`` as a single ``conversation`` column."""
        self.df = pd.DataFrame(text, columns=['conversation'])

    def load_model(self):
        """Unpickle and return the classifier stored at ``self.model_path``.

        NOTE: unpickling can execute arbitrary code, so ``model_path`` must
        only ever point at a trusted file.
        """
        # Context manager so the handle is closed even if unpickling fails.
        with open(self.model_path, 'rb') as fp:
            return pickle.load(fp)

    def run(self):
        """Read the input file, build features, and print the predicted class."""
        text = self.read_text_file()
        self.create_dataframe(text)

        # Data preprocessing pipeline; the steps are executed in this fixed order.
        preprocessing = self.data_preprocessing
        preprocessing.test_fill_df(self.df)
        preprocessing.clean_conversation()
        preprocessing.extract_meaning_phrases()
        preprocessing.test_group_convs()
        preprocessing.rm_dups_phrases_in_same_conv()
        X_test = preprocessing.test_get_X()
        print(len(X_test))

        # Word to vectors.
        X_test = self.build_features.word_to_vectors_transformed(X_test)

        # Dimension reduction technique.
        X_test = self.build_features.dimension_reduction_transformed(X_test)

        model = self.load_model()
        print('-*-' * 20)
        predicted_class = model.predict(X_test)
        print('Result: ', predicted_class)