def train_model_cv(self, train_file, normalize, is_bool_value, is_percentage, cv=10, save_model=False):
    # Training
    self.logger.info("Training Model")
    features_array, label_array, feature_names = self.get_features_array_label_array_from_file(
        train_file, normalize=normalize, is_bool_value=is_bool_value, is_percentage=is_percentage)
    # TODO: you can change the model here. Currently we use k-fold cross-validation (cv folds) for the model.
    # self.model = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
    # self.model = linear_model.Lasso(alpha=0.1)
    self.model = linear_model.LassoCV(cv=cv, normalize=False, verbose=True, max_iter=10000)
    print("Model Settings:", self.model)
    self.model.fit(features_array, label_array)
    # Save the model and scaler to pickle if save_model is True.
    if save_model:
        Pickle_Helper.save_model_to_pickle(self.model, self.dump_model_fname)
        Pickle_Helper.save_model_to_pickle(self.standard_scaler, self.dump_standard_scaler_fname)
    self.print_linear_regression_formular(feature_names)
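# A standalone sketch of the cross-validated training step above, on synthetic
# data (the data shapes and values are illustrative, not from the repo).
# LassoCV selects the regularization strength alpha by internal k-fold
# cross-validation; alpha_ holds the selected value after fitting.
import numpy as np
from sklearn import linear_model

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = X @ np.array([1.5, 0.0, -2.0, 0.0, 0.5]) + 0.1 * rng.randn(200)

model = linear_model.LassoCV(cv=10, max_iter=10000)
model.fit(X, y)
print("chosen alpha:", model.alpha_)
print("coefficients:", model.coef_)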
def load_model_if_exists(self, dump_model_dir=config.PREROCESS_PICKLES_DIR):
    # Load the pickles if not already done. If no pickle exists, one will be trained later.
    self.dump_model_dir = dump_model_dir
    if not os.path.exists(dump_model_dir):
        os.makedirs(dump_model_dir)
    self.generate_model_name()
    self.dump_standard_scaler_fname = os.path.join(
        dump_model_dir, "{}_standard_scaler.pickle".format(self.model_name))
    self.dump_one_hot_encode_fname = os.path.join(
        dump_model_dir, "{}_onehot_encoder.pickle".format(self.model_name))
    self.dump_dictionary_fname = os.path.join(
        dump_model_dir, "{}_dictionary.pickle".format(self.model_name))
    self.dump_counter_vec_fname = os.path.join(
        dump_model_dir, "{}_countvec.pickle".format(self.model_name))
    # self.dump_label_encoder_fname = os.path.join(dump_model_dir, "{}_label.pickle".format(self.model_name))
    self.dump_vocab_processor_fname = os.path.join(
        dump_model_dir, "{}_embedding.pickle".format(self.model_name))
    if not self.replace_exists:
        self.standard_scaler = Pickle_Helper.load_model_from_pickle(self.dump_standard_scaler_fname)
        self.one_hot_encoder = Pickle_Helper.load_model_from_pickle(self.dump_one_hot_encode_fname)
        self.dictionary = Pickle_Helper.load_model_from_pickle(self.dump_dictionary_fname)
        self.counter_vector = Pickle_Helper.load_model_from_pickle(self.dump_counter_vec_fname)
        # self.label_encoder = Pickle_Helper.load_model_from_pickle(self.dump_label_encoder_fname)
        self.vocab_processor = Pickle_Helper.load_model_from_pickle(self.dump_vocab_processor_fname)
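# Pickle_Helper is a project-internal utility whose implementation is not shown
# in this section. A minimal sketch consistent with how it is called here
# (callers test the loaded object against None, so load must return None when
# the pickle file does not exist) could look like this:
import os
import pickle

class Pickle_Helper:
    @staticmethod
    def save_model_to_pickle(model, pickle_fname):
        # Persist any picklable object to the given path.
        with open(pickle_fname, "wb") as f:
            pickle.dump(model, f)

    @staticmethod
    def load_model_from_pickle(pickle_fname):
        # Return the unpickled object, or None if no pickle exists yet.
        if not os.path.exists(pickle_fname):
            return None
        with open(pickle_fname, "rb") as f:
            return pickle.load(f)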
def load_data_if_exists(self, dump_model_dir=config.PREROCESS_PICKLES_DIR):
    # Load the pickles if not already done. If no pickle exists, one will be trained later.
    self.dump_model_dir = dump_model_dir
    if not os.path.exists(dump_model_dir):
        os.makedirs(dump_model_dir)
    self.generate_model_name()
    self.logger.info("Load data model {}".format(self.model_name))
    self.dump_X_train_fname = os.path.join(dump_model_dir, "{}_X_train.pickle".format(self.model_name))
    self.dump_y_train_fname = os.path.join(dump_model_dir, "{}_y_train.pickle".format(self.model_name))
    self.dump_X_test_fname = os.path.join(dump_model_dir, "{}_X_test.pickle".format(self.model_name))
    self.dump_y_test_fname = os.path.join(dump_model_dir, "{}_y_test.pickle".format(self.model_name))
    self.dump_kfold_fname = os.path.join(dump_model_dir, "{}_kfold.pickle".format(self.model_name))
    if not self.replace_exists:
        self.X_train = Pickle_Helper.load_model_from_pickle(self.dump_X_train_fname)
        self.y_train = Pickle_Helper.load_model_from_pickle(self.dump_y_train_fname)
        self.X_test = Pickle_Helper.load_model_from_pickle(self.dump_X_test_fname)
        self.y_test = Pickle_Helper.load_model_from_pickle(self.dump_y_test_fname)
        self.kfold = Pickle_Helper.load_model_from_pickle(self.dump_kfold_fname)
def load_model(self, train_file, is_bool_value, standardize):
    # Load the model if not already done. If no pickle exists, train one.
    self.logger.info("Load Model")
    if self.model is None:
        self.model = Pickle_Helper.load_model_from_pickle(self.dump_model_fname)
        self.dictionary = Pickle_Helper.load_model_from_pickle(self.dump_dictionary_fname)
        self.standard_scaler = Pickle_Helper.load_model_from_pickle(self.dump_standard_scaler_fname)
    if self.model is None:
        self.train_model(train_file, is_bool_value, standardize)
def load_model(self):
    # Load the model if not already done. If no pickle exists, train one.
    self.logger.info("Load Model")
    if self.model is None:
        self.model = Pickle_Helper.load_model_from_pickle(self.dump_model_fname)
        self.dictionary = Pickle_Helper.load_model_from_pickle(self.dump_dictionary_fname)
        self.label_encoder = Pickle_Helper.load_model_from_pickle(self.dump_label_encoder_fname)
    if self.model is None:
        self.train_model(config.WASHINGTON_TOPIC_DATA)
def load_feature_bin_vector_model(self):
    self.dump_catbinvector_fname = os.path.join(
        self.dump_model_dir, "{}_embd_feavector.pickle".format(self.model_name))
    if not self.replace_exists:
        self.custom_feature_vector = Pickle_Helper.load_model_from_pickle(self.dump_catbinvector_fname)
    if self.custom_feature_vector is None:
        start = datetime.datetime.now()
        feature = self.custom_feature_list
        self.custom_feature_vector = CountVectorizer(binary=self.custom_feature_binary)
        self.custom_feature_vector.fit(feature)
        end = datetime.datetime.now()  # Take the end timestamp after fitting, not before.
        self.logger.info("It takes {}s to load {} features.".format(
            (end - start).total_seconds(), len(self.custom_feature_vector.vocabulary_)))
        self.embedding_vector_dimension = len(self.custom_feature_list)
        self.logger.info("The actual embedding_vector_dimension is {}".format(
            self.embedding_vector_dimension))
        self.store_feature_bin_vector()
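# Illustration of the binary CountVectorizer pattern used above: fitting on a
# list of feature tokens fixes the vocabulary, and transform then produces 0/1
# indicator vectors. The feature names here are made up for the example.
from sklearn.feature_extraction.text import CountVectorizer

features = ["has_url", "has_emoji", "all_caps"]
vec = CountVectorizer(binary=True)
vec.fit(features)
print(sorted(vec.vocabulary_.items()))  # [('all_caps', 0), ('has_emoji', 1), ('has_url', 2)]
print(vec.transform(["has_url all_caps"]).toarray())  # [[1 0 1]]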
def store_data(self, replace_exists=False):
    if not os.path.exists(self.dump_X_train_fname) or replace_exists:
        if self.X_train is not None:
            Pickle_Helper.save_model_to_pickle(self.X_train, self.dump_X_train_fname)
    if not os.path.exists(self.dump_y_train_fname) or replace_exists:
        if self.y_train is not None:
            Pickle_Helper.save_model_to_pickle(self.y_train, self.dump_y_train_fname)
    if not os.path.exists(self.dump_X_test_fname) or replace_exists:
        if self.X_test is not None:
            Pickle_Helper.save_model_to_pickle(self.X_test, self.dump_X_test_fname)
    if not os.path.exists(self.dump_y_test_fname) or replace_exists:
        if self.y_test is not None:
            Pickle_Helper.save_model_to_pickle(self.y_test, self.dump_y_test_fname)
    if not os.path.exists(self.dump_kfold_fname) or replace_exists:
        if self.kfold is not None:
            Pickle_Helper.save_model_to_pickle(self.kfold, self.dump_kfold_fname)
def load_model(self, train_file, test_size, cv, normalize, is_bool_value, is_percentage):
    # Load the model if not already done. If no pickle exists, train one.
    self.logger.info("Load Model")
    if self.model is None:
        self.model = Pickle_Helper.load_model_from_pickle(self.dump_model_fname)
        self.standard_scaler = Pickle_Helper.load_model_from_pickle(self.dump_standard_scaler_fname)
    if self.model is None:
        if test_size is None:
            # Cross-validation
            self.train_model_cv(train_file, normalize, is_bool_value, is_percentage, cv)
        else:
            # Train/test split
            self.train_model(train_file, normalize, is_bool_value, is_percentage)
def load_init_model(self):
    if not os.path.exists(self.dump_model_dir):
        os.makedirs(self.dump_model_dir)
    self.generate_model_name()
    # Load the pickles if not already done. If no pickle exists, one will be trained later.
    self.logger.info("Load Model {}".format(self.model_name))
    self.dump_tokenizer_fname = os.path.join(
        self.dump_model_dir, "{}_tokenizer.pickle".format(self.model_name))
    self.dump_embmatrix_fname = os.path.join(
        self.dump_model_dir, "{}_embmatrix.pickle".format(self.model_name))
    if not self.replace_exists:
        self.tokenizer = Pickle_Helper.load_model_from_pickle(self.dump_tokenizer_fname)
        self.embedding_matrix = Pickle_Helper.load_model_from_pickle(self.dump_embmatrix_fname)
def train_model(self, train_file, normalize, is_bool_value, is_percentage, test_size=0.10, alpha=0.1, save_model=False):
    # Training
    self.logger.info("Training Model")
    features_array, label_array, feature_names = self.get_features_array_label_array_from_file(
        train_file, normalize=normalize, is_bool_value=is_bool_value, is_percentage=is_percentage)
    X_train, X_test, y_train, y_test = train_test_split(
        features_array, label_array, test_size=test_size)
    # TODO: you can change the model here. Here we split the data into training and test sets.
    # self.model = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
    # self.model = linear_model.LassoCV(cv=10, normalize=False, verbose=True, max_iter=10000)
    # We could use the best alpha learnt from cross-validation.
    self.model = linear_model.Lasso(alpha=alpha)
    print("Model Settings:", self.model)
    self.logger.info("Training Model")
    self.model.fit(X_train, y_train)
    score = self.model.score(X_test, y_test)
    print("R^2 score", score)
    y_predict = self.model.predict(X_test)
    regression_model_mse = mean_squared_error(y_test, y_predict)  # (y_true, y_pred)
    print("alpha", self.model.alpha)
    print("mse", regression_model_mse)
    if save_model:
        Pickle_Helper.save_model_to_pickle(self.model, self.dump_model_fname)
        Pickle_Helper.save_model_to_pickle(self.standard_scaler, self.dump_standard_scaler_fname)
    self.print_linear_regression_formular(feature_names)
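# The same evaluation flow as train_model above, end to end on synthetic data
# (illustrative only): hold out a test split, fit Lasso with a fixed alpha,
# then report the R^2 score and mean squared error on the held-out set.
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = X @ np.array([1.5, 0.0, -2.0, 0.0, 0.5]) + 0.1 * rng.randn(200)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)
model = linear_model.Lasso(alpha=0.1)
model.fit(X_train, y_train)
print("R^2:", model.score(X_test, y_test))
print("MSE:", mean_squared_error(y_test, model.predict(X_test)))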
def load_label_model(self, dump_model_dir=config.PREROCESS_PICKLES_DIR):
    # Load the label encoder pickle if not already done.
    self.dump_model_dir = dump_model_dir
    if not os.path.exists(dump_model_dir):
        os.makedirs(dump_model_dir)
    self.dump_label_encoder_fname = os.path.join(
        dump_model_dir, "{}_label.pickle".format(self.data_lable_name))
    self.label_encoder = Pickle_Helper.load_model_from_pickle(self.dump_label_encoder_fname)
    print("load label", self.dump_label_encoder_fname)
def get_X_y_featurenames_from_pickle(self, filename, feature_columns: list = None,
                                     label_colnames: list = None, drop_colnames: list = None):
    df = Pickle_Helper.load_model_from_pickle(pickle_fname=filename)
    return self.get_X_y_featurenames_from_dateframe(
        df, feature_columns=feature_columns,
        label_colnames=label_colnames, drop_colnames=drop_colnames)
def train_model(self, train_file, is_bool_value=False, standardize=False):
    # Training
    self.logger.info("Get Features")
    features_array, label_array = self.get_features_array_label_array_from_file(
        train_file, is_training=True, is_bool_value=is_bool_value, standardize=standardize)
    # TODO: check the different parameters.
    # self.model = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
    # self.model = linear_model.Lasso(alpha=0.1)
    self.model = linear_model.LassoCV(cv=10, normalize=False, verbose=True, n_jobs=2)
    print(self.model)
    self.logger.info("Training Model")
    self.model.fit(features_array, label_array)
    Pickle_Helper.save_model_to_pickle(self.model, self.dump_model_fname)
    Pickle_Helper.save_model_to_pickle(self.dictionary, self.dump_dictionary_fname)
    Pickle_Helper.save_model_to_pickle(self.standard_scaler, self.dump_standard_scaler_fname)
    self.print_linear_regression_formular()
def load_model_if_exists(self, classifier_name="general", preprocess_name="general",
                         dump_model_dir=config.PREROCESS_PICKLES_DIR):
    # Load the model pickle if not already done. If no pickle exists, one will be trained later.
    self.logger.info("Load Model")
    self.dump_model_dir = dump_model_dir
    if not os.path.exists(dump_model_dir):
        os.makedirs(dump_model_dir)
    self.model_name = "{}_{}".format(classifier_name, preprocess_name)
    self.dump_model_fname = os.path.join(dump_model_dir, "{}.pickle".format(self.model_name))
    self.model = Pickle_Helper.load_model_from_pickle(self.dump_model_fname)
def train_model(self, train_file):
    # Training
    self.logger.info("Training Model")
    features_array, label_array = self.get_features_array_label_array_from_file(
        train_file, is_training=True)
    self.model = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                                    class_weight='balanced')
    self.model.fit(features_array, label_array)
    Pickle_Helper.save_model_to_pickle(self.model, self.dump_model_fname)
    Pickle_Helper.save_model_to_pickle(self.dictionary, self.dump_dictionary_fname)
    Pickle_Helper.save_model_to_pickle(self.label_encoder, self.dump_label_encoder_fname)
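# Minimal illustration of the classifier configuration above: multinomial
# logistic regression with balanced class weights, fit on toy data (the data
# points and labels are made up for the example).
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0, 1.0], [0.2, 0.9], [1.0, 0.1], [0.9, 0.0], [0.5, 0.5], [0.4, 0.6]])
y = np.array([0, 0, 1, 1, 2, 2])

clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', class_weight='balanced')
clf.fit(X, y)
print(clf.predict([[0.1, 0.95]]))  # likely class 0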
def store_lable_model(self, replace_exists=False):
    if not os.path.exists(self.dump_label_encoder_fname) or replace_exists:
        if self.label_encoder is not None:
            Pickle_Helper.save_model_to_pickle(self.label_encoder, self.dump_label_encoder_fname)
def store_model(self, replace_exists=False):
    if not os.path.exists(self.dump_standard_scaler_fname) or replace_exists:
        if self.standard_scaler is not None:
            Pickle_Helper.save_model_to_pickle(self.standard_scaler, self.dump_standard_scaler_fname)
    if not os.path.exists(self.dump_one_hot_encode_fname) or replace_exists:
        if self.one_hot_encoder is not None:
            Pickle_Helper.save_model_to_pickle(self.one_hot_encoder, self.dump_one_hot_encode_fname)
    if not os.path.exists(self.dump_dictionary_fname) or replace_exists:
        if self.dictionary is not None:
            Pickle_Helper.save_model_to_pickle(self.dictionary, self.dump_dictionary_fname)
    if not os.path.exists(self.dump_counter_vec_fname) or replace_exists:
        if self.counter_vector is not None:
            Pickle_Helper.save_model_to_pickle(self.counter_vector, self.dump_counter_vec_fname)
    if not os.path.exists(self.dump_label_encoder_fname) or replace_exists:
        if self.label_encoder is not None:
            Pickle_Helper.save_model_to_pickle(self.label_encoder, self.dump_label_encoder_fname)
    if not os.path.exists(self.dump_vocab_processor_fname) or replace_exists:
        if self.vocab_processor is not None:
            Pickle_Helper.save_model_to_pickle(self.vocab_processor, self.dump_vocab_processor_fname)
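# The store_* methods above all repeat the same guard: write the pickle only
# when there is something to save and either the file is missing or the caller
# asked to replace it. A hypothetical helper (not in the repo) that captures
# the pattern:
import os

def save_if_needed(obj, fname, replace_exists=False):
    # Skip silently when the object is None or the pickle already exists
    # and replacement was not requested.
    if obj is not None and (not os.path.exists(fname) or replace_exists):
        Pickle_Helper.save_model_to_pickle(obj, fname)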
def store_tokenzier(self, replace_exists=False):
    if not os.path.exists(self.dump_tokenizer_fname) or replace_exists:
        if self.tokenizer is not None:
            Pickle_Helper.save_model_to_pickle(self.tokenizer, self.dump_tokenizer_fname)
def store_embedding_matrix(self, replace_exists=False):
    if not os.path.exists(self.dump_embmatrix_fname) or replace_exists:
        if self.embedding_matrix is not None:
            Pickle_Helper.save_model_to_pickle(self.embedding_matrix, self.dump_embmatrix_fname)
def store_feature_bin_vector(self):
    if not os.path.exists(self.dump_catbinvector_fname) or self.replace_exists:
        if self.custom_feature_vector is not None:
            Pickle_Helper.save_model_to_pickle(self.custom_feature_vector, self.dump_catbinvector_fname)
def store_model_if_not_exits(self, replace_exists=False):
    if not os.path.exists(self.dump_model_fname) or replace_exists:
        Pickle_Helper.save_model_to_pickle(self.model, self.dump_model_fname)
def get_df_from_pickle(self, filename):
    ret = Pickle_Helper.load_model_from_pickle(filename)
    df = pd.DataFrame(ret)
    # print("columns", df.columns.values)
    return df
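# A round-trip illustration of get_df_from_pickle (the file name and records
# are made up): pickle a list of dicts, load it back, and build the DataFrame
# the same way the method does.
import pickle
import pandas as pd

records = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
with open("records.pickle", "wb") as f:
    pickle.dump(records, f)

with open("records.pickle", "rb") as f:
    df = pd.DataFrame(pickle.load(f))
print(df.columns.values)  # ['a' 'b']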