def test_fs_permute(cvs, X_test1, y_test1, cluster_dir):
    """Evaluate an SVM regressor before and after permutation feature selection.

    Trains an SVM on the CV folds, logs its test metrics, runs the FS
    permutation selector, restricts every fold (and the hold-out set) to the
    selected features, retrains, and logs the metrics again for comparison.

    NOTE(review): the loop below rebinds ``cvs[i][k]`` in place, so the
    caller's ``cvs`` lists are mutated — confirm that is acceptable.
    """
    logger = logging.getLogger('log_rbf_cnn_test.log')
    logger.setLevel(logging.INFO)
    # BUG FIX: guard so repeated calls do not stack duplicate FileHandlers,
    # which would make every record appear multiple times in the log file.
    if not logger.handlers:
        handler = logging.FileHandler(
            os.path.join(cluster_dir, 'log_rbf_cnn_test.log'), 'a')
        handler.setLevel(logging.INFO)
        # create a logging format
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        # add the handlers to the logger
        logger.addHandler(handler)
    rated = None
    static_data = write_database()
    logger.info('Permutation Evaluation')
    # BUG FIX: was '/n' — a literal slash-n, not the intended blank line.
    logger.info('\n')

    # Baseline: SVM on the full feature set.
    method = 'svm'
    model_sklearn = sklearn_model(cluster_dir, rated, method,
                                  static_data['sklearn']['njobs'])
    model_sklearn.train(cvs)
    pred = model_sklearn.predict(X_test1)
    metrics_svm = model_sklearn.compute_metrics(pred, y_test1, rated)
    logger.info('before feature selection metrics')
    logger.info('sse, %s rms %s, mae %s, mse %s', *metrics_svm)

    # Permutation-based feature selection.
    fs = FS(cluster_dir, static_data['sklearn']['njobs'])
    features = fs.fit(cvs)
    logger.info('Number of variables %s', str(features.shape[0]))

    # Restrict the train/val/test matrices of every fold to the kept columns.
    for i in range(3):
        cvs[i][0] = cvs[i][0][:, features]
        cvs[i][2] = cvs[i][2][:, features]
        cvs[i][4] = cvs[i][4][:, features]

    # Retrain on the reduced feature set and report metrics again.
    model_sklearn = sklearn_model(cluster_dir, rated, method,
                                  static_data['sklearn']['njobs'])
    model_sklearn.train(cvs)
    pred = model_sklearn.predict(X_test1[:, features])
    metrics_svm = model_sklearn.compute_metrics(pred, y_test1, rated)
    logger.info('After feature selection metrics')
    logger.info('sse, %s rms %s, mae %s, mse %s', *metrics_svm)
def predict(self, X):
    """Combine the per-method predictions in ``X`` into final predictions.

    Parameters
    ----------
    X : dict mapping method name -> prediction array (one column block each).

    Returns
    -------
    dict mapping each combine method to its combined prediction array.

    Raises
    ------
    ValueError if no combine model has been trained/loaded on this instance.
    """
    X_pred = np.array([])
    if not hasattr(self, 'best_methods'):
        self.best_methods = X.keys()
    # Stack the selected methods' predictions column-wise, in sorted order so
    # columns line up with the order used at training time.
    for method in sorted(self.best_methods):
        if X_pred.shape[0] == 0:
            X_pred = X[method]
        else:
            X_pred = np.hstack((X_pred, X[method]))
    # Predictions are stored scaled by 20; undo before combining, redo after.
    X_pred /= 20
    if not hasattr(self, 'model'):
        raise ValueError('The combine models does not exist')
    pred_combine = dict()
    for combine_method in self.combine_methods:
        if combine_method in ('rls', 'bcp'):
            # Linear combination with the learned weight vector.
            # NOTE(review): matmul(w, X_pred) requires w's trailing dim to
            # match X_pred's leading dim — confirm w's shape from rls/bcp_fit.
            pred = np.matmul(self.model[combine_method]['w'], X_pred)
        elif combine_method == 'mlp':
            # sklearn_model reloads the persisted MLP from model_dir.
            self.model[combine_method] = sklearn_model(
                self.model_dir, self.rated, 'mlp', self.n_jobs)
            pred = self.model[combine_method].predict(X_pred)
        elif combine_method in ('bayesian_ridge', 'elastic_net', 'ridge',
                                'isotonic'):
            # BUG FIX: these branches previously re-instantiated a *fresh,
            # unfitted* estimator (BayesianRidge(), ElasticNetCV(), ...) and
            # immediately called predict on it, which raises NotFittedError.
            # Use the fitted estimator stored in self.model at training time.
            pred = self.model[combine_method].predict(X_pred)
        else:
            # Fallback: simple average across methods.
            pred = np.mean(X_pred, axis=1).reshape(-1, 1)
        pred_combine[combine_method] = 20 * pred
    return pred_combine
def train(self, X_test, y_test, act_test, X_cnn_test, X_lstm_test):
    """Train the combine models on hold-out predictions of the base methods.

    Selects the best base methods by MAE, stacks their (resampled) hold-out
    predictions, and fits each configured combine model (rls, bcp, mlp,
    bayesian_ridge, elastic_net, ridge). Falls back to 'average' when there
    is no test data or fewer than two methods. Persists the result.
    """
    if X_test.shape[0] > 0 and len(self.methods) > 1:
        # The three original branches for {'pv','wind'}, {'load'} and {'fa'}
        # executed identical code; they are collapsed into one.
        if self.model_type in {'pv', 'wind', 'load', 'fa'}:
            if self.resampling:
                pred_resample, y_resample, results = self.resampling_for_combine(
                    X_test, y_test, act_test, X_cnn_test, X_lstm_test)
            else:
                pred_resample, y_resample, results = self.without_resampling(
                    X_test, y_test, act_test, X_cnn_test, X_lstm_test)
        # Keep the 4 methods with smallest MAE, then keep only those within
        # 0.01 of the best; always keep at least two.
        self.best_methods = results.nsmallest(4, 'mae').index.tolist()
        results = results.loc[self.best_methods]
        results['diff'] = results['mae'] - results['mae'].iloc[0]
        best_of_best = results.iloc[np.where(
            results['diff'] <= 0.01)].index.tolist()
        if len(best_of_best) == 1:
            best_of_best.append(self.best_methods[1])
        self.best_methods = best_of_best

        # Stack the kept methods' predictions column-wise (sorted order must
        # match predict()).
        X_pred = np.array([])
        for method in sorted(self.best_methods):
            if X_pred.shape[0] == 0:
                X_pred = pred_resample[method]
            else:
                X_pred = np.hstack((X_pred, pred_resample[method]))
        # Predictions/targets are stored scaled by 20; clip negatives.
        X_pred /= 20
        X_pred[X_pred < 0] = 0
        y_resample /= 20
        X_pred, y_resample = shuffle(X_pred, y_resample)
        self.weight_size = len(self.best_methods)
        self.model = dict()
        for combine_method in self.combine_methods:
            if combine_method == 'rls':
                self.logger.info('RLS training')
                # BUG FIX: was '/n' — a literal slash-n, not a blank line.
                self.logger.info('\n')
                self.model[combine_method] = {'w': self.rls_fit(X_pred, y_resample)}
            elif combine_method == 'bcp':
                self.logger.info('BCP training')
                self.logger.info('\n')
                self.model[combine_method] = {'w': self.bcp_fit(X_pred, y_resample)}
            elif combine_method == 'mlp':
                self.logger.info('MLP training')
                self.logger.info('\n')
                # Build 3 random train/val/test splits for the MLP trainer.
                cvs = []
                for _ in range(3):
                    X_train, X_test1, y_train, y_test1 = train_test_split(
                        X_pred, y_resample, test_size=0.15)
                    X_train, X_val, y_train, y_val = train_test_split(
                        X_train, y_train, test_size=0.15)
                    cvs.append(
                        [X_train, y_train, X_val, y_val, X_test1, y_test1])
                mlp_model = sklearn_model(self.model_dir, self.rated, 'mlp',
                                          self.n_jobs, is_combine=True)
                self.model[combine_method] = mlp_model.train(cvs)
            elif combine_method == 'bayesian_ridge':
                self.logger.info('bayesian_ridge training')
                self.logger.info('\n')
                self.model[combine_method] = BayesianRidge()
                self.model[combine_method].fit(X_pred, y_resample)
            elif combine_method == 'elastic_net':
                self.logger.info('elastic_net training')
                self.logger.info('\n')
                self.model[combine_method] = ElasticNetCV(cv=5)
                self.model[combine_method].fit(X_pred, y_resample)
            elif combine_method == 'ridge':
                self.logger.info('ridge training')
                self.logger.info('\n')
                self.model[combine_method] = RidgeCV(cv=5)
                self.model[combine_method].fit(X_pred, y_resample)
        self.logger.info('End of combine models training')
    else:
        # Not enough data/methods to learn a combiner; fall back to averaging.
        self.combine_methods = ['average']
    self.istrained = True
    self.save(self.model_dir)
def fit_model(self, cvs, method, static_data, cluster_dir, optimize_method,
              X_cnn=np.array([]), X_lstm=np.array([]), y=np.array([]),
              rated=1):
    """Train (or load) the regression model selected by ``method``.

    ``optimize_method`` picks the sklearn hyper-parameter search backend
    (deap, optuna, skopt, or grid search). Trained models (or their stored
    dicts) are placed in ``self.models`` under the method's key, and the
    cluster state is persisted at the end.
    """
    # deap, optuna, skopt, grid_search
    if optimize_method == 'deap':
        from Fuzzy_clustering.ver_tf2.Sklearn_models_deap import sklearn_model
    elif optimize_method == 'optuna':
        from Fuzzy_clustering.ver_tf2.Sklearn_models_optuna import sklearn_model
    elif optimize_method == 'skopt':
        from Fuzzy_clustering.ver_tf2.Sklearn_models_skopt import sklearn_model
    else:
        from Fuzzy_clustering.ver_tf2.SKlearn_models import sklearn_model

    def _fit(model, log_name, train_fn):
        # Train when the model is untrained or online retraining is
        # requested; otherwise reuse the stored model parameters.
        if not model.istrained or static_data['train_online']:
            self.logger.info('Start of training of ' + log_name)
            return train_fn()
        return model.to_dict()

    def _fit_rbf_family():
        # Train/load the three RBF variants shared by 'ML_RBF_ALL' and
        # 'ML_RBF_ALL_CNN'; returns them so the CNN branch can reuse them.
        model_rbf = rbf_model(static_data['RBF'], rated, cluster_dir)
        model_rbf_ols = rbf_ols_module(cluster_dir, rated,
                                       static_data['sklearn']['njobs'],
                                       GA=False)
        model_rbf_ga = rbf_ols_module(cluster_dir, rated,
                                      static_data['sklearn']['njobs'],
                                      GA=True)
        self.models['RBF_OLS'] = _fit(
            model_rbf_ols, 'model_rbf_ols',
            lambda: model_rbf_ols.optimize_rbf(cvs))
        self.models['GA_RBF_OLS'] = _fit(
            model_rbf_ga, 'model_rbf_ga',
            lambda: model_rbf_ga.optimize_rbf(cvs))
        self.models['RBFNN'] = _fit(
            model_rbf, 'model_rbf_adam',
            lambda: model_rbf.rbf_train(cvs))
        return model_rbf, model_rbf_ols, model_rbf_ga

    if method == 'ML_RBF_ALL':
        _fit_rbf_family()
    elif method == 'ML_RBF_ALL_CNN':
        # RBF trio first, then a CNN stacked on top of their outputs.
        model_rbf, model_rbf_ols, model_rbf_ga = _fit_rbf_family()
        rbf_dir = [
            model_rbf_ols.cluster_dir, model_rbf_ga.cluster_dir,
            model_rbf.cluster_dir
        ]
        model_cnn = cnn_model(static_data, rated, cluster_dir, rbf_dir)
        self.models['RBF-CNN'] = _fit(
            model_cnn, 'model_cnn', lambda: model_cnn.train_cnn(cvs))
    elif method in ('ML_NUSVM', 'ML_MLP', 'ML_SVM', 'ML_RF', 'ML_XGB'):
        # All plain sklearn-style models follow the same train-or-load
        # pattern; the model key and log name are the method without 'ML_'.
        name = method.replace('ML_', '')
        model_sklearn = sklearn_model(cluster_dir, rated, name,
                                      static_data['sklearn']['njobs'])
        self.models[name] = _fit(
            model_sklearn, name, lambda: model_sklearn.train(cvs))
    elif method == 'ML_CNN_3d':
        cnn_model_3d = cnn_3d_model(static_data, rated, cluster_dir)
        self.models['CNN_3d'] = _fit(
            cnn_model_3d, 'CNN_3d', lambda: cnn_model_3d.train_cnn(X_cnn, y))
    elif method == 'ML_LSTM_3d':
        lstm_model_3d = lstm_3d_model(static_data, rated, cluster_dir)
        self.models['LSTM_3d'] = _fit(
            lstm_model_3d, 'LSTM_3d',
            lambda: lstm_model_3d.train_lstm(X_lstm, y))
    self.save(self.cluster_dir)
def fit(self, cvs):
    """Greedy permutation-style feature selection driven by an SVM regressor.

    Starting from a random ordering of all columns of the first CV fold,
    repeatedly retrains the SVM with one candidate feature left out and,
    based on the resulting ``acc_test``, moves that feature into either
    ``remove_features`` or ``keep_features`` until every feature has been
    classified. Stores and returns the kept feature indices.

    NOTE(review): the branch logic looks inverted if ``acc_test`` is an
    accuracy (higher = better): a *drop* in acc_test when feature ``f`` is
    excluded leads to *removing* ``f``. This is consistent only if
    ``acc_test`` is actually an error-like score — confirm against
    sklearn_model before touching this loop.
    """
    logger = logging.getLogger('log_fs_permutation')
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(
        os.path.join(self.log_dir, 'log_fs_perm.log'), 'w')
    handler.setLevel(logging.INFO)
    # create a logging format
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(handler)
    print()
    print('Training the model (Fitting to the training data) ')
    logger.info('Training the feature extraction ')
    # Baseline: SVM trained on all features establishes the reference score.
    method = 'svm'
    regressor = sklearn_model(self.log_dir, 1, method, self.njobs)
    regressor.train(cvs)
    # Update classifier parameters
    estimator = regressor.model
    # Random feature order; the importance-sorted alternative is kept below.
    features = np.arange(cvs[0][0].shape[1])
    np.random.shuffle(features)
    # features=features[np.argsort(estimator.feature_importances_)]
    acc_test = regressor.acc_test
    cv_result = regressor.cv_results.nlargest(10, 'acc')['params'].to_list()
    flag = True
    cvs_temp = copy.deepcopy(cvs)
    remove_features = []
    keep_features = []
    unchecked = np.copy(features)
    while flag:
        for f in unchecked:
            # Candidate set: everything kept so far plus all unchecked
            # features except f (i.e. evaluate the model *without* f).
            features_temp = np.hstack(
                (np.array(keep_features),
                 np.delete(unchecked,
                           np.where(unchecked == f)))).astype('int')
            reg_temp = sklearn_model(os.path.join(self.log_dir, 'temp'), 1,
                                     method, self.njobs)
            # Slice every fold's train/val/test matrices to the candidate set.
            for i in range(3):
                cvs_temp[i][0] = copy.deepcopy(cvs[i][0][:, features_temp])
                cvs_temp[i][2] = copy.deepcopy(cvs[i][2][:, features_temp])
                cvs_temp[i][4] = copy.deepcopy(cvs[i][4][:, features_temp])
            reg_temp.train(cvs_temp)
            cv_result = reg_temp.cv_results.nlargest(
                5, 'acc')['params'].to_list()
            if reg_temp.acc_test < acc_test:
                # Score improved without f: discard f, adopt the new
                # reference score, and restart the scan (see NOTE above).
                logger.info('Remove feature %s accuracy: %s', str(f),
                            str(reg_temp.acc_test))
                remove_features.append(f)
                unchecked = np.delete(unchecked, np.where(unchecked == f))
                acc_test = reg_temp.acc_test
                break
            else:
                # Score did not improve without f: keep f.
                logger.info('ADD feature %s accuracy: %s', str(f),
                            str(reg_temp.acc_test))
                keep_features.append(f)
                unchecked = np.delete(unchecked, np.where(unchecked == f))
        if unchecked.shape[0] == 0:
            # Every feature has been classified; terminate the search.
            flag = False
        else:
            # Re-randomize the scan order for the next pass.
            np.random.shuffle(unchecked)
    features = np.array(keep_features)
    self.features = features
    logger.info('Number of variables %s', str(self.features.shape[0]))
    logger.info('Finish the feature extraction ')
    return features
def fit(self, cvs):
    """Select features with a Boruta-style FeatureSelector pipeline.

    Merges the first CV split's train/val/test folds into one design matrix,
    fits (or reuses) a random-forest regressor to obtain the estimator used
    by the relevance step, runs constant/correlated/relevant feature
    filtering, and stores and returns the indices of the surviving columns.
    """
    logger = logging.getLogger('log_fs_boruta.log')
    logger.setLevel(logging.INFO)
    # BUG FIX: guard so repeated calls do not stack duplicate FileHandlers,
    # which would make every record appear multiple times in the log file.
    if not logger.handlers:
        handler = logging.FileHandler(
            os.path.join(self.log_dir, 'log_fs_boruta.log'), 'w')
        handler.setLevel(logging.INFO)
        # create a logging format
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        # add the handlers to the logger
        logger.addHandler(handler)
    print()
    print('Training the model (Fitting to the training data) ')
    logger.info('Training the feature extraction ')
    # Merge the folds of the first CV split into a single X / y.
    X = np.vstack((cvs[0][0], cvs[0][2], cvs[0][4]))
    if len(cvs[0][1].shape) == 1 and len(cvs[0][5].shape) == 1:
        y = np.hstack((cvs[0][1], cvs[0][3], cvs[0][5]))
    else:
        y = np.vstack((cvs[0][1], cvs[0][3], cvs[0][5])).ravel()
    # NOTE(review): D receives n_samples and N n_features here — the names
    # suggest the opposite; confirm how self.D / self.N are used elsewhere.
    self.D, self.N = X.shape
    regressor = sklearn_model(self.log_dir, 1, 'rf', self.njobs)
    if not regressor.istrained:
        regressor.train(cvs)
    # Update classifier parameters
    estimator = regressor.model
    estimator.set_params(n_jobs=-1)
    self.init_params = [regressor.best_params]
    # Define steps
    step1 = {'Constant Features': {'frac_constant_values': 0.999}}
    step2 = {'Correlated Features': {'correlation_threshold': 0.999}}
    step3 = {
        'Relevant Features': {
            'cv': 3,
            'estimator': estimator,
            'n_estimators': 500,
            'max_iter': 20,
            'verbose': 0,
            'random_state': 42
        }
    }
    # NOTE(review): an 'RFECV Features' step (cv=3, step=1, scoring=
    # 'neg_root_mean_squared_error', verbose=50) used to be defined here but
    # was never added to `steps`; removed as dead code — restore and append
    # to `steps` if RFECV pruning is wanted.
    # Place steps in a list in the order you want them execute it
    steps = [step1, step2, step3]
    columns = ['other_' + str(i) for i in range(X.shape[1])]
    X_df = pd.DataFrame(X, columns=columns)
    # Initialize FeatureSelector()
    fs = FeatureSelector()
    # Apply feature selection methods in the order they appear in steps
    fs.fit(X_df, y.ravel(), steps)
    # Map the selected column names back to integer indices; the set lookup
    # keeps this O(n) instead of the original O(n^2) list membership scan.
    selected = set(fs.selected_features)
    features = [i for i, col in enumerate(X_df.columns) if col in selected]
    # Get selected features
    self.features = np.array(features)
    logger.info('Number of variables %s', str(self.features.shape[0]))
    logger.info('Finish the feature extraction ')
    return features
def train(self, lstm=False):
    """Train second-stage combiners on the saved per-method predictions.

    Loads the pickled per-cluster and per-method predictions plus the test
    targets, optionally fits two LSTM combiners (over all methods, and over
    the combine methods only), then fits one MLP and one BCP combiner per
    combine method. Falls back to 'average' when fewer than two combine
    methods exist. Persists the state and returns ``self.to_dict()``.

    Raises
    ------
    ValueError if the per-method predictions pickle is missing.
    """
    if len(self.combine_methods) > 1:
        method_pickle = os.path.join(self.data_dir,
                                     'predictions_by_method.pickle')
        if os.path.exists(method_pickle):
            pred_cluster = joblib.load(
                os.path.join(self.data_dir, 'predictions_by_cluster.pickle'))
            predictions = joblib.load(method_pickle)
            y = pd.read_csv(os.path.join(self.data_dir, 'target_test.csv'),
                            index_col=0,
                            header=[0],
                            parse_dates=True,
                            dayfirst=True)
            self.models = dict()

            def _cluster_tensor(allowed):
                # Build a (samples, clusters, methods) tensor: for each
                # cluster, scatter each allowed method's predictions into a
                # full-length column via the cluster's 'index', stack the
                # columns, then stack the clusters. Shared by both LSTM
                # inputs (the two original loops were identical except for
                # the method filter).
                X = np.array([])
                for clust in pred_cluster.keys():
                    x = np.array([])
                    for method in pred_cluster[clust]:
                        if method in allowed:
                            tmp = np.zeros_like(y.values.reshape(-1, 1))
                            # Unconditional reshape replaces the original
                            # bare try/except fallback: it is a no-op for
                            # (n, 1) inputs and fixes (n,) inputs.
                            tmp[pred_cluster[clust]['index']] = np.asarray(
                                pred_cluster[clust][method]).reshape(-1, 1)
                            if x.shape[0] == 0:
                                x = tmp
                            else:
                                x = np.hstack((x, tmp))
                    if X.shape[0] == 0:
                        X = np.copy(x)
                    elif len(X.shape) == 2:
                        X = np.stack((X, x))
                    else:
                        X = np.vstack((X, x[np.newaxis, :, :]))
                return np.transpose(X, [1, 0, 2]).astype('float')

            if lstm:
                # Targets are stored scaled by 20.
                y_pred = y.values / 20
                self.models['lstm_full'] = self.lstm_fit(
                    _cluster_tensor(self.methods), y_pred, full=True)
                self.models['lstm_combine'] = self.lstm_fit(
                    _cluster_tensor(self.combine_methods), y_pred)

            # One MLP combiner per combine method, trained on 3 random
            # train/val/test splits of that method's predictions.
            for method in self.combine_methods:
                pred = predictions[method].values.astype('float')
                pred[np.isnan(pred)] = 0
                pred /= 20
                y_pred = y.values / 20
                cvs = []
                for _ in range(3):
                    X_train, X_test1, y_train, y_test1 = train_test_split(
                        pred, y_pred, test_size=0.15)
                    X_train, X_val, y_train, y_val = train_test_split(
                        X_train, y_train, test_size=0.15)
                    cvs.append(
                        [X_train, y_train, X_val, y_val, X_test1, y_test1])
                mlp_model = sklearn_model(self.model_dir + '/' + method,
                                          self.rated, 'mlp', self.n_jobs)
                if not mlp_model.istrained:
                    self.models['mlp_' + method] = mlp_model.train(cvs)
                else:
                    self.models['mlp_' + method] = mlp_model.to_dict()

            # One BCP combiner per combine method (on the unscaled values).
            for method in self.combine_methods:
                self.models['bcp_' + method] = self.bcp_fit(
                    predictions[method].values.astype('float'), y.values)
        else:
            raise ValueError('Prediction of regressors missing')
    else:
        # Not enough combine methods to learn anything; just average.
        self.combine_methods = ['average']
    self.istrained = True
    self.save(self.model_dir)
    return self.to_dict()