# Boston housing example. Assumes scikit-learn and the GmdhPy Regressor
# (import path per the GmdhPy package layout).
from sklearn import datasets, metrics
from gmdhpy.gmdh import Regressor

boston = datasets.load_boston()
n_samples = boston.data.shape[0]
train_data_is_the_first_half = False  # pick which half of the data to train on

n = n_samples // 2
if train_data_is_the_first_half:
    train_x = boston.data[:n]
    train_y = boston.target[:n]
    test_x = boston.data[n:]
    test_y = boston.target[n:]
else:
    train_x = boston.data[n:]
    train_y = boston.target[n:]
    test_x = boston.data[:n]
    test_y = boston.target[:n]

model = Regressor(ref_functions=('linear_cov',),
                  criterion_type='validate',
                  feature_names=boston.feature_names,
                  criterion_minimum_width=5,
                  stop_train_epsilon_condition=0.001,
                  layer_err_criterion='top',
                  l2=0.5,
                  n_jobs='max')
model.fit(train_x, train_y)

# Now predict the value of the second half:
y_pred = model.predict(test_x)
mse = metrics.mean_squared_error(test_y, y_pred)
mae = metrics.mean_absolute_error(test_y, y_pred)
print("mse error on test set: {mse:0.2f}".format(mse=mse))
print("mae error on test set: {mae:0.2f}".format(mae=mae))
# Iris example: fit a GMDH regressor to the class index. Assumes the same
# imports as above; viris_class (which maps real-valued predictions back to
# class labels) is defined elsewhere in the original script.
iris = datasets.load_iris()
data = iris.data
target = iris.target
n_samples = data.shape[0]

train_data_is_the_first_half = False
n = n_samples // 2
if train_data_is_the_first_half:
    train_x = data[:n]
    train_y = target[:n]
    test_x = data[n:]
    test_y = target[n:]
else:
    train_x = data[n:]
    train_y = target[n:]
    test_x = data[:n]
    test_y = target[:n]

model = Regressor(ref_functions='linear_cov',
                  feature_names=iris.feature_names,
                  criterion_minimum_width=5,
                  stop_train_epsilon_condition=0.0001,
                  l2=0.5,
                  n_jobs=4)
model.fit(train_x, train_y)

# Now predict the value of the second half and map it back to class labels:
pred_y_row = model.predict(test_x)
pred_y = viris_class(pred_y_row)

print(model.get_selected_features_indices())
print(model.get_unselected_features_indices())
print("Selected features: {}".format(model.get_selected_features()))
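# viris_class is not shown above; a minimal sketch of what it might do, as an
# assumption only (round each regression output to the nearest iris class index):
import numpy as np

def viris_class(pred_y_row):
    # hypothetical implementation: clip to the valid class range and round
    return np.clip(np.rint(pred_y_row), 0, 2).astype(int)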
# Fit a noisy 1-D function with x and x^2 as features. Assumes numpy as np,
# matplotlib.pyplot as plt, the Regressor import as above, and a target
# function f defined earlier.
if __name__ == '__main__':
    # generate points
    x = np.linspace(-2, 10, 200)
    n_samples = x.shape[0]

    # add random noise
    eps = 1.5
    eps_data = np.random.uniform(-eps, eps, (n_samples,))
    y = f(x)
    train_y = y + eps_data

    # one row per sample, columns x and x^2 (transposed so samples are rows)
    train_x = np.vstack((x, np.power(x, 2))).T

    model = Regressor(ref_functions=('linear_cov', 'quad'),
                      manual_best_neurons_selection=True,
                      min_best_neurons_count=30,
                      n_jobs='max')

    # train model
    model.fit(train_x, train_y)

    # predict with GMDH
    y_pred = model.predict(train_x)

    plt.plot(x, y, label="ground truth")
    plt.scatter(x, train_y, label="training points")
    plt.plot(x, y_pred, label="fit")
    plt.legend(loc='lower left')
    plt.show()
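# f is defined earlier in the original script and is not shown here; any smooth
# one-dimensional target works. A hypothetical stand-in for illustration only:
def f(x):
    # hypothetical target, not the author's actual choice
    return np.sin(x) * np.exp(-0.1 * x)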
# Same Boston setup as the first example; here the model parameters are
# collected in a dict and unpacked into the constructor.
params = {
    'admix_features': True,                  # default value
    'criterion_type': 'validate',            # default value
    'seq_type': 'mode1',                     # default value
    'max_layer_count': 100,                  # default value is sys.maxsize
    'criterion_minimum_width': 5,            # default value
    'stop_train_epsilon_condition': 0.0001,  # default value is 0.001
    'manual_best_neurons_selection': False,  # default value
    'ref_functions': 'linear_cov',           # default value
    'normalize': True,                       # default value
    'layer_err_criterion': 'top',            # default value
    'n_jobs': 1,                             # default value
    'feature_names': boston.feature_names,
    'l2_bis': (1e-5, 1e-4, 1e-3, 0.01, 0.1, 1.0, 10.0),
}
model = Regressor(**params)
'''
An alternative construction with explicit keyword arguments:
model = Regressor(ref_functions=('linear_cov',),
                  criterion_type='validate',
                  feature_names=boston.feature_names,
                  criterion_minimum_width=5,
                  stop_train_epsilon_condition=0.001,
                  layer_err_criterion='top',
                  l2=0.5,
                  n_jobs='max')
'''
model.fit(train_x, train_y)

# Now predict the value of the second half:
y_pred = model.predict(test_x)
mse = metrics.mean_squared_error(test_y, y_pred)
# Two-output example over a polynomial design matrix: split dataset and the
# two noisy responses y1 and y2 into train/test halves.
if train_data_is_the_first_half:
    train_x = dataset[:n]
    train_y1 = y1_noisy[:n]
    train_y2 = y2_noisy[:n]
    test_x = dataset[n:]
    test_y1 = y1_noisy[n:]
    test_y2 = y2_noisy[n:]
else:
    train_x = dataset[n:]
    train_y1 = y1_noisy[n:]
    train_y2 = y2_noisy[n:]
    test_x = dataset[:n]
    test_y1 = y1_noisy[:n]
    test_y2 = y2_noisy[:n]

feature_names = ['ones', 'x', 'x*x']

# Models
model_y1 = Regressor(
    ref_functions=('linear_cov',),
    normalize=True,
    criterion_minimum_width=5,
    stop_train_epsilon_condition=0.0001,
    layer_err_criterion='top',
    # l2=0.01,
    l2_bis=(0.0001, 0.001, 0.01, 0.1, 1.0, 10.0),
    feature_names=feature_names,
)
model_y1.fit(train_x, train_y1)

print()
print("model_y1 :")
print(model_y1.describe())

# Now predict the value of the second half:
y1_pred = model_y1.predict(test_x)

# Selected/unselected features:
print("Selected features: {}".format(model_y1.get_selected_features()))
print("Unselected features: {}".format(model_y1.get_unselected_features()))
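# The feature names above imply dataset is a design matrix whose columns are a
# constant, x, and x squared. A minimal sketch of how it might be built (the
# actual construction is not shown in this fragment; the sample points are
# hypothetical):
import numpy as np

x = np.linspace(-1.0, 1.0, 100)
dataset = np.column_stack((np.ones_like(x), x, x * x))  # columns: 'ones', 'x', 'x*x'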
# Simple time-series forecasting: sliding windows of length SIZE predict the
# next value. `data` is the series rescaled by its maximum (see the sketch
# below); `raw` and `test` come from earlier in the original script.
def slicing(raw, size):
    # yield (window, next value) pairs over the series
    i = 0
    while len(raw) > i + size:
        yield raw[i:i + size], raw[i + size]
        i += 1

train_x = []
train_y = []
SIZE = 5
TEST_OFFSET = -SIZE - 1
for x, y in slicing(data, SIZE):
    train_x.append(x)
    train_y.append((y,))

model = Regressor()
model.fit(train_x, train_y)

predicted = []
# one-step predictions from the last known windows
for x in range(4):
    predict_y = model.predict([data[TEST_OFFSET - x:TEST_OFFSET - x + SIZE]])
    predicted.append(predict_y[0])
# roll the forecast forward, feeding predictions back in as inputs
for x in range(1, 5):
    predicted.append(model.predict([(data + predicted)[-SIZE:]])[0])

# undo the max-rescaling of the series
predicted = [p * max(raw) for p in predicted]
print(predicted)
raw += test
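# `raw` (the original series) and `data` (the series scaled by its maximum) are
# assumed to be prepared before this snippet; a minimal sketch of that step,
# with hypothetical values standing in for the real series:
raw = [112.0, 118.0, 132.0, 129.0, 121.0, 135.0, 148.0, 148.0, 136.0, 119.0]
data = [value / max(raw) for value in raw]  # rescaling undone above via p * max(raw)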
# RS-HDMR built on GMDH feature selection. Assumes the GmdhPy Regressor import
# used in the examples above.
import math
from itertools import combinations

import matplotlib.pyplot
import numpy as np
import pandas as pd
import scipy.special as sp
from scipy.stats import linregress
from sklearn import metrics
from sklearn.linear_model import (ARDRegression, ElasticNetCV, LarsCV, LassoCV,
                                  LassoLarsCV, LinearRegression, RidgeCV)
from gmdhpy.gmdh import Regressor


class rshdmr():
    def __init__(self, data_file, poly_order=4, **kwargs):
        self._seq_type = 'mode1'
        self._poly_order = poly_order
        self._gmdh_ref_functions = 'linear_cov'
        self._admix_features = True
        self._alpha_ridge = 0.5
        self._alpha_lasso = 0.001
        self._epsilon = 0.001
        self._cutoff = 0.0001
        self._regression_type = 'lasso'
        self._criterion_type = 'validate'
        self._hdmr_order = 2
        self._index_cutoff = 0.01
        self._manual_best_neurons_selection = False
        self._min_best_neurons_count = 20
        self._n_jobs = 1
        # any of the defaults above can be overridden via keyword arguments
        for key, value in kwargs.items():
            setattr(self, "_" + key, value)
        self.read_data(data_file)

    def read_data(self, data_file):
        """
        Read in from either a dataframe or a csv file.
        """
        if isinstance(data_file, pd.DataFrame):
            print(' found a dataframe')
            df = data_file
        if isinstance(data_file, str):
            df = pd.read_csv(data_file)
        self.Y = df['Y']
        self.X = df.drop('Y', axis=1)
        # we can clean up the original dataframe
        del df

    def shift_legendre(self, n, x):
        # orthonormal shifted Legendre polynomial of degree n on [0, 1]
        return math.sqrt(2 * n + 1) * sp.eval_sh_legendre(n, x)

    def transform_data(self):
        # rescale every feature onto [0, 1] and remember the original ranges
        self.X_T = pd.DataFrame()
        self.ranges = {}
        feature_names = list(self.X.columns.values)
        print(feature_names)
        for column in feature_names:
            x_max = self.X[column].max()
            x_min = self.X[column].min()
            print(column + " : min " + str(x_min) + " max " + str(x_max))
            self.X_T[column] = (self.X[column] - x_min) / (x_max - x_min)
            self.ranges[column] = [x_min, x_max]

    def legendre_expand(self):
        # expand each rescaled feature in shifted Legendre polynomials
        self.primitive_variables = []
        self.poly_orders = []
        self.X_T_L = pd.DataFrame()
        for column in self.X_T:
            for n in range(1, self._poly_order + 1):
                self.primitive_variables.append(column)
                self.poly_orders.append(n)
                column_heading = column + "_" + str(n)
                self.X_T_L[column_heading] = [
                    self.shift_legendre(n, x) for x in self.X_T[column]
                ]
        self.exp_feature_names = list(self.X_T_L.columns.values)

    def gmdh_regression(self):
        self.gmdh_model = Regressor(
            ref_functions=self._gmdh_ref_functions,
            criterion_type=self._criterion_type,
            feature_names=self.exp_feature_names,
            criterion_minimum_width=5,
            stop_train_epsilon_condition=self._epsilon,
            layer_err_criterion='top',
            l2=0.5,
            seq_type=self._seq_type,
            max_layer_count=50,
            normalize=True,
            keep_partial_neurons=False,
            admix_features=self._admix_features,
            manual_best_neurons_selection=self._manual_best_neurons_selection,
            min_best_neurons_count=self._min_best_neurons_count,
            n_jobs=self._n_jobs)
        self.gmdh_model.fit(self.X_T_L, self.Y)

        selected_indices = self.gmdh_model.get_selected_features_indices()
        print("selected features ", len(selected_indices))
        print("=============================================")

        # build the candidate basis: products of up to _hdmr_order selected
        # expansion terms, skipping any product that reuses a primitive variable
        self.data = pd.DataFrame()
        self.selected_list = []
        self.primitive_list = []
        for order in range(1, self._hdmr_order + 1):
            for combo in combinations(selected_indices, order):
                header = ''
                series = []
                primitive_name = []
                derived_name = []
                for i in combo:
                    if header == '':
                        header = self.exp_feature_names[i]
                        series = self.X_T_L[self.exp_feature_names[i]]
                    else:
                        header = header + '*' + self.exp_feature_names[i]
                        series = series * self.X_T_L[self.exp_feature_names[i]]
                    primitive_name.append(self.primitive_variables[i])
                    derived_name.append(self.exp_feature_names[i])
                duplicates = pd.Series(primitive_name)[
                    pd.Series(primitive_name).duplicated()].values
                if len(duplicates) == 0:
                    self.data[header] = series
                    self.selected_list.append(derived_name)
                    self.primitive_list.append(primitive_name)

    def ridge_regression(self, **kwargs):
        # fit the chosen linear model over the candidate basis
        if self._regression_type == 'lasso':
            self.ridgereg = LassoCV(max_iter=50000)
            # self.ridgereg = LassoCV(max_iter=1e5, cv=10)
        elif self._regression_type == 'ard':
            self.ridgereg = ARDRegression()
        elif self._regression_type == 'elastic':
            self.ridgereg = ElasticNetCV(cv=10)
        elif self._regression_type == 'lars':
            self.ridgereg = LarsCV(cv=10)
        elif self._regression_type == 'lassolars':
            self.ridgereg = LassoLarsCV(cv=5)
        elif self._regression_type == 'ordinary':
            self.ridgereg = LinearRegression()
        elif self._regression_type == 'ridge':
            self.ridgereg = RidgeCV()
        self.ridgereg.fit(self.data, self.Y)

    def eval_sobol_indices(self):
        total_variance = np.var(self.Y)
        self.sobol_indexes = pd.DataFrame(columns=['index', 'value'])
        total_coeff_squared = 0
        for i in range(len(self.primitive_list)):
            total_coeff_squared += self.ridgereg.coef_[i] * self.ridgereg.coef_[i]
        print('total coeff squared : ', total_coeff_squared)
        print('total variance : ', total_variance)

        # collect the unique sets of primitive variables
        unique_sets = []
        for names in self.primitive_list:
            if sorted(names) not in unique_sets:
                unique_sets.append(sorted(names))

        # sum squared coefficients over all terms sharing a primitive set
        for unique in unique_sets:
            key = ','.join(unique)
            coeff_squared = 0
            for i in range(len(self.primitive_list)):
                if sorted(self.primitive_list[i]) == sorted(unique):
                    coeff_squared += self.ridgereg.coef_[i] * self.ridgereg.coef_[i]
            # index = coeff_squared / total_coeff_squared
            index = coeff_squared / total_variance
            self.sobol_indexes.loc[len(self.sobol_indexes)] = [key, index]

    def predict(self, X):
        primitives = list(self.X.columns.values)
        # rescale the input point and evaluate its Legendre expansion
        X_expanded = {}
        for i in range(len(X)):
            x_min = self.ranges[primitives[i]][0]
            x_max = self.ranges[primitives[i]][1]
            X_T = (X[i] - x_min) / (x_max - x_min)
            for j in range(1, self._poly_order + 1):
                label = primitives[i] + '_' + str(j)
                X_expanded[label] = self.shift_legendre(j, X_T)

        # evaluate the fitted expansion
        total = self.ridgereg.intercept_
        for i in range(len(self.ridgereg.coef_)):
            coeff = self.ridgereg.coef_[i]
            product = 1
            for term in self.selected_list[i]:
                product *= X_expanded[term]
            total += coeff * product
        return total

    def evaluate_func(self, X):
        # note: relies on self.ridge_coeffs and self.selected_features_dict,
        # which are not set anywhere in this class
        total = self.ridgereg.intercept_
        primitives = list(self.X.columns.values)
        X_expanded = {}
        for i in range(len(X)):
            x_min = self.ranges[primitives[i]][0]
            x_max = self.ranges[primitives[i]][1]
            X_T = (X[i] - x_min) / (x_max - x_min)
            for j in range(1, self._poly_order + 1):
                label = primitives[i] + '_' + str(j)
                X_expanded[label] = [self.shift_legendre(j, X_T)]
        for key in self.ridge_coeffs:
            gmdh_coeff = self.selected_features_dict[key]
            ridge_coeff = self.ridge_coeffs[key][1]
            if len(gmdh_coeff) == 3:
                variable_term = X_expanded[gmdh_coeff[0]][0]
            else:
                variable_term = (X_expanded[gmdh_coeff[0]][0] *
                                 X_expanded[gmdh_coeff[1]][0])
            total += variable_term * ridge_coeff
        return total

    def plot_hdmr(self):
        y_pred = self.ridgereg.predict(self.data)
        matplotlib.pyplot.scatter(self.Y, y_pred)
        matplotlib.pyplot.ylabel('Predicted')
        matplotlib.pyplot.xlabel('Experimental')
        matplotlib.pyplot.show()

    def stats(self):
        y_pred = self.ridgereg.predict(self.data)
        # sklearn metrics take (y_true, y_pred)
        mse = metrics.mean_squared_error(self.Y, y_pred)
        mae = metrics.mean_absolute_error(self.Y, y_pred)
        evs = metrics.explained_variance_score(self.Y, y_pred)
        slope, intercept, r_value, p_value, std_err = linregress(self.Y, y_pred)
        print("mae error on test set : {mae:0.3f}".format(mae=mae))
        print("mse error on test set : {mse:0.3f}".format(mse=mse))
        print("explained variance score: {evs:0.3f}".format(evs=evs))
        print("===============================")
        print("slope     : ", slope)
        print("r value   : ", r_value)
        print("r^2       : ", r_value * r_value)
        print("p value   : ", p_value)
        print("std error : ", std_err)

    def print_sobol_indices(self):
        self.eval_sobol_indices()
        for i, row in self.sobol_indexes.iterrows():
            if row['value'] > self._index_cutoff:
                print(row['index'], ' : ', row['value'])

    def auto(self):
        # full pipeline: rescale, expand, GMDH selection, sparse regression,
        # then report Sobol indices and a parity plot
        self.transform_data()
        self.legendre_expand()
        print('====================================')
        self.gmdh_regression()
        print('====================================')
        self.ridge_regression()
        self.print_sobol_indices()
        self.plot_hdmr()
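# A minimal usage sketch for the class above. 'samples.csv' is a hypothetical
# file with one column per input variable plus a 'Y' response column:
hdmr = rshdmr('samples.csv', poly_order=4, hdmr_order=2, regression_type='lasso')
hdmr.auto()                             # transform, expand, GMDH select, fit, report
hdmr.stats()                            # goodness-of-fit summary on the training data
y_new = hdmr.predict([0.2, 0.7, 1.5])   # one value per input column, in column order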