Example #1
    def gmdh_regression(self):
        # Fit a multilayer GMDH network on the Legendre-expanded features and use
        # it as a feature selector for the HDMR expansion built below.
        self.gmdh_model = Regressor(
            ref_functions=(self._gmdh_ref_functions),
            criterion_type=self._criterion_type,
            feature_names=self.exp_feature_names,
            criterion_minimum_width=5,
            stop_train_epsilon_condition=self._epsilon,
            layer_err_criterion='top',
            l2=0.5,
            seq_type=self._seq_type,
            max_layer_count=50,
            normalize=True,
            keep_partial_neurons=False,
            admix_features=self._admix_features,
            manual_best_neurons_selection=self._manual_best_neurons_selection,
            min_best_neurons_count=self._min_best_neurons_count,
            n_jobs=self._n_jobs)
        self.gmdh_model.fit(self.X_T_L, self.Y)

        selected_features = len(
            self.gmdh_model.get_selected_features_indices())
        print("selected features ", selected_features)
        print("=============================================")
        self.data = pd.DataFrame()
        selected_indices = self.gmdh_model.get_selected_features_indices()
        feature_count = len(self.exp_feature_names)
        self.selected_list = []
        self.primitive_list = []
        # Build candidate HDMR component functions: products of the GMDH-selected
        # expanded features, up to the requested HDMR order.
        for order in range(1, self._hdmr_order + 1):
            for combo in combinations(selected_indices, order):
                header = ''
                series = []
                primitive_name = []
                derived_name = []
                for i in combo:
                    if header == '':
                        header = self.exp_feature_names[i]
                        series = self.X_T_L[self.exp_feature_names[i]]
                        primitive_name.append(self.primitive_variables[i])
                        derived_name.append(self.exp_feature_names[i])
                    else:
                        header = header + '*' + self.exp_feature_names[i]
                        feature_name = self.exp_feature_names[i]
                        series = series * self.X_T_L[self.exp_feature_names[i]]
                        primitive_name.append(self.primitive_variables[i])
                        derived_name.append(self.exp_feature_names[i])
                # Skip products that reuse the same primitive variable
                # (e.g. x1_1 * x1_2); keep only genuine cross-terms.
                if pd.Series(primitive_name).duplicated().any():
                    continue
                self.data[header] = series
                self.selected_list.append(derived_name)
                self.primitive_list.append(primitive_name)
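
For illustration (the feature names below are made up, not taken from the example), the loop above keeps only those products of selected expansion terms whose underlying primitive variables are all different:

    from itertools import combinations

    selected = ['x1_1', 'x1_2', 'x2_1']                 # hypothetical expanded-feature names
    primitive_of = {'x1_1': 'x1', 'x1_2': 'x1', 'x2_1': 'x2'}

    for combo in combinations(selected, 2):
        primitives = [primitive_of[name] for name in combo]
        if len(set(primitives)) < len(primitives):
            continue                                     # x1_1*x1_2 reuses x1 -> skipped
        print('*'.join(combo))                           # kept: x1_1*x2_1, x1_2*x2_1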
Example #2
    n = n_samples // 2
    if train_data_is_the_first_half:
        train_x = boston.data[:n]
        train_y = boston.target[:n]
        test_x = boston.data[n:]
        test_y = boston.target[n:]
    else:
        train_x = boston.data[n:]
        train_y = boston.target[n:]
        test_x = boston.data[:n]
        test_y = boston.target[:n]

    model = Regressor(ref_functions=('linear_cov', ),
                      criterion_type='validate',
                      feature_names=boston.feature_names,
                      criterion_minimum_width=5,
                      stop_train_epsilon_condition=0.001,
                      layer_err_criterion='top',
                      l2=0.5,
                      n_jobs='max')
    model.fit(train_x, train_y)

    # Now predict the value of the second half:
    y_pred = model.predict(test_x)
    mse = metrics.mean_squared_error(test_y, y_pred)
    mae = metrics.mean_absolute_error(test_y, y_pred)

    print("mse error on test set: {mse:0.2f}".format(mse=mse))
    print("mae error on test set: {mae:0.2f}".format(mae=mae))

    y_pred = model.predict(test_x)
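
The network structure and the Boston features it actually kept can be inspected with the same helpers used in the later examples; a short sketch, assuming the model above has been fitted:

    print(model.describe())
    print("Selected features: {}".format(model.get_selected_features()))
    print("Unselected features: {}".format(model.get_unselected_features()))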
Example #3
    train_data_is_the_first_half = False
    n = n_samples // 2
    if train_data_is_the_first_half:
        train_x = data[:n]
        train_y = target[:n]
        test_x = data[n:]
        test_y = target[n:]
    else:
        train_x = data[n:]
        train_y = target[n:]
        test_x = data[:n]
        test_y = target[:n]

    model = Regressor(ref_functions='linear_cov',
                      feature_names=iris.feature_names,
                      criterion_minimum_width=5,
                      stop_train_epsilon_condition=0.0001,
                      l2=0.5,
                      n_jobs=4)
    model.fit(train_x, train_y)

    # Predict the second half with the GMDH model, then map the continuous
    # outputs back to iris class labels via viris_class.
    pred_y_row = model.predict(test_x)
    pred_y = viris_class(pred_y_row)

    print(model.get_selected_features_indices())
    print(model.get_unselected_features_indices())

    print("Selected features: {}".format(model.get_selected_features()))
Example #4
import numpy as np
import matplotlib.pyplot as plt
from gmdhpy.gmdh import Regressor  # import path of the gmdhpy GMDH regressor

if __name__ == '__main__':

    # generate points
    x = np.linspace(-2, 10, 200)
    n_samples = x.shape[0]

    # add random noise
    eps = 1.5
    eps_data = np.random.uniform(-eps, eps, (n_samples, ))
    y = f(x)  # f is defined earlier in the original script (not shown in this excerpt)
    train_y = y + eps_data

    # stack as (n_samples, n_features): the two columns are x and x^2
    train_x = np.vstack((x, np.power(x, 2))).T
    model = Regressor(ref_functions=('linear_cov', 'quad'),
                      manual_best_neurons_selection=True,
                      min_best_neurons_count=30,
                      n_jobs='max')

    # train model
    model.fit(train_x, train_y)

    # predict with GMDH
    y_pred = model.predict(train_x)

    plt.plot(x, y, label="ground truth")
    plt.scatter(x, train_y, label="training points")
    plt.plot(x, y_pred, label="fit")
    plt.legend(loc='lower left')

    plt.show()
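
f is defined earlier in the original script and is not part of this excerpt; any smooth one-dimensional target will do for trying the snippet, for example the hypothetical stand-in below:

    def f(x):
        # Hypothetical target function, for illustration only.
        return 0.5 * x ** 2 - 2.0 * x + 3.0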
Example #5
    params = {
        'admix_features': True,  # default value
        'criterion_type': 'validate',  # default value
        'seq_type': 'mode1',  # default value
        'max_layer_count': 100,  # default value is sys.maxsize
        'criterion_minimum_width': 5,  # default value
        'stop_train_epsilon_condition': 0.0001,  # default value is 0.001
        'manual_best_neurons_selection': False,  # default value
        'ref_functions': 'linear_cov',  # default value
        'normalize': True,  # default value
        'layer_err_criterion': 'top',  # default value
        'n_jobs': 1,  # default value
        'feature_names': boston.feature_names,
        'l2_bis': (1e-5, 1e-4, 1e-3, 0.01, 0.1, 1.0, 10.0)
    }

    model = Regressor(**params)
    '''
    model = Regressor(ref_functions=('linear_cov',),
                      criterion_type='validate',
                      feature_names=boston.feature_names,
                      criterion_minimum_width=5,
                      stop_train_epsilon_condition=0.001,
                      layer_err_criterion='top',
                      l2=0.5,
                      n_jobs='max')
    '''
    model.fit(train_x, train_y)

    # Now predict the value of the second half:
    y_pred = model.predict(test_x)
    mse = metrics.mean_squared_error(test_y, y_pred)
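
The excerpt stops right after the MSE; the remaining reporting can mirror Example #2 (sketch):

    mae = metrics.mean_absolute_error(test_y, y_pred)
    print("mse error on test set: {mse:0.2f}".format(mse=mse))
    print("mae error on test set: {mae:0.2f}".format(mae=mae))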
Example #6
        test_y2 = y2_noisy[n:]
    else:
        train_x = dataset[n:]
        train_y1 = y1_noisy[n:]
        train_y2 = y2_noisy[n:]
        test_x = dataset[:n]
        test_y1 = y1_noisy[:n]
        test_y2 = y2_noisy[:n]
    feature_names = ['ones', 'x', 'x*x']

    # Models

    model_y1 = Regressor(ref_functions=('linear_cov',),
                         normalize=True,
                         criterion_minimum_width=5,
                         stop_train_epsilon_condition=0.0001,
                         layer_err_criterion='top',
                         # l2=0.01,
                         l2_bis=(0.0001, 0.001, 0.01, 0.1, 1.0, 10.0),
                         feature_names=feature_names)
    model_y1.fit(train_x, train_y1)
    print()
    print("model_y1 :")
    print(model_y1.describe())

    # Now predict the value of the second half:
    y1_pred = model_y1.predict(test_x)

    # Selected/unselected features:
    print("Selected features: {}".format(model_y1.get_selected_features()))
    print("Unselected features: {}".format(model_y1.get_unselected_features()))
Example #7
File: my_gmdh.py  Project: LifeMoroz/Future
from gmdhpy.gmdh import Regressor  # import path of the gmdhpy GMDH regressor


def slicing(raw, size):
    # Yield (window, next_value) pairs: each window of `size` consecutive points
    # is paired with the point that immediately follows it.
    i = 0
    while len(raw) > i + size:
        yield raw[i:i + size], raw[i + size]
        i += 1


train_x = []
train_y = []
SIZE = 5
TEST_OFFSET = -SIZE - 1
for x, y in slicing(data, SIZE):
    train_x.append(x)
    train_y.append((y,))

model = Regressor()
model.fit(train_x, train_y)


predicted = []
for x in range(4):
    predict_y = model.predict([data[TEST_OFFSET - x:TEST_OFFSET - x + SIZE]])
    predicted.append(predict_y[0])

for x in range(1, 5):
    predicted.append(model.predict([(data + predicted)[-SIZE:]])[0])

predicted = [p * max(raw) for p in predicted]

print(predicted)
raw += test  # 'raw' and 'test' are defined earlier in the original script (not shown here)
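
The excerpt never shows how data is derived from raw; the rescaling p * max(raw) applied to the predictions suggests a simple max-normalisation, sketched here as an assumption:

    # Hypothetical preprocessing implied by the rescaling above:
    # scale the raw series into [0, 1] before building the sliding windows.
    data = [value / max(raw) for value in raw]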
Example #8
import math
from itertools import combinations

import matplotlib.pyplot
import numpy as np
import pandas as pd
import scipy.special as sp
from gmdhpy.gmdh import Regressor  # import path of the gmdhpy GMDH regressor
from scipy.stats import linregress
from sklearn import metrics
from sklearn.linear_model import (ARDRegression, ElasticNetCV, LarsCV, LassoCV,
                                  LassoLarsCV, LinearRegression, RidgeCV)


class rshdmr():
    """RS-HDMR metamodel: scale the inputs to [0, 1], expand them in shifted
    Legendre polynomials, let GMDH select the relevant expansion terms, fit a
    sparse linear model on those terms, and derive Sobol sensitivity indices
    from its coefficients."""

    def __init__(self, data_file, poly_order=4, **kwargs):
        self._seq_type = 'mode1'
        self._poly_order = poly_order
        self._gmdh_ref_functions = 'linear_cov'
        self._admix_features = True
        self._alpha_ridge = 0.5
        self._alpha_lasso = 0.001
        self._epsilon = 0.001
        self._cutoff = 0.0001
        self._regression_type = 'lasso'
        self._criterion_type = 'validate'
        self._hdmr_order = 2
        self._index_cutoff = 0.01
        self._manual_best_neurons_selection = False
        self._min_best_neurons_count = 20
        self._n_jobs = 1
        for key, value in kwargs.items():
            setattr(self, "_" + key, value)
        self.read_data(data_file)

    def read_data(self, data_file):
        """
        Read in from either dataframe or csv file
        """
        if isinstance(data_file, pd.DataFrame):
            print(' found a dataframe')
            df = data_file
        elif isinstance(data_file, str):
            df = pd.read_csv(data_file)
        else:
            raise TypeError('data_file must be a pandas DataFrame or a path to a csv file')
        self.Y = df['Y']
        self.X = df.drop('Y', axis=1)
        # we can clean up the original dataframe
        del df

    def shift_legendre(self, n, x):
        # Orthonormal shifted Legendre polynomial on [0, 1]:
        # sqrt(2n + 1) * P*_n(x), so each basis term has unit norm.
        funct = math.sqrt(2 * n + 1) * sp.eval_sh_legendre(n, x)
        return funct

    def transform_data(self):
        # Scale every input feature to [0, 1] and remember the original ranges
        # so that predict()/evaluate_func() can transform new points the same way.
        self.X_T = pd.DataFrame()
        self.ranges = {}
        feature_names = list(self.X.columns.values)
        print(feature_names)
        for column in feature_names:
            max = self.X[column].max()
            min = self.X[column].min()
            print(column + " : min " + str(min) + " max " + str(max))
            self.X_T[column] = (self.X[column] - min) / (max - min)
            self.ranges[column] = [min, max]

    def legendre_expand(self):
        # Expand every scaled feature into poly_order shifted-Legendre columns
        # (e.g. 'x1' becomes 'x1_1', 'x1_2', ...), recording which primitive
        # variable and polynomial order each new column came from.
        self.primitive_variables = []
        self.poly_orders = []
        self.X_T_L = pd.DataFrame()
        for column in self.X_T:
            for n in range(1, self._poly_order + 1):
                self.primitive_variables.append(column)
                self.poly_orders.append(n)
                column_heading = column + "_" + str(n)
                self.X_T_L[column_heading] = [
                    self.shift_legendre(n, x) for x in self.X_T[column]
                ]
        self.exp_feature_names = list(self.X_T_L.columns.values)

    def gmdh_regression(self):
        # Fit a multilayer GMDH network on the Legendre-expanded features and use
        # it as a feature selector for the HDMR expansion built below.
        self.gmdh_model = Regressor(
            ref_functions=(self._gmdh_ref_functions),
            criterion_type=self._criterion_type,
            feature_names=self.exp_feature_names,
            criterion_minimum_width=5,
            stop_train_epsilon_condition=self._epsilon,
            layer_err_criterion='top',
            l2=0.5,
            seq_type=self._seq_type,
            max_layer_count=50,
            normalize=True,
            keep_partial_neurons=False,
            admix_features=self._admix_features,
            manual_best_neurons_selection=self._manual_best_neurons_selection,
            min_best_neurons_count=self._min_best_neurons_count,
            n_jobs=self._n_jobs)
        self.gmdh_model.fit(self.X_T_L, self.Y)

        selected_features = len(
            self.gmdh_model.get_selected_features_indices())
        print("selected features ", selected_features)
        print("=============================================")
        self.data = pd.DataFrame()
        selected_indices = self.gmdh_model.get_selected_features_indices()
        feature_count = len(self.exp_feature_names)
        self.selected_list = []
        self.primitive_list = []
        # Build candidate HDMR component functions: products of the GMDH-selected
        # expanded features, up to the requested HDMR order.
        for order in range(1, self._hdmr_order + 1):
            for combo in combinations(selected_indices, order):
                header = ''
                series = []
                primitive_name = []
                derived_name = []
                for i in combo:
                    if header == '':
                        header = self.exp_feature_names[i]
                        series = self.X_T_L[self.exp_feature_names[i]]
                        primitive_name.append(self.primitive_variables[i])
                        derived_name.append(self.exp_feature_names[i])
                    else:
                        header = header + '*' + self.exp_feature_names[i]
                        feature_name = self.exp_feature_names[i]
                        series = series * self.X_T_L[self.exp_feature_names[i]]
                        primitive_name.append(self.primitive_variables[i])
                        derived_name.append(self.exp_feature_names[i])
                # Skip products that reuse the same primitive variable
                # (e.g. x1_1 * x1_2); keep only genuine cross-terms.
                if pd.Series(primitive_name).duplicated().any():
                    continue
                self.data[header] = series
                self.selected_list.append(derived_name)
                self.primitive_list.append(primitive_name)

    def ridge_regression(self, **kwargs):
        # Fit the chosen regularised linear model on the GMDH-selected HDMR terms
        # (despite the method name, the default self._regression_type is 'lasso').
        if self._regression_type == 'lasso':
            self.ridgereg = LassoCV(max_iter=50000)
            #self.ridgereg = LassoCV(max_iter=1e5, cv=10)
            self.ridgereg.fit(self.data, self.Y)
        elif self._regression_type == 'ard':
            self.ridgereg = ARDRegression()
            self.ridgereg.fit(self.data, self.Y)
        elif self._regression_type == 'elastic':
            self.ridgereg = ElasticNetCV(cv=10)
            self.ridgereg.fit(self.data, self.Y)
        elif self._regression_type == 'lars':
            self.ridgereg = LarsCV(cv=10)
            self.ridgereg.fit(self.data, self.Y)
        elif self._regression_type == 'lassolars':
            self.ridgereg = LassoLarsCV(cv=5)
            self.ridgereg.fit(self.data, self.Y)
        elif self._regression_type == 'ordinary':
            self.ridgereg = LinearRegression()
            self.ridgereg.fit(self.data, self.Y)
        elif self._regression_type == 'ridge':
            self.ridgereg = RidgeCV()
            self.ridgereg.fit(self.data, self.Y)

    def eval_sobol_indices(self):
        # Sobol index for each group of primitive variables: the sum of squared
        # regression coefficients of that group's HDMR terms divided by Var(Y).
        total_variance = np.var(self.Y)
        self.sobol_indexes = pd.DataFrame(columns=['index', 'value'])
        total_coeff_squared = 0
        for i in range(0, len(self.primitive_list)):
            total_coeff_squared += self.ridgereg.coef_[
                i] * self.ridgereg.coef_[i]
        print('total coeff squared : ', total_coeff_squared)
        print('total variance : ', total_variance)

        a = self.primitive_list
        b = []
        for i in a:
            if sorted(i) not in b:
                b.append(sorted(i))

        for unique in b:
            key = ''
            for variable_name in unique:
                key += ',' + variable_name
            key = key[1:]

            coeff_squared = 0
            for i in range(0, len(self.primitive_list)):
                if sorted(self.primitive_list[i]) == sorted(unique):
                    coeff_squared += self.ridgereg.coef_[
                        i] * self.ridgereg.coef_[i]
            # index = coeff_squared / total_coeff_squared
            index = coeff_squared / total_variance
            self.sobol_indexes.loc[len(self.sobol_indexes)] = [key, index]

    def predict(self, X):
        # Evaluate the fitted HDMR surrogate at one input point X (one value per
        # primitive variable): rescale each input with the stored training range,
        # expand it in the shifted-Legendre basis, then sum coefficient * product
        # of terms over the selected component functions.
        sum = self.ridgereg.intercept_
        primitives = list(self.X.columns.values)
        X_expanded = {}
        for i in range(0, len(X)):
            # Transform input
            min = self.ranges[primitives[i]][0]
            max = self.ranges[primitives[i]][1]
            X_T = (X[i] - min) / (max - min)
            for j in range(1, self._poly_order + 1):
                label = primitives[i] + '_' + str(j)
                legendre = self.shift_legendre(j, X_T)
                X_expanded[label] = legendre

        for i in range(0, len(self.ridgereg.coef_)):
            coeff = self.ridgereg.coef_[i]
            product = 1
            terms = self.selected_list[i]
            for term in terms:
                product *= X_expanded[term]
            sum += coeff * product
        return sum

    def evaluate_func(self, X):
        # Note: this relies on self.ridge_coeffs and self.selected_features_dict,
        # which are not set anywhere in this listing; predict() above is the
        # self-contained evaluation path.
        sum = self.ridgereg.intercept_
        primitives = list(self.X.columns.values)
        X_expanded = {}
        for i in range(0, len(X)):
            # Transform input
            min = self.ranges[primitives[i]][0]
            max = self.ranges[primitives[i]][1]
            X_T = (X[i] - min) / (max - min)
            for j in range(1, self._poly_order + 1):
                label = primitives[i] + '_' + str(j)
                legendre = self.shift_legendre(j, X_T)
                X_expanded[label] = [legendre]

        for key in self.ridge_coeffs:
            gmdh_coeff = self.selected_features_dict[key]
            ridge_coeff = self.ridge_coeffs[key][1]
            if len(gmdh_coeff) == 3:
                variable_term = X_expanded[gmdh_coeff[0]][0]
                sum += variable_term * ridge_coeff
            else:
                variable_term = X_expanded[gmdh_coeff[0]][0] * X_expanded[
                    gmdh_coeff[1]][0]
                sum += variable_term * ridge_coeff
        return sum

    def plot_hdmr(self):
        y_pred = self.ridgereg.predict(self.data)
        matplotlib.pyplot.scatter(self.Y, y_pred)
        matplotlib.pyplot.ylabel('Predicted')
        matplotlib.pyplot.xlabel('Experimental')
        matplotlib.pyplot.show()

    def stats(self):
        y_pred = self.ridgereg.predict(self.data)
        # sklearn metrics expect (y_true, y_pred); explained_variance_score in
        # particular is not symmetric in its arguments.
        mse = metrics.mean_squared_error(self.Y, y_pred)
        mae = metrics.mean_absolute_error(self.Y, y_pred)
        evs = metrics.explained_variance_score(self.Y, y_pred)
        slope, intercept, r_value, p_value, std_err = linregress(
            self.Y, y_pred)
        print("mae error on test set   : {mae:0.3f}".format(mae=mae))
        print("mse error on test set   : {mse:0.3f}".format(mse=mse))
        print("explained variance score: {evs:0.3f}".format(evs=evs))
        print("===============================")
        print("slope     : ", slope)
        print("r value   : ", r_value)
        print("r^2       : ", r_value * r_value)
        print("p value   : ", p_value)
        print("std error : ", std_err)

    def print_sobol_indices(self):
        self.eval_sobol_indices()
        for i, row in self.sobol_indexes.iterrows():
            if row['value'] > self._index_cutoff:
                print(row['index'], ' : ', row['value'])

    def auto(self):
        self.transform_data()
        self.legendre_expand()
        print('====================================')
        self.gmdh_regression()
        print('====================================')
        self.ridge_regression()
        self.print_sobol_indices()
        self.plot_hdmr()
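
A minimal usage sketch for the class above; the CSV file name, its column layout (input columns plus a 'Y' response column), and the sample point are assumptions:

    # Hypothetical driver: 'experiment.csv' must contain the input columns plus a 'Y' column.
    model = rshdmr('experiment.csv', poly_order=4, hdmr_order=2, regression_type='lasso')
    model.auto()                            # transform, expand, GMDH selection, sparse fit, Sobol indices, plot
    model.stats()                           # in-sample error metrics and regression diagnostics
    print(model.predict([0.3, 1.2, 5.0]))   # evaluate at one point (one value per input column; values hypothetical)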