Example #1
def test_kneighbors_regressor():
    # Test chaining KNeighborsTransformer and classifiers/regressors
    rng = np.random.RandomState(0)
    X = 2 * rng.rand(40, 5) - 1
    X2 = 2 * rng.rand(40, 5) - 1
    y = rng.rand(40, 1)

    n_neighbors = 12
    radius = 1.5
    # We precompute more neighbors than necessary so that a k-neighbors
    # estimator chained after a radius-neighbors transformer (and vice versa)
    # remains equivalent to the corresponding compact estimator.
    factor = 2

    k_trans = KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance")
    k_trans_factor = KNeighborsTransformer(n_neighbors=int(n_neighbors *
                                                           factor),
                                           mode="distance")

    r_trans = RadiusNeighborsTransformer(radius=radius, mode="distance")
    r_trans_factor = RadiusNeighborsTransformer(radius=int(radius * factor),
                                                mode="distance")

    k_reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    r_reg = RadiusNeighborsRegressor(radius=radius)

    test_list = [
        (k_trans, k_reg),
        (k_trans_factor, r_reg),
        (r_trans, r_reg),
        (r_trans_factor, k_reg),
    ]

    for trans, reg in test_list:
        # compare the chained version and the compact version
        reg_compact = clone(reg)
        reg_precomp = clone(reg)
        reg_precomp.set_params(metric="precomputed")

        reg_chain = make_pipeline(clone(trans), reg_precomp)

        y_pred_chain = reg_chain.fit(X, y).predict(X2)
        y_pred_compact = reg_compact.fit(X, y).predict(X2)
        assert_array_almost_equal(y_pred_chain, y_pred_compact)
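
A minimal sketch (not part of the original test) of the pattern verified above, assuming scikit-learn >= 0.22, where the neighbor transformers were introduced: a KNeighborsTransformer in "distance" mode feeding a regressor with metric="precomputed" should match the plain estimator.

import numpy as np
from sklearn.base import clone
from sklearn.neighbors import KNeighborsTransformer, KNeighborsRegressor
from sklearn.pipeline import make_pipeline

rng = np.random.RandomState(0)
X, X2, y = rng.rand(40, 5), rng.rand(10, 5), rng.rand(40)

reg = KNeighborsRegressor(n_neighbors=5)
chain = make_pipeline(
    KNeighborsTransformer(n_neighbors=5, mode="distance"),
    clone(reg).set_params(metric="precomputed"),
)
# Both paths should produce the same predictions on unseen points.
np.testing.assert_allclose(chain.fit(X, y).predict(X2),
                           reg.fit(X, y).predict(X2))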
Example #2
    def test_onnxruntime_knn_radius(self):
        def _get_reg_data(n, n_features, n_targets, n_informative=10):
            X, y = make_regression(  # pylint: disable=W0632
                n, n_features=n_features, random_state=0,
                n_targets=n_targets, n_informative=n_informative)
            return X, y

        def _fit_model(model, n_targets=1, label_int=False,
                       n_informative=10):
            X, y = _get_reg_data(20, 4, n_targets, n_informative)
            if label_int:
                y = y.astype(numpy.int64)
            model.fit(X, y)
            return model, X

        model, X = _fit_model(RadiusNeighborsRegressor())
        model_onnx = to_onnx(
            model, X[:1].astype(numpy.float32),
            target_opset=TARGET_OPSET,
            options={id(model): {'optim': 'cdist'}})
        oinf = OnnxInference(model_onnx, runtime='onnxruntime1')
        X = X[:7]
        got = oinf.run({'X': X.astype(numpy.float32)})['variable']
        exp = model.predict(X.astype(numpy.float32))
        if any(numpy.isnan(got.ravel())):
            # The model is unexpectedly producing nan values
            # sometimes.
            res = oinf.run({'X': X.astype(numpy.float32)}, intermediate=True)
            rows = ['--EXP--', str(exp), '--GOT--', str(got),
                    '--EVERY-OUTPUT--']
            for k, v in res.items():
                rows.append('-%s-' % k)
                rows.append(str(v))
            if any(map(numpy.isnan, res["variable"].ravel())):
                # raise AssertionError('\n'.join(rows))
                warnings.warn("Unexpected NaN values\n" + '\n'.join(rows))
                return
            # onnxruntime and mlprodict do not return the same
            # output
            warnings.warn('\n'.join(rows))
            return
        self.assertEqualArray(exp, got, decimal=4)
Example #3
def compare_multiple_stacks(folder):
    subfolders = os.listdir(folder)

    all_data = []

    for subfolder in tqdm.tqdm(subfolders):
        all_data.append(load_images(os.path.join(folder, subfolder)))

    all_data = np.array(all_data)
    print(all_data.shape)

    for channel in range(3):
        for subfolder_index in range(all_data.shape[0]):

            channel_stack = all_data[subfolder_index][:, :, :, channel]

            img_mean = np.mean(channel_stack, axis=0)
            img_sigma_clip = np.mean(astropy.stats.sigma_clip(channel_stack,
                                                              sigma=2,
                                                              axis=0),
                                     axis=0)

            img_sigma_ratio = (img_mean / img_sigma_clip - 1) * 1E3
            skip = 1
            flat_ratios = img_sigma_ratio.flatten()[::skip]
            mean_values = img_mean.flatten()[::skip]

            # plt.scatter(mean_values, flat_ratios, alpha=0.1, color='black', s=1)

            rnr = RadiusNeighborsRegressor(radius=50, weights='uniform')
            rnr.fit(np.expand_dims(mean_values, axis=1), flat_ratios.flatten())

            x = np.arange(
                np.min(mean_values) + 200,
                np.max(mean_values) + 1 - 200, 10)
            line_y = rnr.predict(np.expand_dims(x, axis=1))
            plt.plot(x, line_y, label=str(subfolder_index))

        plt.legend()
        plt.grid(True)
        plt.show()
Example #4
 def test_model_knn_regressor_radius(self):
     model, X = self._fit_model(RadiusNeighborsRegressor())
     model_onnx = convert_sklearn(model,
                                  "KNN regressor",
                                  [("input", FloatTensorType([None, 4]))],
                                  target_opset=TARGET_OPSET,
                                  options={id(model): {
                                               'optim': 'cdist'
                                           }})
     sess = InferenceSession(model_onnx.SerializeToString())
     got = sess.run(None, {'input': X.astype(numpy.float32)})[0]
     exp = model.predict(X.astype(numpy.float32))
     if any(numpy.isnan(got.ravel())):
         # The model unexpectedly produces NaN values
         # on some platforms.
         rows = [
             '--EXP--',
             str(exp), '--GOT--',
             str(got), '--EVERY-OUTPUT--'
         ]
         for out in enumerate_model_node_outputs(model_onnx,
                                                 add_node=False):
             onx = select_model_inputs_outputs(model_onnx, out)
             sess = InferenceSession(onx.SerializeToString())
             res = sess.run(None, {'input': X.astype(numpy.float32)})
             rows.append('--{}--'.format(out))
             rows.append(str(res))
         if (StrictVersion(onnxruntime.__version__) <
                 StrictVersion("1.4.0")):
             return
         raise AssertionError('\n'.join(rows))
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(X.astype(numpy.float32)[:7],
                         model,
                         model_onnx,
                         basename="SklearnRadiusNeighborsRegressor")
     dump_data_and_model((X + 0.1).astype(numpy.float32)[:7],
                         model,
                         model_onnx,
                         basename="SklearnRadiusNeighborsRegressor")
Example #5
def make_atmospheric_pressure_model(df):
    ds = load_ds(df, "pres")
    X_train, X_test, y_train, _ = ds

    # Build & fit the model
    model = make_pipeline(
        PCA(whiten=True),
        StandardScaler(),
        RadiusNeighborsRegressor(radius=0.014),
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    plot_regression(
        "atmospheric_pressure",
        "Atmospheric pressure (Pa)",
        dataset=ds,
        y_pred=y_pred,
    )

    return model, ds, y_pred
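
Note the pipeline above whitens with PCA before standardizing; the more common ordering scales first so that no raw feature dominates the principal components. A hedged sketch of that alternative (whether it helps depends on the data, and the radius would likely need re-tuning):

from sklearn.decomposition import PCA
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Hypothetical variant: standardize, then whiten with PCA.
alt_model = make_pipeline(
    StandardScaler(),
    PCA(whiten=True),
    RadiusNeighborsRegressor(radius=0.014),  # radius would need re-tuning
)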
Example #6
def compare_error_vs_brightness(folder):
    data = load_images(folder)

    for channel in range(data.shape[3]):
        channel_stack = data[:, :, :, channel]

        img_mean = np.mean(channel_stack, axis=0)
        img_sigma_clip = np.mean(astropy.stats.sigma_clip(channel_stack,
                                                          sigma=2,
                                                          axis=0),
                                 axis=0)

        img_sigma_ratio = (img_mean / img_sigma_clip - 1) * 1E3

        x = np.arange(np.min(img_mean), np.max(img_mean) + 1)
        bit_flip_change = 128 if channel == 1 else 256
        y_top = ((channel_stack.shape[0] * x) /
                 (channel_stack.shape[0] * x - bit_flip_change) - 1) * 1E3
        y_bottom = ((channel_stack.shape[0] * x) /
                    (channel_stack.shape[0] * x + bit_flip_change) - 1) * 1E3
        plt.plot(x, y_top, 'r')
        plt.plot(x, y_bottom, 'r')
        plt.scatter(img_mean.flatten(),
                    img_sigma_ratio.flatten(),
                    alpha=0.1,
                    color='black',
                    s=1)

        rnr = RadiusNeighborsRegressor(radius=50, weights='distance')
        rnr.fit(np.expand_dims(img_mean.flatten(), axis=1),
                img_sigma_ratio.flatten())

        x = np.arange(np.min(img_mean), np.max(img_mean) + 1)
        line_y = rnr.predict(np.expand_dims(x, axis=1))
        plt.plot(x, line_y, 'g')

        plt.grid(True)
        plt.show()
Example #7
def make_wind_speed_model(df):
    ds = load_ds(df, "ff")
    X_train, X_test, y_train, _ = ds

    # Build & fit the model
    model = make_pipeline(
        PCA(whiten=True),
        StandardScaler(),
        RadiusNeighborsRegressor(radius=0.02),
    )

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    plot_regression(
        "wind_speed",
        "Average wind speed 10 min (m/s)",
        dataset=ds,
        y_pred=y_pred,
    )

    return model, ds, y_pred
Example #8
    def modelBuild(self, train_x, train_y, selected_model='NN'):
        """训练模型"""
        if selected_model == 'NN':
            self.my_model = MLPRegressor(hidden_layer_sizes=(50),
                                         activation='relu',
                                         solver='adam',
                                         alpha=0.0001,
                                         batch_size='auto',
                                         learning_rate='adaptive',
                                         learning_rate_init=0.001)
        elif selected_model == 'DT':
            self.my_model = tree.DecisionTreeRegressor()
        elif selected_model == 'SVM':
            self.my_model = svm.SVR()
        elif selected_model == 'KNN':
            self.my_model = KNeighborsRegressor(n_neighbors=2)
        elif selected_model == 'RNN':
            self.my_model = RadiusNeighborsRegressor(radius=1.0)
        else:
            print("this model cannot be built")
            return

        self.my_model.fit(train_x, train_y)
Example #9
 def test_model_knn_regressor2_1_radius(self):
     model, X = self._fit_model_simple(
         RadiusNeighborsRegressor(algorithm="brute"), n_targets=2)
     model_onnx = convert_sklearn(
         model,
         "KNN regressor", [("input", FloatTensorType([None, X.shape[1]]))],
         target_opset=TARGET_OPSET)
     self.assertIsNotNone(model_onnx)
     sess = InferenceSession(model_onnx.SerializeToString())
     got = sess.run(None, {'input': X.astype(numpy.float32)})[0]
     exp = model.predict(X.astype(numpy.float32))
     if any(numpy.isnan(got.ravel())):
         # The model unexpectedly produces NaN values
         # on some platforms.
         # It happens when two matrices are multiplied,
         # one is (2, 20, 20), second is (20, 20)
         # and contains only 0 or 1 values.
         # The output contains nan values on the first row
         # but not on the second one.
         rows = [
             '--EXP--',
             str(exp), '--GOT--',
             str(got), '--EVERY-OUTPUT--'
         ]
         for out in enumerate_model_node_outputs(model_onnx,
                                                 add_node=False):
             onx = select_model_inputs_outputs(model_onnx, out)
             sess = InferenceSession(onx.SerializeToString())
             res = sess.run(None, {'input': X.astype(numpy.float32)})
             rows.append('--{}--'.format(out))
             rows.append(str(res))
         if (StrictVersion(onnxruntime.__version__) <
                 StrictVersion("1.4.0")):
             return
         raise AssertionError('\n'.join(rows))
     assert_almost_equal(exp, got, decimal=5)
Example #10
def grid_points_2d(mesh, cell_size=10):
    grid = vtk_Voxel.from_mesh(mesh, cell_size, 2)

    cells = grid.cell_centers().points

    radius = cell_size * 0.5
    tmat = np.full(cells.shape[0], np.nan)
    print("sample min", np.min(mesh.points[:, 2]), "max",
          np.max(mesh.points[:, 2]))
    while np.any(np.isnan(tmat)):
        # keep increasing radius until all cells have values
        radius *= 1.5
        print("RadiusNeighborsRegressor =", radius, "m")
        neigh = RadiusNeighborsRegressor(radius=radius, weights='distance')
        neigh.fit(mesh.points[:, :2], mesh.points[:, 2])
        rmat = neigh.predict(cells[:, :2])
        np.putmask(tmat, np.isnan(tmat), rmat)
    print("regression min", np.min(tmat), "max", np.max(tmat))
    grid.cell_arrays['Elevation'] = tmat
    surf = grid.extract_surface()
    surf = surf.ctp()
    surf.points[:, 2] = surf.point_arrays['Elevation']

    return surf
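
The radius-growing loop above relies on a documented behavior: RadiusNeighborsRegressor predicts NaN (with a warning) for any query point that has no training sample within the radius, and np.putmask then fills only the cells that finally received a value. A standalone sketch of that behavior:

import numpy as np
from sklearn.neighbors import RadiusNeighborsRegressor

X = np.array([[0.0], [1.0]])
y = np.array([0.0, 1.0])
model = RadiusNeighborsRegressor(radius=0.5).fit(X, y)
# 0.1 has a neighbor within 0.5; 10.0 has none, so its prediction is NaN.
print(model.predict([[0.1], [10.0]]))  # -> [ 0. nan]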
Example #11
def powerproduction():
    if fl.request.method == "POST":
        speed = float(fl.request.form['speed'])
        # speed = requests.get(data['input_s'])
        # import csv data and convert to pandas dataframe
        df = pd.read_csv("powerproduction.csv")

        # remove all zeros
        df = df[df.power != 0]

        # put rows in order of speed
        df = df.sort_values('speed')

        # set each column to a numpy array for processing
        S = df['speed'].to_numpy()
        p = df['power'].to_numpy()

        neigh_radius = RadiusNeighborsRegressor(radius=1.7, weights='distance', p=2)
        neigh_radius.fit(S.reshape(-1, 1), p)

        p_pred = neigh_radius.predict([[speed]])

        return {'value': p_pred[0]}
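
A hypothetical client call for this route; the URL, port, and route path are assumptions, since the @app.route decorator is not part of the excerpt:

import requests

r = requests.post("http://localhost:5000/powerproduction",
                  data={"speed": 12.5})
print(r.json()["value"])  # predicted power at 12.5 m/s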
Example #12
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor, BayesianRidge
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: side-effect import
from sklearn.ensemble import HistGradientBoostingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
tree_regressors = {
    "Decision_tree_regressor": DecisionTreeRegressor(),
    "AdaBoost_regressor": AdaBoostRegressor(),
    "Extra_trees_regressor": ExtraTreesRegressor(),
    "Random_forest_regressor": RandomForestRegressor(),  # Takes 55 seconds
    "GBM_regressor": GradientBoostingRegressor(),  #Takes forever
    "HGB_regressor": HistGradientBoostingRegressor(),
    "CATBoost_regressor": CatBoostRegressor(verbose=0),
    "lightgbm_regressor": LGBMRegressor(),
}
mult_regressors = {
    "Linear_regression": LinearRegression(),  ### Don't use; results were awful
    "Ridge_regressor": Ridge(),
    "SVM_regressor": SVR(),  # Takes 150  seconds
    "MLP_regressor": MLPRegressor(),
    "SGD_regressor": SGDRegressor(),
    "KNN_regressor": KNeighborsRegressor(),
    "BR_regressor": BayesianRidge(),
    "RNN_regressor": RadiusNeighborsRegressor(),  # Predicts NaN's :S
}
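
The NaN predictions noted for "RNN_regressor" are the usual symptom of the default radius=1.0 being small relative to the feature scale, leaving some query points with empty neighborhoods. A hedged sketch of the common remedies, standardizing the features and widening the radius (the value 3.0 is an assumption to tune):

from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rnn_scaled = make_pipeline(StandardScaler(),
                           RadiusNeighborsRegressor(radius=3.0))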
Example #13
# Available optimisation on this machine.

print(code_optimisation())

##############################
# Building the model
# ++++++++++++++++++

filename = "onnx_to_profile.onnx"

if not os.path.exists(filename):
    print(f"Generate a graph for {filename!r}.")
    X = numpy.random.randn(1000, 10).astype(numpy.float64)
    y = X.sum(axis=1).reshape((-1, 1))

    model = RadiusNeighborsRegressor()
    model.fit(X, y)
    onx = to_onnx(model, X, options={'optim': 'cdist'})

    with open(filename, "wb") as f:
        f.write(onx.SerializeToString())

#####################################
# Functions
# +++++++++
#
# We need to generate random inputs to test the graph.


def random_input(typ, shape, batch):
    if typ == 'tensor(double)':
Example #14
def process_data(data):
    """
    data: input_true, input_reco, segment_label, group_label
    returns: input, output
        input: intersection between reco and true, labeled with reco charge depositions
        output: intersection between reco and true, labeled with adjusted energy depositions
    """
    input_true = data['input_true']
    input_reco = data['input_reco']
    segment_label = data['segment_label']
    group_label = data['group_label']

    chosen_indices = []
    chosen_reco_indices = []

    current_batch = 0
    current_batch_selection = np.where(input_true[:, -2] == current_batch)[0]
    current_input_true = input_true[current_batch_selection]
    for r in range(len(input_reco)):
        row = input_reco[r]
        b = row[-2]
        if b != current_batch:
            current_batch = b
            current_batch_selection = np.where(
                input_true[:, -2] == current_batch)[0]
        pos = row[:3]
        region_selection = np.where((current_input_true[:, 0] == pos[0])
                                    & (current_input_true[:, 1] == pos[1]))[0]
        input_true_region = current_input_true[region_selection]
        for i in range(len(input_true_region)):
            row2 = input_true_region[i]
            pos2 = row2[:3]
            if np.array_equal(pos, pos2):
                chosen_indices.append(
                    current_batch_selection[region_selection[i]])
                chosen_reco_indices.append(r)
                break

    if len(chosen_indices) == 0:
        return None

    chosen_indices = np.array(chosen_indices)
    chosen_reco_indices = np.array(chosen_reco_indices)

    lost_data = np.delete(input_true, chosen_indices, axis=0)
    found_data = input_true[chosen_indices]

    # find where the chosen indices are in the group data
    lost_group_data = -np.ones((len(lost_data), len(lost_data[0])))
    ungrouped_data = -np.ones((len(lost_data), len(lost_data[0])))
    found_group_data = -np.ones((len(found_data), len(found_data[0])))
    for i in range(len(lost_data)):
        row = lost_data[i]
        filter0 = group_label[np.where(group_label[:, -2] == row[-2])]
        filter1 = filter0[np.where(filter0[:, 0] == row[0])]
        filter2 = filter1[np.where(filter1[:, 1] == row[1])]
        filter3 = filter2[np.where(filter2[:, 2] == row[2])]
        if len(filter3) == 0:
            ungrouped_data[i] = row
        else:
            g = filter3[0]
            lost_group_data[i] = g
    for i in range(len(found_data)):
        row = found_data[i]
        filter0 = group_label[np.where(group_label[:, -2] == row[-2])]
        filter1 = filter0[np.where(filter0[:, 0] == row[0])]
        filter2 = filter1[np.where(filter1[:, 1] == row[1])]
        filter3 = filter2[np.where(filter2[:, 2] == row[2])]
        g = filter3[0]
        found_group_data[i] = g

#     lost_group_data = np.delete(group_label, chosen_indices, axis=0)
#     found_group_data = group_label[chosen_indices]

    if ADD_MISSING_ENERGY:
        batches = np.unique(input_true[:, 3])
        for b in batches:
            # nearest neighbor assignment within group
            found_groups = np.unique(
                found_group_data[np.where(found_group_data[:, 3] == b)][:, -1])
            lost_batch_mask = lost_group_data[:, 3] == b
            found_batch_mask = found_group_data[:, 3] == b
            for g in found_groups:
                lost_selection = np.where(lost_batch_mask
                                          & (lost_group_data[:, -1] == g))[0]
                found_selection = np.where(found_batch_mask
                                           & (found_group_data[:, -1] == g))[0]
                ldata = lost_data[lost_selection]
                fdata = found_data[found_selection]
                lost_positions = ldata[:, :3]
                found_positions = fdata[:, :3]
                distances = distance_matrix(lost_positions, found_positions)
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    found_data[found_selection[
                        closest_points[i]]][-1] += closest_energies[i]

            # associate ungrouped voxels with their nearest voxels, regardless of group
            lost_ungrouped = np.where((ungrouped_data[:, 3] == b))[0]
            if len(lost_ungrouped) > 0:
                found_selection = np.where(found_batch_mask)[0]
                ldata = lost_data[lost_ungrouped]
                fdata = found_data[found_selection]
                lost_positions = ldata[:, :3]
                found_positions = fdata[:, :3]
                distances = distance_matrix(lost_positions, found_positions)
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    found_data[found_selection[
                        closest_points[i]]][-1] += closest_energies[i]

    if BLUR_ENERGY:
        blur_kernel = 3
        for g in np.unique(found_group_data[:, -1]):
            inds = np.where(found_group_data[:, -1] == g)
            selection = found_data[inds]
            total_energy = np.sum(selection[:, -1])

            coords = selection[:, :3]
            energies = selection[:, -1]
            neigh = RadiusNeighborsRegressor(radius=blur_kernel)
            neigh.fit(coords, energies)
            selection[:, -1] = neigh.predict(coords)
            selection[:, -1] *= total_energy / np.sum(selection[:, -1])
            found_data[inds, -1] = selection[:, -1]

    segment_indices = segment_label[chosen_indices, -1].astype(int)
    segment_one_hot = np.zeros((len(segment_indices), 5))
    segment_one_hot[np.arange(len(segment_indices)), segment_indices] = 1
    out = np.concatenate((input_reco[chosen_reco_indices], segment_one_hot,
                          np.expand_dims(found_data[:, -1], axis=1)),
                         axis=1)
    return np.array(out), found_group_data[:, -1]
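
The BLUR_ENERGY branch amounts to a radius-neighbor moving average followed by a rescale that conserves each group's total energy. A minimal standalone sketch of that smoothing step (toy data, not the detector voxels above):

import numpy as np
from sklearn.neighbors import RadiusNeighborsRegressor

coords = np.random.rand(100, 3) * 10    # toy voxel coordinates
energies = np.random.rand(100)          # toy energies
total = energies.sum()

smoother = RadiusNeighborsRegressor(radius=3).fit(coords, energies)
blurred = smoother.predict(coords)      # local mean within radius 3
blurred *= total / blurred.sum()        # rescale to conserve total energy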
Example #15
# cap extreme targets at 1e7
for i in range(len(y)):
    if y[i] > 10000000:
        y[i] = 10000000


### RadiusNeighborsRegressor ###

from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.preprocessing import StandardScaler


kf = KFold(len(y), n_folds=15, shuffle=True)  # legacy sklearn.cross_validation API

y_pred = np.zeros(len(y), dtype=y.dtype) # where we'll accumulate predictions

clf = RadiusNeighborsRegressor(radius=15)




# CV Loop
for train_index, test_index in kf:
    # for each iteration of the for loop we'll do a test train split
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    t = StandardScaler()
    X_train = t.fit_transform(X_train)
    clf.fit(X_train, y_train) # train clf on the scaled training data

    X_test = t.transform(X_test)
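
The fold body above is cut off after scaling the test split. A self-contained sketch of the same out-of-fold pattern with the modern API (sklearn.model_selection replaced sklearn.cross_validation in scikit-learn 0.18, and the old module was removed in 0.20):

import numpy as np
from sklearn.model_selection import KFold
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.preprocessing import StandardScaler

X = np.random.rand(200, 3)
y = np.random.rand(200)
y_pred = np.zeros(len(y))  # out-of-fold predictions

clf = RadiusNeighborsRegressor(radius=15)
for train_index, test_index in KFold(n_splits=15, shuffle=True).split(X):
    t = StandardScaler()
    clf.fit(t.fit_transform(X[train_index]), y[train_index])
    y_pred[test_index] = clf.predict(t.transform(X[test_index]))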
Example #16
model = LR(C=0.01, penalty='l1')
from sklearn.linear_model import BayesianRidge as BR
model = BR(alpha_1=1e2,
           alpha_2=3e2,
           lambda_1=1e-9,
           lambda_2=1e-9,
           compute_score=False)
from sklearn.linear_model import (LinearRegression, Lasso, RandomizedLasso,
                                  Ridge)
from sklearn.feature_selection import (RFE, f_regression)
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.neighbors import NearestNeighbors
model = RadiusNeighborsRegressor(radius=0.5, p=2)
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor(n_estimators=10,max_depth=8,\
min_samples_split=2)
from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor(n_estimators=400)
from sklearn.ensemble import GradientBoostingRegressor
model=GradientBoostingRegressor(n_estimators=100,\
learning_rate=0.1,max_depth=10)
from sklearn.ensemble import BaggingRegressor
mb = model
model=BaggingRegressor(base_estimator=mb,n_estimators=20,bootstrap=1,\
bootstrap_features=1,max_samples=0.3,max_features=0.3)
model = LR(C=0.004)
model = LR(C=0.01, penalty='l1')
model=rfr(n_estimators=2000,max_depth=1,min_samples_leaf=20,\
Example #17
          'mse_train','mse_test','mae_train','mae_test',
          'mdae_train','mdae_test']
reg=[linear_model.LinearRegression(),
     linear_model.Ridge(),linear_model.RidgeCV(),
     linear_model.Lasso(),linear_model.LassoLarsCV(),
     linear_model.RANSACRegressor(),
     linear_model.BayesianRidge(),linear_model.ARDRegression(),
     linear_model.HuberRegressor(),linear_model.TheilSenRegressor(),
     PLSRegression(),DecisionTreeRegressor(),ExtraTreeRegressor(),
     BaggingRegressor(),AdaBoostRegressor(),
     GradientBoostingRegressor(),RandomForestRegressor(),
     linear_model.PassiveAggressiveRegressor(max_iter=1000,tol=0.001),
     linear_model.ElasticNet(),
     linear_model.SGDRegressor(max_iter=1000,tol=0.001),
     svm.SVR(),KNeighborsRegressor(),
     RadiusNeighborsRegressor(radius=1.5),GaussianProcessRegressor()]

list1reg=['LinearRegression','Ridge','RidgeCV',
          'Lasso','LassoLarsCV','RANSACRegressor',
          'BayesianRidge','ARDRegression','HuberRegressor',
          'TheilSenRegressor','PLSRegression','DecisionTreeRegressor',
          'ExtraTreeRegressor','BaggingRegressor','AdaBoostRegressor',
          'GradientBoostingRegressor','RandomForestRegressor']
y1reg=[]; y7reg=[]
for i in range(len(list1reg)):
    y1reg.append(regressor_fit_score(reg[i],list1reg[i],'Boston',
                                     X_train1,X_test1,y_train1,y_test1)[:2])
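
regressor_fit_score is not defined in this excerpt; a hypothetical helper with a compatible signature (the [:2] slice above suggests the first two return values are train and test scores; the exact metrics are an assumption, hinted at by the 'mse_train'/'mae_train' labels):

from sklearn.metrics import r2_score

def regressor_fit_score(model, name, dataset, X_train, X_test, y_train, y_test):
    # Fit, then report train/test scores; the original likely tracks more.
    model.fit(X_train, y_train)
    return (r2_score(y_train, model.predict(X_train)),
            r2_score(y_test, model.predict(X_test)),
            name, dataset)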
[[y_train101,y_test101],[y_train102,y_test102],[y_train103,y_test103],
 [y_train104,y_test104],[y_train105,y_test105],[y_train106,y_test106],
 [y_train107,y_test107],[y_train108,y_test108],[y_train109,y_test109],
 [y_train110,y_test110],[y_train111,y_test111],[y_train112,y_test112],
Example #18
colour = sp.reshape(df.colour, (-1, 1))
    #reshape the colour to a column vector for use in the algorithm
    
designation = sp.array(df.designation.tolist())

temp = sp.array(df.teff.tolist())

"""
possibly remove SVC, takes long time (~4 mins per fold)
"""

folds = 2

names = ['KNeighbours', 'Radius Neighbours', 'Random Forest Regressor',
             'Linear Regression', 'Gaussian Process Regressor', 'Ada Boost Classifier']
classifiers = [KNeighborsRegressor(), RadiusNeighborsRegressor(), RandomForestRegressor(),
               LinearRegression(), GaussianProcessRegressor(), AdaBoostRegressor()]
    #instantiate the regressors to compare

kf = cross_validation.KFold(n = len(colour), n_folds = folds, shuffle = True)
    #use kfolds to split the data

final = []
MAD = []

for name, clf in zip(names, classifiers):
    
    ###importance = []

    models = sp.array([[sp.nan]*len(temp)]*folds)
    
Example #19
del globals()['profilesDF']
del globals()['profiles']
del globals()['profilesLSo']
del globals()['profilesLS']
del globals()['row']
del globals()['tmpLS']
del globals()['tmpAGE']
del globals()['profsTOlikes']
del globals()['i']
del globals()['tmpIND']

seed = 7
myRand = np.random.seed(seed)
X_train, X_test, y_train, y_test = train_test_split(likesMAT,
                                                    consARR,
                                                    test_size=1500)

myRAD = float(sys.argv[1])
radNN = RadiusNeighborsRegressor(radius=myRAD)

#radNN.fit(likesMAT, consARR)
radNN.fit(X_train, y_train)

y_pred = radNN.predict(X_test)
import math
myRMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("cons, Radius neighbors:  ", str(myRAD), " ", myRMSE)

# joblib.dump(radNN, "/Users/jamster/radNN-A-cons.xz", compress=9)

# impRadNN = joblib.load("/Users/jamster/radNN-A-cons.xz")
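
The same RMSE without the manual math.sqrt, assuming scikit-learn 0.22-1.3 where mean_squared_error accepts squared=False (newer releases expose root_mean_squared_error instead):

myRMSE = metrics.mean_squared_error(y_test, y_pred, squared=False)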
Example #20
 def RNN_Build(self, train_x, train_y):
     """RNN_Build"""
     self.rneigh = RadiusNeighborsRegressor(radius=1.0)
     self.rneigh.fit(train_x, train_y)
Example #21
    linear_model.ARDRegression(),
    linear_model.HuberRegressor(max_iter=800),
    linear_model.TheilSenRegressor(max_iter=800),
    PLSRegression(),
    DecisionTreeRegressor(),
    ExtraTreeRegressor(),
    BaggingRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    linear_model.PassiveAggressiveRegressor(max_iter=800, tol=.001),
    linear_model.ElasticNet(max_iter=800),
    linear_model.SGDRegressor(max_iter=800, tol=.001),
    svm.SVR(),
    KNeighborsRegressor(),
    RadiusNeighborsRegressor(radius=1.5),
    GaussianProcessRegressor()
]

listreg = [
    'LinearRegression', 'Ridge', 'RidgeCV', 'Lasso', 'LassoLarsCV',
    'RANSACRegressor', 'BayesianRidge', 'ARDRegression', 'HuberRegressor',
    'TheilSenRegressor', 'PLSRegression', 'DecisionTreeRegressor',
    'ExtraTreeRegressor', 'BaggingRegressor', 'AdaBoostRegressor',
    'GradientBoostingRegressor', 'RandomForestRegressor'
]
yreg = []
for i in range(len(listreg)):
    yreg.append(
        regressor_fit_score(reg[i], listreg[i], 'Boston', x_train, x_test,
                            y_train, y_test)[:2])
Example #22
    def estimate_aba_ge(self, entrez_ids, coords=None, **kwargs):
        """
        Retrieves, estimates and stores gene expression coefficients in ABA dictionary based on a
        a passed list of NIH Entrez IDs.

        Parameters
        ----------
        entrez_ids: List-like
            list-like structure containing NIH Entrez IDs.

        kwargs : dict, optional
            OPTIONS:
                'rnn_args' : dict
                    SKLearn RadiusNeighborsRegressor() optional arguments.
                    http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsRegressor.html
                    for default arguments.
        """

        self._check_entrez_struct(entrez_ids)

        for entrez_id in entrez_ids:
            # Fetch probe IDs for Entrez ID
            probe_ids = self._aba['probe_df'].loc[
                self._aba['probe_df']['entrez_id'] ==
                entrez_id]['probe_id'].tolist()

            if len(probe_ids) == 0:
                print 'Entrez ID: %s not registered with ABA database' % entrez_id
                continue

            # Return gene expression on given probes across sampled locations.
            ge_df = self._aba['exp_df'].loc[self._aba['exp_df']
                                            ['probe_id'].isin(probe_ids)]
            ge_mat = ge_df.as_matrix().astype(float)[:, 1:]

            # Take average gene expression across probes at a given sampled location.
            ge_vec = np.mean(ge_mat, axis=0)

            self.ge[entrez_id] = {}
            for probe in probe_ids:
                self.ge[entrez_id][probe] = {}
            self.ge[entrez_id]["mean"] = {}

            # z scoring method
            if 'z_score' in kwargs:
                for row in xrange(ge_mat.shape[0]):
                    ge_mat[row] = (ge_mat[row] -
                                   ge_mat[row].mean()) / ge_mat[row].std()
                ge_vec = (ge_vec - ge_vec.mean()) / ge_vec.std()

            if coords is None:
                for row, probe in enumerate(probe_ids):
                    self.ge[entrez_id][probe]['GE'] = ge_mat[row]
                self.ge[entrez_id]["mean"]['GE'] = ge_vec
                self.ge[entrez_id]['coord_type'] = 'ABA'

            # Estimate gene expression at custom coordinates
            else:
                X = self._aba['mni_coords'].data
                y_mean = ge_vec
                valid_inds = self._check_coords_for_distance_weighting(
                    coords=coords,
                    check_radius=kwargs['rnn_args']['radius'],
                    check_weights='distance',
                    X=X,
                    y_mean=y_mean)

                if 'rnn_args' in kwargs:
                    if 'radius' not in kwargs['rnn_args']:
                        kwargs['rnn_args']['radius'] = 5
                    if 'radius' in kwargs['rnn_args']:
                        if kwargs['rnn_args']['radius'] == 1:
                            kwargs['rnn_args']['weights'] = 'uniform'
                    if 'weights' not in kwargs['rnn_args']:
                        kwargs['rnn_args']['weights'] = 'uniform'
                    if kwargs['rnn_args']['weights'] != 'distance':
                        self._gaussian_weight_radius = kwargs['rnn_args'][
                            'radius']
                    for row, probe in enumerate(probe_ids):
                        self.ge[entrez_id][probe][
                            'classifier'] = RadiusNeighborsRegressor(
                                **kwargs['rnn_args'])
                    self.ge[entrez_id]["mean"][
                        'classifier'] = RadiusNeighborsRegressor(
                            **kwargs['rnn_args'])
                else:
                    for row, probe in enumerate(probe_ids):
                        self.ge[entrez_id][probe][
                            'classifier'] = RadiusNeighborsRegressor(
                                radius=5, weights='uniform')
                    self.ge[entrez_id]["mean"][
                        'classifier'] = RadiusNeighborsRegressor(
                            radius=5, weights='uniform')

                for row, probe in enumerate(probe_ids):
                    self.ge[entrez_id][probe]['classifier'].fit(X, ge_mat[row])
                self.ge[entrez_id]["mean"]['classifier'].fit(X, y_mean)
                if 'store_coords' in kwargs:
                    if kwargs['store_coords']:
                        self.ge[entrez_id]['coords'] = coords

                if 'coord_type' in kwargs:
                    self.ge[entrez_id]['coord_type'] = kwargs['coord_type']
                else:
                    self.ge[entrez_id]['coord_type'] = 'Custom'

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    nan_array = np.empty(len(coords))
                    nan_array[:] = np.nan
                    for row, probe in enumerate(probe_ids):
                        self.ge[entrez_id][probe]["GE"] = nan_array
                        if len(valid_inds) > 0:
                            estimations = self.ge[entrez_id][probe][
                                'classifier'].predict(
                                    [coords[i] for i in valid_inds])
                            for vi in xrange(len(valid_inds)):
                                self.ge[entrez_id][probe]["GE"][
                                    valid_inds[vi]] = estimations[vi]

                    self.ge[entrez_id]["mean"]["GE"] = nan_array
                    if len(valid_inds) > 0:
                        estimations = self.ge[entrez_id]["mean"][
                            'classifier'].predict(
                                [coords[i] for i in valid_inds])
                        for vi in xrange(len(valid_inds)):
                            self.ge[entrez_id]["mean"]["GE"][
                                valid_inds[vi]] = estimations[vi]
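
A hypothetical call to the method above (the object name, gene ID, and coordinate are illustrative, not from the original source). Note that when coords is given, the code reads kwargs['rnn_args']['radius'] up front, so rnn_args should always be supplied in that case:

nsaba_obj.estimate_aba_ge(
    entrez_ids=[1813],                 # illustrative Entrez ID
    coords=[(30, -22, 17)],            # illustrative MNI coordinate
    rnn_args={'radius': 5, 'weights': 'distance'},
    store_coords=True)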
Example #23
def process_data(input_true, input_reco, segment_label, group_label):
    """
    arguments are Nx5 from processing data
    input_true: energy depositions
    input_reco: charge depositions
    segment_label: fivetypes label
    group_label: particle instance
    
    purpose is to find M non-ghost reco voxels and set target energies for them based on blurring
    
    returns tuple of neural network inputs and other useful stuff (it's messy, sorry)
    element 0: [size Mx12] corresponding to input_reco (5) + one-hot encoded fivetypes+ghost (6) + blurred energy target (1)
    element 1: [size M] group label of voxel
    element 2: [size M] indices in input_true of voxels that have been reconstructed
    element 3: [size Mx5] input_true intersection with reco, where the last element in each row is blurred energy
    
    """
    chosen_indices = []
    chosen_reco_indices = []

    current_batch = 0
    current_batch_selection = np.where(input_true[:, -2] == current_batch)[0]
    current_input_true = input_true[current_batch_selection]
    for r in range(len(input_reco)):
        row = input_reco[r]
        b = row[-2]
        if b != current_batch:
            current_batch = b
            current_batch_selection = np.where(
                input_true[:, -2] == current_batch)[0]
        pos = row[:3]
        region_selection = np.where((current_input_true[:, 0] == pos[0])
                                    & (current_input_true[:, 1] == pos[1]))[0]
        input_true_region = current_input_true[region_selection]
        for i in range(len(input_true_region)):
            row2 = input_true_region[i]
            pos2 = row2[:3]
            if np.array_equal(pos, pos2):
                chosen_indices.append(
                    current_batch_selection[region_selection[i]])
                chosen_reco_indices.append(r)
                break

    if len(chosen_indices) == 0:
        return None

    chosen_indices = np.array(chosen_indices)
    chosen_reco_indices = np.array(chosen_reco_indices)

    lost_data = np.delete(input_true, chosen_indices, axis=0)
    found_data = input_true[chosen_indices]

    # find where the chosen indices are in the group data
    lost_group_data = -np.ones((len(lost_data), len(lost_data[0])))
    ungrouped_data = -np.ones((len(lost_data), len(lost_data[0])))
    found_group_data = -np.ones((len(found_data), len(found_data[0])))
    for i in range(len(lost_data)):
        row = lost_data[i]
        filter0 = group_label[np.where(group_label[:, -2] == row[-2])]
        filter1 = filter0[np.where(filter0[:, 0] == row[0])]
        filter2 = filter1[np.where(filter1[:, 1] == row[1])]
        filter3 = filter2[np.where(filter2[:, 2] == row[2])]
        if len(filter3) == 0:
            ungrouped_data[i] = row
        else:
            g = filter3[0]
            lost_group_data[i] = g
    for i in range(len(found_data)):
        row = found_data[i]
        filter0 = group_label[np.where(group_label[:, -2] == row[-2])]
        filter1 = filter0[np.where(filter0[:, 0] == row[0])]
        filter2 = filter1[np.where(filter1[:, 1] == row[1])]
        filter3 = filter2[np.where(filter2[:, 2] == row[2])]
        g = filter3[0]
        found_group_data[i] = g

    if ADD_MISSING_ENERGY:
        batches = np.unique(input_true[:, 3])
        for b in batches:
            # nearest neighbor assignment within group
            found_groups = np.unique(
                found_group_data[np.where(found_group_data[:, 3] == b)][:, -1])
            lost_batch_mask = lost_group_data[:, 3] == b
            found_batch_mask = found_group_data[:, 3] == b
            for g in found_groups:
                lost_selection = np.where(lost_batch_mask
                                          & (lost_group_data[:, -1] == g))[0]
                found_selection = np.where(found_batch_mask
                                           & (found_group_data[:, -1] == g))[0]
                ldata = lost_data[lost_selection]
                fdata = found_data[found_selection]
                lost_positions = ldata[:, :3]
                found_positions = fdata[:, :3]
                distances = distance_matrix(lost_positions, found_positions)
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    found_data[found_selection[
                        closest_points[i]]][-1] += closest_energies[i]

            # associate ungrouped voxels with their nearest voxels, regardless of group
            lost_ungrouped = np.where((ungrouped_data[:, 3] == b))[0]
            if len(lost_ungrouped) > 0:
                found_selection = np.where(found_batch_mask)[0]
                ldata = lost_data[lost_ungrouped]
                fdata = found_data[found_selection]
                lost_positions = ldata[:, :3]
                found_positions = fdata[:, :3]
                distances = distance_matrix(lost_positions, found_positions)
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    found_data[found_selection[
                        closest_points[i]]][-1] += closest_energies[i]

    if BLUR_ENERGY:
        blur_kernel = 3
        for g in np.unique(found_group_data[:, -1]):
            inds = np.where(found_group_data[:, -1] == g)
            selection = found_data[inds]
            total_energy = np.sum(selection[:, -1])

            coords = selection[:, :3]
            energies = selection[:, -1]
            neigh = RadiusNeighborsRegressor(radius=blur_kernel)
            neigh.fit(coords, energies)
            selection[:, -1] = neigh.predict(coords)
            selection[:, -1] *= total_energy / np.sum(selection[:, -1])
            found_data[inds, -1] = selection[:, -1]

    segment_indices = segment_label[chosen_indices, -1].astype(int)
    segment_one_hot = np.zeros((len(segment_indices), 5))
    segment_one_hot[np.arange(len(segment_indices)), segment_indices] = 1
    out = np.concatenate((input_reco[chosen_reco_indices], segment_one_hot,
                          np.expand_dims(found_data[:, -1], axis=1)),
                         axis=1)
    return np.array(out), found_group_data[:, -1], chosen_indices, found_data
Example #24
# Read training dataset
df = pd.read_csv(TRAINING_DATASET, header=None)		# read from the first line

columns = len(df.columns)
rows = len(df.index)

print 'Training dataset:', "{:,}".format(len(df.index)), 'x', "{:,}".format(len(df.columns))

df_y = df.ix[:,columns-1]
df_x = df.ix[:,:columns-2]

X = np.array(df_x)
Y = np.array(df_y)

neigh = RadiusNeighborsRegressor(radius = KNN_RADIUS)
neigh.fit(X, Y)

# Read Test dataset
testFiles = [file for file in os.listdir(TEST_DATASET_DIRECTORY) if str(file).find('test') >= 0]
print 'Number of test files:', len(testFiles)

TEST_Y_ALL = np.array([])
TEST_Y_ALL_PREDICTED = np.array([])
for file in testFiles:
	df = pd.read_csv(TEST_DATASET_DIRECTORY + '/' + file, header=None)		# read from the first line
	df_y = df.ix[:,columns-1]
	df_x = df.ix[:,:columns-2]
	
	X = np.array(df_x)
	Y = np.array(df_y)
Example #25
def regress(X_train, y_train):
    # comment out any regressor that should not be used
    classifiers = [
        (SGDRegressor(), "SGDRegressor", 1 * global_data_scale),
        (LinearRegression(), "LinearRegression", 1 * global_data_scale),
        (Ridge(), "Ridge", 1 * global_data_scale),
        (Lasso(), "Lasso", 1 * global_data_scale),
        (ElasticNet(), "ElasticNet", 1 * global_data_scale),
        (Lars(), "Lars", 1 * global_data_scale),
        (OrthogonalMatchingPursuit(), "OrthogonalMatchingPursuit", 1 * global_data_scale),
        (BayesianRidge(), "BayesianRidge", 1 * global_data_scale),
        (ARDRegression(), "ARDRegression", 1 * global_data_scale),
        ### NOTE the scoring might be different of PassiveAggressiveRegressor
        (PassiveAggressiveRegressor(), "PassiveAggressiveRegressor", 1 * global_data_scale),
        ### NOTE the scoring might be different of RANSACRegressor
        (RANSACRegressor(), "RANSACRegressor", 1 * global_data_scale),
        (TheilSenRegressor(), "TheilSenRegressor", 1 * global_data_scale),
        (HuberRegressor(), "HuberRegressor", 1 * global_data_scale),
        (DecisionTreeRegressor(), "DecisionTreeRegressor", 1 * global_data_scale),
        (GaussianProcessRegressor(), "GaussianProcessRegressor", 1 * global_data_scale),
        (MLPRegressor(), "MLPRegressor", 1 * global_data_scale),
        (KNeighborsRegressor(), "KNeighborsRegressor", 1 * global_data_scale),
        (RadiusNeighborsRegressor(), "RadiusNeighborsRegressor", 1 * global_data_scale),
        (SVR(), "SVR", 1 * global_data_scale),
        (NuSVR(), "NuSVR", 1 * global_data_scale),
        (LinearSVR(), "LinearSVR", 1 * global_data_scale),
        (KernelRidge(), "KernalRidge", 1 * global_data_scale),
        (IsotonicRegression(), "IsotonicRegression", 1 * global_data_scale)
    ]

    # set the list of the values that should be used in grid search
    params_dict = {
        "SGDRegressor": {
            "penalty": ["l2", "l1"],
            "alpha": [.001, .0001, .00001],
            "l1_ratio": [.15, .2, .25],
            "fit_intercept": [True, False],
            "max_iter": [1000],
            "shuffle": [True, False],
            "epsilon": [.05, .1, .2],
            "learning_rate": ["constant", "optimal", "invscaling", "adaptive"],
            "eta0": [.005, .01, .02],
            "power_t": [.2, .25, .3]
        },
        "LinearRegression": {
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "Ridge": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "tol": [.01, .001, .0001],
            "solver": ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
        },
        "Lasso": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "positive": [True, False],
            "precompute": [True, False]
        },
        "ElasticNet": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "precompute": [True, False],
            "positive": [True, False],
            "selection": ["cyclic", "random"]
        },
        "Lars": {
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "precompute": [True, False],
            "n_nonzero_coefs": [np.inf]
        },
        "OrthogonalMatchingPursuit": {
            "n_nonzero_coefs": [np.inf, None],
            "precompute": [True, False],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "BayesianRidge": {
            "tol": [.01, .001, .0001],
            "alpha_1": [1e-5, 1e-6, 1e-7],
            "alpha_2": [1e-5, 1e-6, 1e-7],
            "lambda_1": [1e-5, 1e-6, 1e-7],
            "lambda_2": [1e-5, 1e-6, 1e-7],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "ARDRegression": {
            "tol": [.01, .001, .0001],
            "alpha_1": [1e-5, 1e-6, 1e-7],
            "alpha_2": [1e-5, 1e-6, 1e-7],
            "lambda_1": [1e-5, 1e-6, 1e-7],
            "lambda_2": [1e-5, 1e-6, 1e-7],
            "threshold_lambda": [1000, 10000, 100000],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "PassiveAggressiveRegressor": {
            "C": [.8, 1., 1.2 ],
            "tol": [1e-2, 1e-3, 1e-4],
            "n_iter_no_change": [3, 5, 8],
            "shuffle": [True, False],
            "average": [True, False]
        },
        "RANSACRegressor": {
            "base_estimator": [LinearRegression()]
        },
        "TheilSenRegressor": {
            "max_subpopulation": [1e3, 1e4, 1e5],
            "tol": [1e-2, 1e-3, 1e-4]
        },
        "HuberRegressor": {
            "epsilon": [1.1, 1.35,  1.5],
            "alpha": [1e-3, 1e-4, 1e-5],
            "warm_start": [True, False],
            "fit_intercept": [True, False],
            "": [1e-4, 1e-5, 1e-6]
        },
        "DecisionTreeRegressor": {
            "criterion": ["mse", "friedman_mse", "mae"],
            "splitter": ["best", "random"],
            "min_samples_split": [2, 3],
            "min_samples_leaf": [1, 2],
            "min_weight_fraction_leaf": [.0],
            "max_features": ["auto", "sqrt", "log2"],
            "min_impurity_split": [1e-6, 1e-7, 1e-8]
        },
        "GaussianProcessRegressor": {
            "alpha": [1e-8, 1e-10, 1e-12],
            "optimizer": ["fmin_l_bfgs_b"],
            "normalize_y": [True, False]
        },
        "MLPRegressor": {
            "hidden_layer_sizes": [(100,)],
            "activation": ["identity", "logistic", "tanh", "relu"],
            "solver": ["lbfgs", "sgd", "adam"],
            "alpha": [1e-3, 1e-4, 1e-5],
            # "learning_rate": ["constant", "invscaling", "adaptive"],
            # "learning_rate_init": [1e-2, 1e-3, 1e-4],
            # "power_t": [.3, .5, .8],
            # "shuffle": [True, False],
            # "tol": [1e-3, 1e-4, 1e-5],
            # "momentum": [.8, .9, .99],
            # "beta_1": [.8, .9, .99],
            # "beta_2": [.999],
            # "epsilon": [1e-7, 1e-8, 1e-9],
            # "n_iter_no_change": [10],
            # "max_fun": [15000]
        },
        "KNeighborsRegressor": {
            "n_neighbors": [20, 10, 5, 3],
            "weights": ["uniform", "distance"],
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "leaf_size": [20, 30, 40],
            "p": [1, 2]
        },
        "RadiusNeighborsRegressor": {
            "radius": [.8, 1, 1.2],
            "n_neighbors": [20, 10, 5, 3],
            "weights": ["uniform", "distance"],
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "leaf_size": [20, 30, 40],
            "p": [1, 2]
        },
        "SVR": {
            "kernel": ["poly", "rbf", "sigmoid"],
            "degree": [2, 3, 5],
            "gamma": ["scale", "auto"],
            "coef0": [.0],
            "tol": [1e-2, 1e-3, 1e-4],
            "C": [.8, .1, 1.2],
            "epsilon": [.08, .1, .12],
            "shrinking": [True, False],
            "max_iter": [-1]
        },
        "NuSVR": {
            "nu": [.2, .5, .8],
            "C": [.8, .1, 1.2],
            "kernel": ["poly", "rbf", "sigmoid"],
            "degree": [2, 3, 5],
            "gamma": ["scale", "auto"],
            "coef0": [.0],
            "shrinking": [True, False],
            "tol": [1e-2, 1e-3, 1e-4],
            "max_iter": [-1]
        },
        "LinearSVR": {
            "epsilon": [.0],
            "tol": [1e-3, 1e-4, 1e-5],
            "C": [.8, .1, 1.2],
            "fit_intercept": [True, False],
            "dual": [True, False],
            "intercept_scaling": [.8, 1., 1.2]
        },
        "KernelRidge": {
            "coef0": [.8, 1, 1.2],
            "degree": [2, 3, 5],
        },
        "IsotonicRegression": {
            "increasing": [True, False],
        }
    }

    for model, params, frac in classifiers:
        full = pd.DataFrame(X_train).join(pd.DataFrame(y_train))
        loan_data = full.sample(frac=frac, random_state=random_state)
        X = loan_data.drop("loan_status", axis=1)
        y = loan_data["loan_status"]
        grid = GridSearchCV(model, params_dict[params], verbose=verbose, cv=folds, n_jobs=workers)
        grid.fit(X, y)
        yield grid, params
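
A sketch of consuming the generator above (assumes X_train/y_train and the module-level names it relies on, such as random_state, verbose, folds, and workers, are defined):

for grid, name in regress(X_train, y_train):
    print(name, grid.best_score_, grid.best_params_)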
Example #26
def main():

    # let's create a folder with a unique name to store results
    folderName = datetime.datetime.now().strftime(
        "%Y-%m-%d-%H-%M") + "-regression"
    if not os.path.exists(folderName): os.makedirs(folderName)

    # initialize logging
    common.initialize_logging(folderName)

    regressorsList = [

        # human-designed regressors
        [
            HumanRegressor("y = a_0 + a_1 * x + a_2 * x**2 + a_3 * x**3",
                           map_variables_to_features={"x": 0}),
            "HumanRegressor"
        ],
        [PolynomialRegressor(2), "PolynomialRegressor2"],
        #[PolynomialRegressor(3), "PolynomialRegressor3"],
        # keras neural network
        #[ANNRegressor(epochs=500, batch_size=32, layers=[16,4]), "KerasRegressor8-4"],
        #[ANNRegressor(epochs=700, batch_size=32, layers=[16,8]), "KerasRegressor16-8"],

        # cross decomposition
        [PLSRegression(), "PLSRegression"],

        # ensemble
        [AdaBoostRegressor(), "AdaBoostRegressor"],
        [BaggingRegressor(), "BaggingRegressor"],
        [BaggingRegressor(n_estimators=100), "BaggingRegressor_100"],
        [BaggingRegressor(n_estimators=300), "BaggingRegressor_300"],
        [ExtraTreesRegressor(), "ExtraTreesRegressor"],
        [GradientBoostingRegressor(), "GradientBoostingRegressor"],
        [RandomForestRegressor(), "RandomForestRegressor"],
        [RandomForestRegressor(n_estimators=100), "RandomForestRegressor_100"],
        [RandomForestRegressor(n_estimators=300), "RandomForestRegressor_300"],

        # isotonic
        #[IsotonicRegression(), "IsotonicRegression"], # apparently wants "X" as a 1d array

        # kernel ridge
        [KernelRidge(), "KernelRidge"],

        # linear
        #[ARDRegression(), "ARDRegression"], # takes too much time to train
        [BayesianRidge(), "BayesianRidge"],
        [ElasticNetCV(), "ElasticNetCV"],
        [LarsCV(), "LarsCV"],
        [LassoCV(), "LassoCV"],
        [LinearRegression(), "LinearRegression"],
        [PassiveAggressiveRegressor(), "PassiveAggressiveRegressor"],

        # neighbors
        [KNeighborsRegressor(), "KNeighborsRegressor"],
        [RadiusNeighborsRegressor(), "RadiusNeighborsRegressor"],

        # neural networks
        #[BernoulliRBM(), "BernoulliRBM"], # has a different interface, no "predict"

        # svm
        [SVR(), "SVR"],
        [LinearSVR(), "LinearSVR"],
        [NuSVR(), "NuSVR"],

        # tree
        [DecisionTreeRegressor(), "DecisionTreeRegressor (max depth 10)"],
        [ExtraTreeRegressor(), "ExtraTreeRegressor"],

        # generalized additive models
        [LinearGAM(n_splines=20), "LinearGAM(n_splines=20)"],

        # gaussian processes
        [
            GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel()),
            "GaussianProcessRegressor"
        ],
    ]

    X = y = X_train = X_test = y_train = y_test = variablesX = variablesY = None
    numberOfSplits = 10  # TODO change number of splits from command line

    if True:
        # this is just a dumb benchmark
        X, y, variablesX, variablesY = common.loadEasyBenchmark()

    if False:
        X, y, variablesX, variablesY = common.loadChristianQuestionnaireRegression(
        )

    if False:
        X, y, variablesX, variablesY = common.loadYongShiDataCalibration2(
            "TIMBER")

    if False:
        X, y, variablesX, variablesY = common.loadLaurentBouvierNewData()

    if False:
        X, y, variablesX, variablesY = common.loadYongShiDataCalibration()

    if False:
        from sklearn.datasets import load_linnerud
        X, y = load_linnerud(return_X_y=True)

    if False:
        X, y, variablesX, variablesY = common.loadYingYingData()

    if False:
        X, y, variablesX, variablesY = common.loadCleaningDataGermanSpecific()
        #X, y, variablesX, variablesY = common.loadCleaningDataGerman()

    if False:
        X, y, variablesX, variablesY = common.loadInsects()

    if False:
        X, y, variablesX, variablesY = common.loadMilkProcessPipesDimensionalAnalysis(
        )
        #X, y, variablesX, variablesY = common.loadMilkProcessPipes()

    if False:  # ecosystem services
        X, y, variablesX, variablesY = common.loadEcosystemServices()

    if False:
        X, y, variablesX, variablesY = common.loadMarcoSoil()

    if False:
        # load dataset
        X, y = common.loadEureqaRegression()
        # randomly split between training and test
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    if False:
        # load dataset
        X_train, X_test, y_train, y_test = common.loadBiscuitExample()
        logging.info("X_train: " + str(X_train.shape))
        logging.info("X_test: " + str(X_test.shape))
        logging.info("y_train: " + str(y_train.shape))
        logging.info("y_test: " + str(y_test.shape))

        # in this particular case, I create the "global" X and y by putting together the two arrays
        X = np.append(X_train, X_test, axis=0)
        y = np.append(y_train, y_test, axis=0)

    if False:
        # load dataset
        X_train, X_test, y_train, y_test = common.loadAromoptiExample()
        logging.info("X_train: " + str(X_train.shape))
        logging.info("X_test: " + str(X_test.shape))
        logging.info("y_train: " + str(y_train.shape))
        logging.info("y_test: " + str(y_test.shape))

        # in this particular case, I create the "global" X and y by putting together the two arrays
        X = np.append(X_train, X_test, axis=0)
        y = np.append(y_train, y_test, axis=0)

    logging.info(
        "Regressing %d output variables, in function of %d input variables..."
        % (y.shape[1], X.shape[1]))

    # if the names of the variables are not specified, let's specify them!
    if variablesY is None:
        variablesY = ["y" + str(i) for i in range(0, len(y[0]))]
    if variablesX is None:
        variablesX = ["X" + str(i) for i in range(0, len(X[0]))]

    performances = dict()

    for variableIndex, variableY in enumerate(variablesY):

        logging.info("** Now evaluating models for variable \"%s\"... **" %
                     variableY)

        # obtain data
        y_ = y[:, variableIndex].ravel()

        # assume train/test indexes from a shuffle split; this also makes the
        # plots easier, since we avoid duplicate values (the same value
        # appearing under two different indexes)
        rs = ShuffleSplit(n_splits=numberOfSplits, random_state=42)
        #rs = LeaveOneOut()

        # initialize performance dictionary of arrays
        performances[variableY] = dict()
        for regressor, regressorName in regressorsList:
            performances[variableY][regressorName] = dict()
            performances[variableY][regressorName]["r^2"] = []
            performances[variableY][regressorName]["e.v"] = []
            performances[variableY][regressorName]["mse"] = []
            performances[variableY][regressorName]["mae"] = []
            performances[variableY][regressorName]["predicted"] = []

        # stores the test values of every fold, in order; maybe there's a smarter way to do this
        foldPointsInOrder = []

        # and now, for every regressor
        for foldIndex, indexes in enumerate(rs.split(X)):

            train_index, test_index = indexes

            X_train = X[train_index]
            y_train = y_[train_index]
            X_test = X[test_index]
            y_test = y_[test_index]

            # normalize
            logging.info("Normalizing data...")
            scalerX = StandardScaler()
            scalerY = StandardScaler()

            X_train = scalerX.fit_transform(X_train)
            X_test = scalerX.transform(X_test)

            # the reshape/ravel round-trip only silences sklearn warnings; it
            # does not change the data
            y_train = scalerY.fit_transform(y_train.reshape(-1, 1)).ravel()
            y_test = scalerY.transform(y_test.reshape(-1, 1)).ravel()

            # now, we store the test points of this fold, in the order they appear
            # (inverse_transform expects a 2-D array in recent scikit-learn)
            foldPointsInOrder.extend(
                list(scalerY.inverse_transform(y_test.reshape(-1, 1)).ravel()))

            for regressorIndex, regressorData in enumerate(regressorsList):

                regressor = regressorData[0]
                regressorName = regressorData[1]

                logging.info("Fold #%d/%d: training regressor #%d/%d \"%s\"" %
                             (foldIndex + 1, numberOfSplits, regressorIndex +
                              1, len(regressorsList), regressorName))

                try:
                    regressor.fit(X_train, y_train)

                    y_test_predicted = regressor.predict(X_test)
                    r2Test = r2_score(y_test, y_test_predicted)
                    mseTest = mean_squared_error(y_test, y_test_predicted)
                    maeTest = mean_absolute_error(y_test, y_test_predicted)
                    varianceTest = explained_variance_score(
                        y_test, y_test_predicted)

                    logging.info("R^2 score (test): %.4f" % r2Test)
                    logging.info("EV score (test): %.4f" % varianceTest)
                    logging.info("MSE score (test): %.4f" % mseTest)
                    logging.info("MAE score (test): %.4f" % maeTest)

                    # add performance to the list of performances
                    performances[variableY][regressorName]["r^2"].append(
                        r2Test)
                    performances[variableY][regressorName]["e.v"].append(
                        varianceTest)
                    performances[variableY][regressorName]["mse"].append(
                        mseTest)
                    performances[variableY][regressorName]["mae"].append(
                        maeTest)
                    # also record the predictions, to be used later in a global figure
                    performances[variableY][regressorName]["predicted"].extend(
                        list(scalerY.inverse_transform(y_test_predicted)))

                    try:
                        import matplotlib.pyplot as plt

                        # plotting first figure, with points 'x' and 'o'
                        y_predicted = regressor.predict(scalerX.transform(
                            X))  # 'X' was never wholly rescaled before
                        y_train_predicted = regressor.predict(X_train)

                        plt.figure()

                        plt.scatter(train_index,
                                    y_train,
                                    c="gray",
                                    label="training data")
                        plt.scatter(test_index,
                                    y_test,
                                    c="green",
                                    label="test data")

                        plt.plot(np.arange(len(y_predicted)),
                                 y_predicted,
                                 'x',
                                 c="red",
                                 label="regression")
                        plt.xlabel("order of data samples")
                        plt.ylabel("target")
                        plt.title(regressorName + ", R^2=%.4f (test)" % r2Test)
                        plt.legend()

                        logging.info("Saving figure...")
                        plt.savefig(
                            os.path.join(
                                folderName, regressorName + "-" + variableY +
                                "-fold-" + str(foldIndex + 1) + ".pdf"))
                        plt.close()

                        # plotting second figure, with everything close to a middle line
                        plt.figure()

                        plt.plot(y_train,
                                 y_train_predicted,
                                 'r.',
                                 label="training set")  # points
                        plt.plot(y_test,
                                 y_test_predicted,
                                 'go',
                                 label="test set")  # points
                        # reference line: ideally, predictions fall on the
                        # identity line (predicted == measured)
                        lims = [
                            min(y_train.min(), y_test.min()),
                            max(y_train.max(), y_test.max())
                        ]
                        plt.plot(lims, lims, 'k--')  # identity line

                        plt.xlabel("measured")
                        plt.ylabel("predicted")
                        plt.title(regressorName + " measured vs predicted, " +
                                  variableY)
                        plt.legend(loc='best')

                        plt.savefig(
                            os.path.join(
                                folderName, regressorName + "-" + variableY +
                                "-fold-" + str(foldIndex + 1) + "-b.pdf"))
                        plt.close()

                        # also, save ordered list of features
                        featuresByImportance = relativeFeatureImportance(
                            regressor)

                        # if the list is non-empty, write feature importances to disk
                        # TODO: crude workaround to avoid issues with GAM
                        if len(featuresByImportance) > 0 and "GAM" not in regressorName:
                            featureImportanceFileName = regressorName + "-" + variableY + "-featureImportance-fold" + str(
                                foldIndex) + ".csv"
                            with open(
                                    os.path.join(folderName,
                                                 featureImportanceFileName),
                                    "w") as fp:
                                fp.write("feature,importance\n")
                                for featureImportance, featureIndex in featuresByImportance:
                                    fp.write(variablesX[int(featureIndex)] +
                                             "," + str(featureImportance) +
                                             "\n")

                    except ImportError:
                        logging.info(
                            "Cannot import matplotlib. Skipping plots...")

                except Exception as e:
                    logging.info("Regressor \"" + regressorName +
                                 "\" failed on variable \"" + variableY +
                                 "\":" + str(e))

    logging.info("Final summary:")
    with open(os.path.join(folderName, "00_summary.txt"), "w") as fp:

        for variableY in variablesY:

            logging.info("For variable \"" + variableY + "\"")
            fp.write("For variable: " + variableY + " = f(" + variablesX[0])
            for i in range(1, len(variablesX)):
                fp.write("," + variablesX[i])
            fp.write(")\n")

            # create a list from the dictionary and sort it
            sortedPerformances = sorted(
                [(performances[variableY][regressorName], regressorName)
                 for regressorName in performances[variableY]],
                key=lambda x: np.mean(x[0]["r^2"]),
                reverse=True)

            for regressorData in sortedPerformances:
                regressorName = regressorData[1]
                regressorScore = regressorData[0]

                r2Mean = np.mean(regressorScore["r^2"])
                r2std = np.std(regressorScore["r^2"])

                varianceMean = np.mean(regressorScore["e.v"])
                varianceStd = np.std(regressorScore["e.v"])

                mseMean = np.mean(regressorScore["mse"])
                mseStd = np.std(regressorScore["mse"])

                maeMean = np.mean(regressorScore["mae"])
                maeStd = np.std(regressorScore["mae"])

                logging.info(
                    "\t- %s, R^2=%.4f (std=%.4f), Explained Variance=%.4f (std=%.4f), MSE=%.4f (std=%.4f), MAE=%.4f (std=%.4f)"
                    % (regressorName, r2Mean, r2std, varianceMean, varianceStd,
                       mseMean, mseStd, maeMean, maeStd))

                fp.write(
                    "\t- %s, R^2=%.4f (std=%.4f), Explained Variance=%.4f (std=%.4f), MSE=%.4f (std=%.4f), MAE=%.4f (std=%.4f)\n"
                    % (regressorName, r2Mean, r2std, varianceMean, varianceStd,
                       mseMean, mseStd, maeMean, maeStd))

                fp.write("\t\t- R^2:" +
                         str(["%.4f" % x
                              for x in regressorScore["r^2"]]) + "\n")
                fp.write("\t\t- E.V.:" +
                         str(["%.4f" % x
                              for x in regressorScore["e.v"]]) + "\n")
                fp.write("\t\t- MSE:" +
                         str(["%.4f" % x
                              for x in regressorScore["mse"]]) + "\n")
                fp.write("\t\t- MAE:" +
                         str(["%.4f" % x
                              for x in regressorScore["mae"]]) + "\n")

                # also, plot a "global" graph
                # caveat: if a regressor failed on some fold, the arrays have
                # mismatched lengths, hence the length check below
                # TODO: the plot looks bad when some values are negative;
                # maybe convert everything to absolute values?
                if len(foldPointsInOrder) == len(regressorScore["predicted"]):
                    fig = plt.figure()
                    ax = fig.add_subplot(111)

                    #bottom_left_corner = [min(foldPointsInOrder), max(foldPointsInOrder)]
                    #top_right_corner = [min(regressorScore["predicted"]), max(regressorScore["predicted"])]
                    x_bottom_top = [0, max(foldPointsInOrder)]
                    y_bottom_top = [0, max(foldPointsInOrder)]

                    ax.plot(foldPointsInOrder, regressorScore["predicted"],
                            'g.')  # points
                    ax.plot(x_bottom_top, y_bottom_top, 'k--',
                            label="1:1")  # line
                    ax.plot(x_bottom_top,
                            [y_bottom_top[0] * 1.20, y_bottom_top[1] * 1.20],
                            'r--',
                            label="20% error")
                    ax.plot(x_bottom_top,
                            [y_bottom_top[0] * 0.80, y_bottom_top[1] * 0.80],
                            'r--')

                    ax.set_title(regressorName + " measured vs predicted, " +
                                 variableY + " (all test)")
                    ax.set_xlabel("measured")
                    ax.set_ylabel("predicted")
                    ax.legend(loc='best')

                    plt.savefig(
                        os.path.join(
                            folderName,
                            regressorName + "-" + variableY + "-global-b.png"))
                    plt.close(fig)
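
# Side note (not part of the original script): the per-fold metric loop above
# can be written much more compactly with scikit-learn's cross_validate.
# A minimal self-contained sketch on synthetic data, with SVR as an example:
from sklearn.datasets import make_regression
from sklearn.model_selection import ShuffleSplit, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

X_demo, y_demo = make_regression(n_samples=100, n_features=5, random_state=42)
scores_demo = cross_validate(
    # scaling inside the pipeline is refit on each training fold,
    # which avoids test-set leakage
    make_pipeline(StandardScaler(), SVR()),
    X_demo, y_demo,
    cv=ShuffleSplit(n_splits=10, random_state=42),
    scoring=("r2", "explained_variance",
             "neg_mean_squared_error", "neg_mean_absolute_error"))
print("SVR R^2=%.4f (std=%.4f)" % (scores_demo["test_r2"].mean(),
                                   scores_demo["test_r2"].std()))
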
parameters = ['teff', 'logg', 'feh']

names = [
    'KNeighbours',
    'Radius Neighbors',
    'Random Forest',
    'Linear Regression',
    'Gaussian Process',
    'Ada Boost',
    'Huber',
    'RANSAC',
    'Theil-Sen',
]
classifiers = [  # note: these are all regressors, despite the variable name
    KNeighborsRegressor(),
    RadiusNeighborsRegressor(),
    RandomForestRegressor(),
    LinearRegression(),
    GaussianProcessRegressor(),
    AdaBoostRegressor(),
    HuberRegressor(),
    RANSACRegressor(),
    TheilSenRegressor()
]

for parameter in parameters:
    print(parameter)
    y_train = train[parameter].tolist()
    y_test = test[parameter].tolist()

    ends = [sp.amin(y_test), sp.amax(y_test)]  # 'sp' is presumably scipy (import not shown in the excerpt)
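    # The excerpt ends here; a hypothetical continuation (sketch only, the
    # 'x_train'/'x_test' feature arrays are assumptions not shown above):
    #
    #   for name, model in zip(names, classifiers):
    #       model.fit(x_train, y_train)
    #       print(name, model.score(x_test, y_test))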
Example #28
# assumed context: 'x' and 'y' are predefined feature/target arrays
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y)

from sklearn.neighbors import KNeighborsRegressor

KNN_reg = KNeighborsRegressor(n_neighbors=6, weights='uniform')

KNN_reg.fit(x_train, y_train)

y_predict_knn = KNN_reg.predict(x_test)

y_predict_knn[0:10]

from sklearn.neighbors import RadiusNeighborsRegressor

# note: predictions are NaN for test points with no training neighbors
# within the chosen radius
RNN_reg = RadiusNeighborsRegressor(radius=x_train.std())

RNN_reg.fit(x_train, y_train)

y_predict_rnn = RNN_reg.predict(x_test)

y_predict_rnn[0:10]

RNN_reg = RadiusNeighborsRegressor()

RNN_reg.fit(x_train, y_train)

RNN_reg.predict(x_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error
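
# Assumed follow-up (not in the original excerpt): score both neighbor models.
# RadiusNeighborsRegressor predicts NaN for test points with no training
# neighbors inside the radius, so those points are filtered out first
# (assuming a 1-D target).
import numpy as np

print("KNN  MAE: %.3f  MSE: %.3f" % (
    mean_absolute_error(y_test, y_predict_knn),
    mean_squared_error(y_test, y_predict_knn)))

mask = ~np.isnan(y_predict_rnn)
print("RNN  MAE: %.3f  MSE: %.3f" % (
    mean_absolute_error(np.asarray(y_test)[mask], y_predict_rnn[mask]),
    mean_squared_error(np.asarray(y_test)[mask], y_predict_rnn[mask])))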
Example #29
def GetAllModelsForComparison(X_train, Y_train):
    # note: the arguments are currently unused; this simply instantiates a
    # large pool of scikit-learn objects (several entries are mixins or
    # transformers rather than fittable models)
    models = {
        'ARDRegression': ARDRegression(),
        'BayesianRidge': BayesianRidge(),
        'ElasticNet': ElasticNet(),
        'ElasticNetCV': ElasticNetCV(),
        'Hinge': Hinge(),
        #'Huber': Huber(),
        'HuberRegressor': HuberRegressor(),
        'Lars': Lars(),
        'LarsCV': LarsCV(),
        'Lasso': Lasso(),
        'LassoCV': LassoCV(),
        'LassoLars': LassoLars(),
        'LassoLarsCV': LassoLarsCV(),
        'LinearRegression': LinearRegression(),
        'Log': Log(),
        'LogisticRegression': LogisticRegression(),
        'LogisticRegressionCV': LogisticRegressionCV(),
        'ModifiedHuber': ModifiedHuber(),
        'MultiTaskElasticNet': MultiTaskElasticNet(),
        'MultiTaskElasticNetCV': MultiTaskElasticNetCV(),
        'MultiTaskLasso': MultiTaskLasso(),
        'MultiTaskLassoCV': MultiTaskLassoCV(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'OrthogonalMatchingPursuitCV': OrthogonalMatchingPursuitCV(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
        'Perceptron': Perceptron(),
        'RANSACRegressor': RANSACRegressor(),
        #'RandomizedLasso': RandomizedLasso(),
        #'RandomizedLogisticRegression': RandomizedLogisticRegression(),
        'Ridge': Ridge(),
        'RidgeCV': RidgeCV(),
        'RidgeClassifier': RidgeClassifier(),
        'SGDClassifier': SGDClassifier(),
        'SGDRegressor': SGDRegressor(),
        'SquaredLoss': SquaredLoss(),
        'TheilSenRegressor': TheilSenRegressor(),
        'BaseEstimator': BaseEstimator(),
        'ClassifierMixin': ClassifierMixin(),
        'LinearClassifierMixin': LinearClassifierMixin(),
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'StandardScaler': StandardScaler(),
        'TransformerMixin': TransformerMixin(),
        'KernelRidge': KernelRidge(),
        'RegressorMixin': RegressorMixin(),
        'LinearSVC': LinearSVC(),
        'LinearSVR': LinearSVR(),
        'NuSVC': NuSVC(),
        'NuSVR': NuSVR(),
        'OneClassSVM': OneClassSVM(),
        'SVC': SVC(),
        'SVR': SVR(),
        #'BallTree': BallTree(),
        #'DistanceMetric': DistanceMetric(),
        #'KDTree': KDTree(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'KNeighborsRegressor': KNeighborsRegressor(),
        'KernelDensity': KernelDensity(),
        #'LSHForest': LSHForest(),
        'LocalOutlierFactor': LocalOutlierFactor(),
        'NearestCentroid': NearestCentroid(),
        'NearestNeighbors': NearestNeighbors(),
        'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
        'RadiusNeighborsRegressor': RadiusNeighborsRegressor(),
        #'GaussianProcess': GaussianProcess(),
        'GaussianProcessRegressor': GaussianProcessRegressor(),
        'GaussianProcessClassifier': GaussianProcessClassifier(),
        'CCA': CCA(),
        'PLSCanonical': PLSCanonical(),
        'PLSRegression': PLSRegression(),
        'PLSSVD': PLSSVD(),
        #'ABCMeta': ABCMeta(),
        #'BaseDiscreteNB': BaseDiscreteNB(),
        #'BaseNB': BaseNB(),
        'BernoulliNB': BernoulliNB(),
        'GaussianNB': GaussianNB(),
        'LabelBinarizer': LabelBinarizer(),
        'MultinomialNB': MultinomialNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'DecisionTreeRegressor': DecisionTreeRegressor(),
        'ExtraTreeClassifier': ExtraTreeClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'BaggingClassifier': BaggingClassifier(),
        'BaggingRegressor': BaggingRegressor(),
        #'BaseEnsemble': BaseEnsemble(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'IsolationForest': IsolationForest(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RandomForestRegressor': RandomForestRegressor(),
        'RandomTreesEmbedding': RandomTreesEmbedding(),
        #'VotingClassifier': VotingClassifier(),
        'MetaEstimatorMixin': MetaEstimatorMixin(),
        #'OneVsOneClassifier': OneVsOneClassifier(),
        #'OneVsRestClassifier': OneVsRestClassifier(),
        #'OutputCodeClassifier': OutputCodeClassifier(),
        'Parallel': Parallel(),
        #'ABCMeta': ABCMeta(),
        #'ClassifierChain': ClassifierChain(),
        #'MultiOutputClassifier': MultiOutputClassifier(),
        #'MultiOutputEstimator': MultiOutputEstimator(),
        #'MultiOutputRegressor': MultiOutputRegressor(),
        'LabelPropagation': LabelPropagation(),
        'LabelSpreading': LabelSpreading(),
        'IsotonicRegression': IsotonicRegression(),
        'BernoulliRBM': BernoulliRBM(),
        'MLPClassifier': MLPClassifier(),
        'MLPRegressor': MLPRegressor()
    }
    return models
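
# A sketch (an assumption, not from the original) of consuming the dictionary:
# many entries are mixins or transformers with no meaningful fit/score for
# regression, so failures are caught and skipped.
def CompareModels(X_train, Y_train, X_test, Y_test):
    for name, model in GetAllModelsForComparison(X_train, Y_train).items():
        try:
            model.fit(X_train, Y_train)
            print("%s: %.4f" % (name, model.score(X_test, Y_test)))
        except Exception as exc:
            print("%s skipped (%s)" % (name, exc))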
Example #30
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

# sklearn random forest regression (and other regressors)
lr = LinearRegression()
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor()
rte = RandomTreesEmbedding()
mr = MLPRegressor(max_iter=1000)
omp = OrthogonalMatchingPursuit()
rr = RANSACRegressor()
tsr = TheilSenRegressor()
br = BayesianRidge(n_iter=300, tol=0.001)  # note: newer scikit-learn renamed n_iter to max_iter
bgm = BayesianGaussianMixture()
knr = KNeighborsRegressor(n_neighbors=5)
rnr = RadiusNeighborsRegressor(radius=1.0)
pr = PLSRegression()
gnb = GaussianNB()
mnb = MultinomialNB()
# estimators = {'LR ':lr,'DTR':dtr,'RFR':rfr,'MR ':mr}
# estimators = {'LR ':lr,'DTR':dtr,'RFR':rfr,'OMP':omp,'RR ':rr, 'BR ':br,'BGM':bgm ,'KNR':knr,'RNR':rnr,'PR ':pr}
estimators = {
    'LR ': lr,
    'DTR': dtr,
    'RFR': rfr,
    'OMP': omp,
    'RR ': rr,
    'BR ': br,
    'KNR': knr,
    'PR ': pr
}
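
# Hypothetical usage of the 'estimators' dictionary above; no data loading is
# shown in this excerpt, so 'X_train', 'y_train', 'X_test', 'y_test' are
# assumptions:
#
#   for label, est in estimators.items():
#       est.fit(X_train, y_train)
#       print(label, "R^2 = %.4f" % est.score(X_test, y_test))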