def resolution_estimate(raw_data, n_spectra=25):
    slopes = []
    intercepts = []
    for i in range(n_spectra):
        mzs, intensities = read_random_spectrum(raw_data)
        peak_positions = np.array(gradient(mzs, intensities)[-1])
        intensities_at_peaks = intensities[peak_positions]
        high_intensity_threshold = np.percentile(intensities_at_peaks, 40)
        peak_positions = peak_positions[intensities_at_peaks > high_intensity_threshold]
        resolutions = []
        for peak_pos in peak_positions:
            resolutions.append(resolution_at_peak(peak_pos, mzs, intensities))
        resolutions = np.array(resolutions)
        mzs = mzs[peak_positions]
        mzs = mzs[resolutions > 0]
        resolutions = resolutions[resolutions > 0]
        ransac = RANSACRegressor()
        ransac.fit(np.log(mzs).reshape((-1,1)), np.log(resolutions).reshape((-1,1)))
        slope = ransac.estimator_.coef_[0][0]
        intercept = ransac.estimator_.intercept_[0]
        slopes.append(slope)
        intercepts.append(intercept)
    slope = np.median(slopes)
    intercept = np.median(intercepts)
    return lambda mz: np.exp(intercept + slope * np.log(mz)) 
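The returned callable models resolving power as a power law in m/z, R(mz) = exp(intercept) * mz**slope, using the median slope and intercept across the per-spectrum RANSAC fits. A minimal usage sketch (read_random_spectrum and resolution_at_peak are helpers assumed to be defined elsewhere in this project):

estimate_resolution = resolution_estimate(raw_data, n_spectra=25)
print(estimate_resolution(500.0))  # predicted resolving power at m/z 500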
Example #2
def ransac_fit(X, y):
    '''
    A robust fit using RANSAC.
    :return: None (prints the fit parameters and shows a plot)
    '''
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.linear_model import LinearRegression, RANSACRegressor
    ransac = RANSACRegressor(LinearRegression(),
                             max_trials=100,
                             min_samples=50,
                             residual_metric=lambda x: np.sum(np.abs(x), axis=1),
                             residual_threshold=5.0,
                             random_state=0)
    ransac.fit(X, y)
    # print the slope, intercept, and related fit data
    print('Slope: %.3f' % ransac.estimator_.coef_[0])
    print('Intercept: %.3f' % ransac.estimator_.intercept_)
    # plot
    inlier_mask = ransac.inlier_mask_
    outlier_mask = np.logical_not(inlier_mask)
    line_X = np.arange(3, 10, 1)
    line_y_ransac = ransac.predict(line_X[:, np.newaxis])
    plt.scatter(X[inlier_mask], y[inlier_mask],
                c='blue', marker='o', label='Inliers')
    plt.scatter(X[outlier_mask], y[outlier_mask],
                c='lightgreen', marker='s', label='Outliers')
    plt.plot(line_X, line_y_ransac, color='red')
    plt.xlabel('Average number of rooms [RM]')
    plt.ylabel('Price in $1000\'s [MEDV]')
    plt.legend(loc='upper left')
    plt.show()
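Note: the residual_metric argument used above was deprecated in scikit-learn 0.18 and removed in 0.20. On current releases the same intent is expressed through the loss parameter; a sketch of the modern equivalent (the loss string was 'absolute_loss' before scikit-learn 1.0):

ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,
                         min_samples=50,
                         loss='absolute_error',  # or a callable loss(y_true, y_pred)
                         residual_threshold=5.0,
                         random_state=0)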
def train_RANSACRegressionModel(
    X,
    y,
    base_estimator=None,
    min_samples=None,
    residual_threshold=None,
    is_data_valid=None,
    is_model_valid=None,
    max_trials=100,
    stop_n_inliers=np.inf,
    stop_score=np.inf,
    stop_probability=0.99,
    residual_metric=None,
    random_state=None,
):
    """
    Train a RANSAC regression model
    """
    model = RANSACRegressor(
        base_estimator=base_estimator,
        min_samples=min_samples,
        residual_threshold=residual_threshold,
        is_data_valid=is_data_valid,
        is_model_valid=is_model_valid,
        max_trials=max_trials,
        stop_n_inliers=stop_n_inliers,
        stop_score=stop_score,
        stop_probability=stop_probability,
        residual_metric=residual_metric,
        random_state=random_state,
    )
    model = model.fit(X, y)
    return model
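A hypothetical call, leaving most arguments at their pass-through defaults (X a 2-D feature array, y a 1-D target; assumes a scikit-learn version that still accepts residual_metric, as the wrapper above does):

model = train_RANSACRegressionModel(X, y, min_samples=0.5, residual_threshold=2.0)
print(model.estimator_.coef_, model.estimator_.intercept_)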
def identify_linear_outliers(pts, win_size=7):
    # this runs a sliding window across the trace, performing a RANSAC regression
    # for each window. A point is considered an outlier if the moving-RANSAC
    # never considers it an inlier.
    regressor = RANSACRegressor()
    x = np.arange(win_size, dtype=np.float64)
    x = np.expand_dims(x, axis=1)
    inlier_count = np.zeros_like(pts)
    npts = len(pts)
    for i in range(npts-win_size+1):
        y = pts[i:i+win_size]

        # RANSAC of this section of the trace
        try:
            regressor.fit(x, y)
            inlier_inds = regressor.inlier_mask_
        except ValueError:  # no consensus -- (almost) all the points were bad
            inlier_inds = []

        # accumulate the number of times each point was an inlier
        for j, inlier in enumerate(inlier_inds):
            if inlier:
                inlier_count[i+j] += 1

    # Note: the following line will always consider the first and last points outliers!
    #       However, I don't think this will matter for downstream analysis.  -BK
    outlier_mask = np.logical_or(inlier_count < 2, pts == 0)
#    outlier_inds = np.where(outlier_mask)[0]
#
#    # points that are exactly zero are always considered outliers
#    outlier_inds = np.append(outlier_inds, np.where(pts==0)[0])
    return outlier_mask
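A small smoke test of the sliding-window idea (synthetic data; assumes numpy and RANSACRegressor are imported as in the snippet):

pts = np.arange(50, dtype=np.float64)
pts[25] = 500.0  # inject a spike
mask = identify_linear_outliers(pts, win_size=7)
print(np.where(mask)[0])  # expect index 25, plus the endpoints noted above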
    def get_outliers_by_ransac(self, table, column_indexes):
        '''
        Get outliers using RANSAC regression, which deals better with large outliers in the y direction,
        and is faster than Huber when the number of samples is very large.
        RANSAC outputs perfect precision (100%) but far from perfect recall (could be 50% - 60%) in our experiments.
        '''
        X = table[:, column_indexes[:-1]].astype(float)
        X = utils.enforce_columns(X)
        y = table[:, column_indexes[-1]].astype(float)

        # preprocessing doesn't make any difference for RANSAC in our experiments
        #x = preprocessing.minmax_scale(x)
        #y = preprocessing.minmax_scale(y)

        model_ransac = RANSACRegressor(LinearRegression())
        model_ransac.fit(X, y)

        inlier_mask = model_ransac.inlier_mask_
        outlier_mask = np.logical_not(inlier_mask)
        outliers = [idx for idx, val in enumerate(outlier_mask) if val]

        residuals = abs(model_ransac.predict(X) - y)
        confidences = preprocessing.minmax_scale(residuals[outliers]) * 0.09 + 0.9  # map into [0.90, 0.99]

        return (outliers, confidences)
Example #6
def test_ransac_stop_n_inliers():
    base_estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2,
                                       residual_threshold=5, stop_n_inliers=2,
                                       random_state=0)
    ransac_estimator.fit(X, y)

    assert_equal(ransac_estimator.n_trials_, 1)
Example #7
def test_ransac_sparse_csc():
    X_sparse = sparse.csc_matrix(X)

    base_estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2, residual_threshold=5, random_state=0)
    ransac_estimator.fit(X_sparse, y)

    ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
    ref_inlier_mask[outliers] = False

    assert_equal(ransac_estimator.inlier_mask_, ref_inlier_mask)
Example #8
def test_ransac_predict():
    X = np.arange(100)[:, None]
    y = np.zeros((100,))
    y[0] = 1
    y[1] = 100

    base_estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2, residual_threshold=0.5, random_state=0)
    ransac_estimator.fit(X, y)

    assert_equal(ransac_estimator.predict(X), np.zeros(100))
Example #9
def test_ransac_score():
    X = np.arange(100)[:, None]
    y = np.zeros((100,))
    y[0] = 1
    y[1] = 100

    base_estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2, residual_threshold=0.5, random_state=0)
    ransac_estimator.fit(X, y)

    assert_equal(ransac_estimator.score(X[2:], y[2:]), 1)
    assert_less(ransac_estimator.score(X[:2], y[:2]), 1)
Example #10
def test_ransac_default_residual_threshold():
    base_estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2, random_state=0)

    # Estimate parameters of corrupted data
    ransac_estimator.fit(X, y)

    # Ground truth / reference inlier mask
    ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
    ref_inlier_mask[outliers] = False

    assert_equal(ransac_estimator.inlier_mask_, ref_inlier_mask)
Example #11
def fit_plane(points):
    '''
    fit a plane through a list of 3d points and return a, b, c, d that represents the plane as ax+by+cz+d=0
    '''
    X = np.array([[p[0], p[1]] for p in points])
    y = [p[2] for p in points]
    model = RANSACRegressor(LinearRegression())
    model.fit(X, y)
    d = list(model.estimator_.intercept_.flatten())[0]
    a, b = list(model.estimator_.coef_.flatten())
    c = -1
    return a, b, c, d
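A quick sanity check on synthetic data: points drawn from z = 2x + 3y + 1 satisfy 2x + 3y - z + 1 = 0, so the fit should recover roughly a=2, b=3, c=-1, d=1:

points = [(x, y, 2 * x + 3 * y + 1) for x in range(5) for y in range(5)]
print(fit_plane(points))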
Example #12
def test_ransac_max_trials():
    base_estimator = LinearRegression()

    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2,
                                       residual_threshold=5, max_trials=0,
                                       random_state=0)
    assert_raises(ValueError, ransac_estimator.fit, X, y)

    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2,
                                       residual_threshold=5, max_trials=11,
                                       random_state=0)
    assert getattr(ransac_estimator, 'n_trials_', None) is None
    ransac_estimator.fit(X, y)
    assert_equal(ransac_estimator.n_trials_, 2)
def regression_information(dem, bilinear_interpolation_results):
    dem_shape = dem.shape
    # print dem_shape
    dem = dem.flatten()
    bilinear_interpolation_results = bilinear_interpolation_results.flatten()
    alt_data = np.column_stack((dem, bilinear_interpolation_results))
    alt_data = alt_data[np.where(alt_data[:, 0] > 0)]
    RANSAC_lr = RANSACRegressor(LinearRegression())
    RANSAC_lr.fit(alt_data[:, 0:1], alt_data[:, 1])
    predict_result = RANSAC_lr.predict(alt_data[:, 0:1]).transpose()[0]
    # print predict_result
    # print predict_result.shape
    residual = bilinear_interpolation_results - predict_result
    residual = np.reshape(residual, dem_shape)
    return RANSAC_lr, residual
Example #14
def test_ransac_multi_dimensional_targets():

    base_estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2, residual_threshold=5, random_state=0)

    # 3-D target values
    yyy = np.column_stack([y, y, y])

    # Estimate parameters of corrupted data
    ransac_estimator.fit(X, yyy)

    # Ground truth / reference inlier mask
    ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
    ref_inlier_mask[outliers] = False

    assert_equal(ransac_estimator.inlier_mask_, ref_inlier_mask)
    def _ransac_regression(pts, regressor):
        ransac = RANSACRegressor(regressor)
        x = np.array([a['peak_size'] for a in pts])
        y = np.array([b['relative_peak_height'] for b in pts])
        X = x[:, np.newaxis]

        ransac.fit(X, y)
        inlier_mask = ransac.inlier_mask_
        ransac_rmse = mean_squared_error(y[inlier_mask], ransac.predict(X[inlier_mask])) ** .5  # RMSE, not MSE
        ransac_r2 = r2_score(y[inlier_mask], ransac.predict(X[inlier_mask]))

        return {
            'intercept': ransac.estimator_.intercept_,
            'r_squared': ransac_r2,
            'slope': ransac.estimator_.coef_[0],
            'sd': ransac_rmse
        }
Example #16
def test_ransac_none_estimator():

    base_estimator = LinearRegression()

    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2, residual_threshold=5, random_state=0)
    ransac_none_estimator = RANSACRegressor(None, 2, 5, random_state=0)

    ransac_estimator.fit(X, y)
    ransac_none_estimator.fit(X, y)

    assert_array_almost_equal(ransac_estimator.predict(X), ransac_none_estimator.predict(X))
Example #17
def test_ransac_fit_sample_weight():
    ransac_estimator = RANSACRegressor(random_state=0)
    n_samples = y.shape[0]
    weights = np.ones(n_samples)
    ransac_estimator.fit(X, y, weights)
    # sanity check
    assert_equal(ransac_estimator.inlier_mask_.shape[0], n_samples)

    ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_
                                   ).astype(np.bool_)
    ref_inlier_mask[outliers] = False
    # check that mask is correct
    assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask)

    # check that fit(X)  = fit([X1, X2, X3],sample_weight = [n1, n2, n3]) where
    #   X = X1 repeated n1 times, X2 repeated n2 times and so forth
    random_state = check_random_state(0)
    X_ = random_state.randint(0, 200, [10, 1])
    y_ = np.ndarray.flatten(0.2 * X_ + 2)
    sample_weight = random_state.randint(0, 10, 10)
    outlier_X = random_state.randint(0, 1000, [1, 1])
    outlier_weight = random_state.randint(0, 10, 1)
    outlier_y = random_state.randint(-1000, 0, 1)

    X_flat = np.append(np.repeat(X_, sample_weight, axis=0),
                       np.repeat(outlier_X, outlier_weight, axis=0), axis=0)
    y_flat = np.ndarray.flatten(np.append(np.repeat(y_, sample_weight, axis=0),
                                np.repeat(outlier_y, outlier_weight, axis=0),
                                          axis=0))
    ransac_estimator.fit(X_flat, y_flat)
    ref_coef_ = ransac_estimator.estimator_.coef_

    sample_weight = np.append(sample_weight, outlier_weight)
    X_ = np.append(X_, outlier_X, axis=0)
    y_ = np.append(y_, outlier_y)
    ransac_estimator.fit(X_, y_, sample_weight)

    assert_almost_equal(ransac_estimator.estimator_.coef_, ref_coef_)

    # check that if base_estimator.fit doesn't support
    # sample_weight, raises error
    base_estimator = Lasso()
    ransac_estimator = RANSACRegressor(base_estimator)
    assert_raises(ValueError, ransac_estimator.fit, X, y, weights)
Example #18
File: sk.py Project: SingMao/OneRobot
    def fit(self, angs, pts):
        print(angs.shape)
        print(pts.shape)

        model1 = RANSACRegressor(LinearRegression())
        model2 = RANSACRegressor(LinearRegression())
        model1.fit(angs[:,[0]], pts[:,0])
        model2.fit(angs[:,[2]], pts[:,1])

        self.m1, self.b1 = float(model1.estimator_.coef_), model1.estimator_.intercept_
        self.m2, self.b2 = float(model2.estimator_.coef_), model2.estimator_.intercept_
        print('Coefficients :')
        print(self.m1, self.b1, self.m2, self.b2)
Example #19
def test_ransac_max_trials():
    base_estimator = LinearRegression()

    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2,
                                       residual_threshold=5, max_trials=0,
                                       random_state=0)
    assert_raises(ValueError, ransac_estimator.fit, X, y)

    # there is a ~1e-9 chance it will take this many trials. No good reason;
    # 1e-2 isn't enough, failures can still happen at that level.
    # 2 is what RANSAC uses as the default min_samples: X.shape[1] + 1
    max_trials = _dynamic_max_trials(
        len(X) - len(outliers), X.shape[0], 2, 1 - 1e-9)
    ransac_estimator = RANSACRegressor(base_estimator, min_samples=2)
    for i in range(50):
        ransac_estimator.set_params(min_samples=2, random_state=i)
        ransac_estimator.fit(X, y)
        assert_less(ransac_estimator.n_trials_, max_trials + 1)
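For reference, _dynamic_max_trials implements the standard RANSAC trial bound: to draw at least one all-inlier sample with probability p when the inlier ratio is w and each sample has n points, roughly

    N = ceil(log(1 - p) / log(1 - w**n))

trials are needed; the test evaluates this with p = 1 - 1e-9, w = (len(X) - len(outliers)) / X.shape[0], and n = 2.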
pd.DataFrame(data).describe()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.metrics import mean_squared_error

x_tr, x_te, y_tr, y_te = train_test_split(data, target, test_size=0.2)

lr = LinearRegression()
lr.fit(x_tr, y_tr)

mean_squared_error(lr.predict(x_tr), y_tr)

mean_squared_error(lr.predict(x_te), y_te)

Rr = RANSACRegressor()
Rr.fit(x_tr, y_tr)

mean_squared_error(Rr.predict(x_tr), y_tr)

mean_squared_error(Rr.predict(x_te), y_te)

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

dr = DecisionTreeRegressor(max_features='sqrt')
dr.fit(x_tr, y_tr)

mean_squared_error(dr.predict(x_tr), y_tr)

mean_squared_error(dr.predict(x_te), y_te)
Example #21
def test_ransac_residual_loss():
    loss_multi1 = lambda y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1)
    loss_multi2 = lambda y_true, y_pred: np.sum((y_true - y_pred) ** 2, axis=1)

    loss_mono = lambda y_true, y_pred : np.abs(y_true - y_pred)
    yyy = np.column_stack([y, y, y])

    base_estimator = LinearRegression()
    ransac_estimator0 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0)
    ransac_estimator1 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0,
                                        loss=loss_multi1)
    ransac_estimator2 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0,
                                        loss=loss_multi2)

    # multi-dimensional
    ransac_estimator0.fit(X, yyy)
    ransac_estimator1.fit(X, yyy)
    ransac_estimator2.fit(X, yyy)
    assert_array_almost_equal(ransac_estimator0.predict(X),
                              ransac_estimator1.predict(X))
    assert_array_almost_equal(ransac_estimator0.predict(X),
                              ransac_estimator2.predict(X))

    # one-dimensional
    ransac_estimator0.fit(X, y)
    ransac_estimator2.loss = loss_mono
    ransac_estimator2.fit(X, y)
    assert_array_almost_equal(ransac_estimator0.predict(X),
                              ransac_estimator2.predict(X))
    ransac_estimator3 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0,
                                        loss="squared_loss")
    ransac_estimator3.fit(X, y)
    assert_array_almost_equal(ransac_estimator0.predict(X),
                              ransac_estimator2.predict(X))
X = df[['RM']].values
y = df['MEDV'].values
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
X_std = sc_x.fit_transform(X)
y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()  # StandardScaler expects a 2-D array

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor
import numpy as np

ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,
                         min_samples=50,
                         residual_metric=lambda x: np.sum(np.abs(x), axis=1),
                         residual_threshold=5.0,
                         random_state=0)

ransac.fit(X, y)

inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)
line_X = np.arange(3, 10, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])

plt.scatter(X[inlier_mask],
            y[inlier_mask],
            c='blue',
            marker='o',
            label='Inliers')
Example #24
y_test = np.sin(X_test)
X_test = X_test[:, np.newaxis]

y_errors = y.copy()
y_errors[::3] = 3
X_errors = X.copy()
X_errors[::3] = 3

y_errors_large = y.copy()
y_errors_large[::3] = 10
X_errors_large = X.copy()
X_errors_large[::3] = 10

estimators = [('OLS', LinearRegression()),
              ('Theil-Sen', TheilSenRegressor(random_state=42)),
              ('RANSAC', RANSACRegressor(random_state=42)),
              ('HuberRegressor', HuberRegressor())]
colors = {
    'OLS': 'turquoise',
    'Theil-Sen': 'gold',
    'RANSAC': 'lightgreen',
    'HuberRegressor': 'black'
}
linestyle = {
    'OLS': '-',
    'Theil-Sen': '-.',
    'RANSAC': '--',
    'HuberRegressor': '--'
}
lw = 3
Example #25
    def ransac_regressor(self):
        x_train, x_test, y_train, y_test = self.preprocessing()
        model = RANSACRegressor()
        y_pred = model.fit(x_train, y_train).predict(x_test)
        self.printing(y_test, y_pred, 'RANSAC')
Example #26
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import RANSACRegressor
from boston_dataset import BostonDataset

boston = BostonDataset()
RANSAC = RANSACRegressor()

df = boston.df

# Feature matrix, target vector
X = df['RM'].values.reshape(-1, 1)
y = df['MEDV'].values

# Fit model
RANSAC.fit(X, y)

# [boolean]
inlier_mask = RANSAC.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

line_X = np.arange(3, 10, 1)
# [3, 4, 5, 6, 7, 8, 9] rooms

line_y_ransac = RANSAC.predict(line_X.reshape(-1, 1))


def test():
    # print(inlier_mask)
    pass
Example #27
t_pca = [t0_pca, t1_pca]

min_curvature = np.empty([2, 3])
max_curvature = np.empty([2, 3])
saddle = np.empty([2, 3])

for objIndex in [0, 1]:
    print "Time: {}".format(timer() - start)
    print "Fit polynomial"

    if regression_method.lower() in TS_SPECIFIERS:
        model = Pipeline([('poly', PolynomialFeatures(degree=order)),
                          ('regr', TheilSenRegressor())])
    elif regression_method.lower() in RANSAC_SPECIFIERS:
        model = Pipeline([('poly', PolynomialFeatures(degree=order)),
                          ('regr', RANSACRegressor())])
    else:
        model = Pipeline([('poly', PolynomialFeatures(degree=order)),
                          ('regr', LinearRegression(fit_intercept=False))])

    # z as a response to x, y (coords in pca, z-3rd component)
    model = model.fit(p_pca[objIndex][:, :2], p_pca[objIndex][:, 2])

    # coefficients of the polynomial
    if regression_method.lower() in RANSAC_SPECIFIERS:
        C = model.named_steps['regr'].estimator_.coef_
    else:
        C = model.named_steps['regr'].coef_

    print "Coefficients: "
    print C
Example #28
            Works on both linear and non-linear datasets; see the official docs for the algorithm details.
        2. TheilSen, a generalized-median estimator:
            suited to small datasets and to moderate outliers in X; once the number of features grows
            past a point it does no better than ordinary least squares.
            It can tolerate up to 29% of X being corrupted.
        3. Huber: samples whose loss exceeds the linear regime are treated as outliers.
            Fastest when the number of samples far exceeds the number of features.

        If in doubt, just use RANSAC....
'''

rg_1 = RANSACRegressor(base_estimator=None,
                       min_samples=None,
                       residual_threshold=None,
                       is_data_valid=None,
                       is_model_valid=None,
                       max_trials=100,
                       stop_n_inliers=np.inf,
                       stop_score=np.inf,
                       stop_probability=0.99,
                       residual_metric=None,
                       loss='absolute_loss',
                       random_state=None)
rg_2 = TheilSenRegressor(fit_intercept=True,
                         copy_X=True,
                         max_subpopulation=10000.0,
                         n_subsamples=None,
                         max_iter=300,
                         tol=0.001,
                         random_state=None,
                         n_jobs=1,
                         verbose=False)
rg_3 = HuberRegressor(epsilon=1.35,
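A minimal side-by-side sketch of the three robust estimators on contaminated synthetic data (the data and parameters here are illustrative assumptions, not part of the snippet above):

import numpy as np
from sklearn.linear_model import RANSACRegressor, TheilSenRegressor, HuberRegressor

rng = np.random.RandomState(0)
X = rng.uniform(0, 10, size=(200, 1))
y = 3.0 * X.ravel() + 1.0 + rng.normal(0, 0.5, 200)
y[::10] += 30  # corrupt 10% of the targets

for name, reg in [('RANSAC', RANSACRegressor(random_state=0)),
                  ('TheilSen', TheilSenRegressor(random_state=0)),
                  ('Huber', HuberRegressor())]:
    reg.fit(X, y)
    coef = reg.estimator_.coef_ if name == 'RANSAC' else reg.coef_
    print(name, coef)  # all three should recover a slope near 3.0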
Example #29
    def get_algorithm(self):
        '''
        Inputs:
            algorithm (string)  - Name of the regressor to run.  Follows Sklearn naming conventions.
                                    Available keys: ARDRegression | AdaBoostRegressor | BaggingRegressor | BayesianRidge | CCA
                                                    DecisionTreeRegressor | ElasticNet | ExtraTreeRegressor
                                                    ExtraTreesRegressor | GaussianProcessRegressor | GradientBoostingRegressor
                                                    HuberRegressor | KNeighborsRegressor | KernelRidge | Lars | Lasso
                                                    LassoLars | LinearRegression | LinearSVR | MLPRegressor | NuSVR | 
                                                    OrthogonalMatchingPursuit | PLSCanonical | PLSRegression | 
                                                    PassiveAggressiveRegressor | RANSACRegressor | RandomForestRegressor | 
                                                    Ridge | SGDRegressor | SVR | TheilSenRegressor | TransformedTargetRegressor

                                    Currently not supporting: ElasticNetCV | LarsCV | LassoCV | LassoLarsCV | LassoLarsIC | 
                                                    MultiTaskElasticNet | MultiTaskElasticNetCV | MultiTaskLasso | MultiTaskLassoCV |
                                                    OrthogonalMatchingPursuitCV | RidgeCV | RadiusNeighborsRegressor
        Outputs:

        Notes:
            Scoring Metrics: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
        '''
        if (self.algorithmName == "ARDRegression"): algorithm = ARDRegression()
        elif (self.algorithmName == "AdaBoostRegressor"):
            algorithm = AdaBoostRegressor()
        elif (self.algorithmName == "BaggingRegressor"):
            algorithm = BaggingRegressor()
        elif (self.algorithmName == "BayesianRidge"):
            algorithm = BayesianRidge()
        elif (self.algorithmName == "CCA"):
            algorithm = CCA()
        elif (self.algorithmName == "DecisionTreeRegressor"):
            algorithm = DecisionTreeRegressor()
        elif (self.algorithmName == "ElasticNet"):
            algorithm = ElasticNet()
        elif (self.algorithmName == "ExtraTreeRegressor"):
            algorithm = ExtraTreeRegressor()
        elif (self.algorithmName == "ExtraTreesRegressor"):
            algorithm = ExtraTreesRegressor()
        elif (self.algorithmName == "GaussianProcessRegressor"):
            algorithm = GaussianProcessRegressor()
        elif (self.algorithmName == "GradientBoostingRegressor"):
            algorithm = GradientBoostingRegressor()
        elif (self.algorithmName == "HuberRegressor"):
            algorithm = HuberRegressor()
        elif (self.algorithmName == "KNeighborsRegressor"):
            algorithm = KNeighborsRegressor()
        elif (self.algorithmName == "KernelRidge"):
            algorithm = KernelRidge()
        elif (self.algorithmName == "Lars"):
            algorithm = Lars()
        elif (self.algorithmName == "Lasso"):
            algorithm = Lasso()
        elif (self.algorithmName == "LassoLars"):
            algorithm = LassoLars()
        elif (self.algorithmName == "LinearRegression"):
            algorithm = LinearRegression()
        elif (self.algorithmName == "LinearSVR"):
            algorithm = LinearSVR()
        elif (self.algorithmName == "MLPRegressor"):
            algorithm = MLPRegressor()
        elif (self.algorithmName == "NuSVR"):
            algorithm = NuSVR()
        elif (self.algorithmName == "OrthogonalMatchingPursuit"):
            algorithm = OrthogonalMatchingPursuit()
        elif (self.algorithmName == "PLSCanonical"):
            algorithm = PLSCanonical()
        elif (self.algorithmName == "PLSRegression"):
            algorithm = PLSRegression()
        elif (self.algorithmName == "PassiveAggressiveRegressor"):
            algorithm = PassiveAggressiveRegressor()
        elif (self.algorithmName == "RANSACRegressor"):
            algorithm = RANSACRegressor()
        elif (self.algorithmName == "RandomForestRegressor"):
            algorithm = RandomForestRegressor()
        elif (self.algorithmName == "Ridge"):
            algorithm = Ridge()
        elif (self.algorithmName == "SGDRegressor"):
            algorithm = SGDRegressor()
        elif (self.algorithmName == "SVR"):
            algorithm = SVR()
        elif (self.algorithmName == "TheilSenRegressor"):
            algorithm = TheilSenRegressor()
        elif (self.algorithmName == "TransformedTargetRegressor"):
            algorithm = TransformedTargetRegressor()
        else:
            return None

        return algorithm
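The if/elif chain above can be collapsed into a dict lookup, mirroring the model_map pattern in the next example (a sketch under the same imports; name stands in for self.algorithmName, and only a few entries are shown):

algorithm_map = {'ARDRegression': ARDRegression, 'AdaBoostRegressor': AdaBoostRegressor,
                 'RANSACRegressor': RANSACRegressor, 'TheilSenRegressor': TheilSenRegressor}
cls = algorithm_map.get(name)
algorithm = cls() if cls is not None else None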
Example #30
def get_model_from_name(model_name, training_params=None, is_hp_search=False):
    global keras_imported

    # For Keras
    epochs = 1000
    # if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
    #     print('Heard that this is the test suite. Limiting number of epochs, which will increase training speed dramatically at the expense of model accuracy')
    #     epochs = 100

    all_model_params = {
        'LogisticRegression': {},
        'RandomForestClassifier': {
            'n_jobs': -2,
            'n_estimators': 30
        },
        'ExtraTreesClassifier': {
            'n_jobs': -1
        },
        'AdaBoostClassifier': {},
        'SGDClassifier': {
            'n_jobs': -1
        },
        'Perceptron': {
            'n_jobs': -1
        },
        'LinearSVC': {
            'dual': False
        },
        'LinearRegression': {
            'n_jobs': -2
        },
        'RandomForestRegressor': {
            'n_jobs': -2,
            'n_estimators': 30
        },
        'LinearSVR': {
            'dual': False,
            'loss': 'squared_epsilon_insensitive'
        },
        'ExtraTreesRegressor': {
            'n_jobs': -1
        },
        'MiniBatchKMeans': {
            'n_clusters': 8
        },
        'GradientBoostingRegressor': {
            'learning_rate': 0.1,
            'warm_start': True
        },
        'GradientBoostingClassifier': {
            'learning_rate': 0.1,
            'warm_start': True
        },
        'SGDRegressor': {
            'shuffle': False
        },
        'PassiveAggressiveRegressor': {
            'shuffle': False
        },
        'AdaBoostRegressor': {},
        'LGBMRegressor': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'LGBMClassifier': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'DeepLearningRegressor': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'DeepLearningClassifier': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    # if os.environ.get('is_test_suite', 0) == 'True':
    #     all_model_params

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if is_hp_search == True:
        if model_name[:12] == 'DeepLearning':
            model_params['epochs'] = 50
        if model_name[:4] == 'LGBM':
            model_params['n_estimators'] = 500

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print(
            'After overwriting our defaults with your values, here are the final params that will be used to initialize the model:'
        )
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans(),
    }

    try:
        model_map['SGDClassifier'] = SGDClassifier(max_iter=1000, tol=0.001)
        model_map['Perceptron'] = Perceptron(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(
            max_iter=1000, tol=0.001)
        model_map['SGDRegressor'] = SGDRegressor(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor(
            max_iter=1000, tol=0.001)
    except TypeError:
        model_map['SGDClassifier'] = SGDClassifier()
        model_map['Perceptron'] = Perceptron()
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(
        )
        model_map['SGDRegressor'] = SGDRegressor()
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor()

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor(
            calc_feature_importance=True)
        model_map['CatBoostClassifier'] = CatBoostClassifier(
            calc_feature_importance=True)

    if model_name[:12] == 'DeepLearning':
        if keras_imported == False:
            # Suppress some level of logs if TF is installed (but allow it to not be installed, and use Theano instead)
            try:
                os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                from tensorflow import logging
                logging.set_verbosity(logging.INFO)
            except:
                pass

            global maxnorm
            global Dense, Dropout
            global LeakyReLU, PReLU, ThresholdedReLU, ELU
            global Sequential
            global keras_load_model
            global regularizers, optimizers
            global Activation
            global KerasRegressor, KerasClassifier

            from keras.constraints import maxnorm
            from keras.layers import Activation, Dense, Dropout
            from keras.layers.advanced_activations import LeakyReLU, PReLU, ThresholdedReLU, ELU
            from keras.models import Sequential
            from keras.models import load_model as keras_load_model
            from keras import regularizers, optimizers
            from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
            keras_imported = True

        model_map['DeepLearningClassifier'] = KerasClassifier(
            build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(
            build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print(
            'It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize'
        )
        raise

    if os.environ.get('is_test_suite', False) == 'True':
        if 'n_jobs' in model_params:
            model_params['n_jobs'] = 1
    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
Example #31
def fit_linreg_robust(x,
                      y,
                      mask=None,
                      intercept=False,
                      r2=True,
                      est_method='rlm'):
    """Apply robust linear regression of y w.r.t x.

    Arguments
    ---------
    x: :class:`~numpy.ndarray` or sparse `csr_matrix`
        A vector of independent variables.
    y: :class:`~numpy.ndarray` or sparse `csr_matrix`
        A vector of dependent variables.
    intercept: bool
        If using the steady-state assumption for fitting, then:
        True -- the linear regression is performed with an unfixed intercept;
        False -- the linear regression is performed with a fixed zero intercept.
    est_method: str (default: `rlm`)
        The linear regression estimation method that will be used.

    Returns
    -------
    k: float
        The estimated slope.
    b: float
        The estimated intercept.
    r2: float
        Coefficient of determination or r square calculated with the extreme data points.
    all_r2: float
        The r2 calculated using all data points.
    """

    x = x.A if issparse(x) else x
    y = y.A if issparse(y) else y

    _mask = np.logical_and(~np.isnan(x), ~np.isnan(y))
    if mask is not None:
        _mask &= mask
    xx = x[_mask]
    yy = y[_mask]

    try:
        if est_method.lower() == 'rlm':
            xx_ = sm.add_constant(xx) if intercept else xx
            res = sm.RLM(yy, xx_).fit()
            k, b = res.params[::-1] if intercept else (res.params[0], 0)
        elif est_method.lower() == 'ransac':
            reg = RANSACRegressor(LinearRegression(fit_intercept=intercept),
                                  random_state=0)
            reg.fit(xx.reshape(-1, 1), yy.reshape(-1, 1))
            k, b = reg.estimator_.coef_[0, 0], (reg.estimator_.intercept_[0]
                                                if intercept else 0)
        else:
            raise NotImplementedError(
                f"estimation method {est_method} is not implemented. "
                f"Currently supported linear regression methods include `rlm` and `ransac`."
            )
    except Exception:
        if intercept:
            ym = np.mean(yy)
            xm = np.mean(xx)

            cov = np.mean(xx * yy) - xm * ym
            var_x = np.mean(xx * xx) - xm * xm
            k = cov / var_x
            b = ym - k * xm
            # # assume b is always positive
            # if b < 0:
            #     k, b = np.mean(xx * yy) / np.mean(xx * xx), 0
        else:
            # use uncentered cov and var_x
            cov = np.mean(xx * yy)
            var_x = np.mean(xx * xx)
            k = cov / var_x
            b = 0

    if r2:
        SS_tot_n, all_SS_tot_n = np.var(yy), np.var(y)
        SS_res_n, all_SS_res_n = (
            np.mean((yy - k * xx - b)**2),
            np.mean((y - k * x - b)**2),
        )
        r2, all_r2 = 1 - SS_res_n / SS_tot_n, 1 - all_SS_res_n / all_SS_tot_n

        return k, b, r2, all_r2
    else:
        return k, b
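A hypothetical call exercising the RANSAC branch (synthetic data; assumes numpy as np and the module-level imports used by the snippet):

x = np.linspace(1, 10, 100)
y = 2.0 * x + 1.0 + np.random.normal(0, 0.1, 100)
y[::10] += 25  # inject outliers
k, b, r2, all_r2 = fit_linreg_robust(x, y, intercept=True, est_method='ransac')
print(k, b)  # slope near 2.0, intercept near 1.0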
Example #32
def main():
    regression_name=sys.argv[1]
    datapath=sys.argv[2]

    if(datapath=='housing.data.txt'):
        df = pd.read_csv('housing.data.txt',
                 header=None,
                 sep='\s+')

        df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS',
              'NOX', 'RM', 'AGE', 'DIS', 'RAD',
              'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

        X=df.iloc[:,:-1]
        y=df['MEDV'].values

    else:
        df=pd.read_csv('all_breakdown.csv')
        df=df.fillna(0)
        X=df.iloc[:,1:-1]
        y=df['WIND TOTAL'].values

    y2d = y[:, np.newaxis]  # change one-dimensional array to two dimensions
    sc_x = StandardScaler()
    sc_y = StandardScaler()
    sc_x.fit(X)
    sc_y.fit(y2d)  # StandardScaler expects a 2-D array
    x_std = sc_x.transform(X)
    y_std = sc_y.transform(y2d).flatten()

    X_train, X_test, y_train, y_test = train_test_split(x_std, y_std, test_size=0.3, random_state=0)
    if (regression_name=="Linear"):
        model = LinearRegression()
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        print('Linear Regression')
        print('Slope : %.3f ' % model.coef_[0])
        print('Intercept : %.3f' % model.intercept_)
        print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
        print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)));

    elif (regression_name=="RANSAC"):
        ransac = RANSACRegressor(LinearRegression(), max_trials=100, min_samples=50,
                                 loss='absolute_loss', residual_threshold=5.0, random_state=1)
        ransac.fit(X_train, y_train)
        print('RANSAC Regressor')
        print('Slope : %.3f ' % ransac.estimator_.coef_[0])
        print('Intercept : %.3f' % ransac.estimator_.intercept_)
#        print( 'Score of the prediction: %.3f' %ransac.score(X_test,y_test))
        y_train_pred = ransac.predict(X_train)
        y_test_pred = ransac.predict(X_test)
        print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
        print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)));

    elif (regression_name=="Ridge"):
        ridge = Ridge(alpha=1.0)
        ridge.fit(X_train, y_train)
        y_train_pred = ridge.predict(X_train)
        y_test_pred = ridge.predict(X_test)
        print('Ridge Regularization')
        print('Slope : %.3f' % ridge.coef_[0])
        print('Intercept : %.3f' % ridge.intercept_)
        print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
        print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)));

    elif (regression_name=="Lasso"):
        lasso = Lasso(alpha=1.0)
        lasso.fit(X_train, y_train)
        y_train_pred = lasso.predict(X_train)
        y_test_pred = lasso.predict(X_test)
        print('Lasso Regularization')
        print('Slope : %.3f ' % lasso.coef_[0])
        print('Intercept : %.3f' % lasso.intercept_)

        print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
        print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)));

    elif (regression_name=="Nonlinear"):
        tree = DecisionTreeRegressor(max_depth=3)
        tree.fit(X_train, y_train)
        y_test_pred = tree.predict(X_test)
        y_train_pred = tree.predict(X_train)
        print('Non linear Regression - Decision Tree Regressor')
        print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
        print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)));

    elif (regression_name=="Normal"):
        if(datapath=='housing.data.txt'):
            onevec = np.ones((X_train.shape[0]))  # this generates a 1-dimensional array
            onevec = onevec[:, np.newaxis]  # changes the 1-dimensional array to a 2-dimensional array
            Xb = np.hstack((onevec, X_train))  # Xb is a 2-dimensional array
            w = np.zeros(X_train.shape[1])
            z = np.linalg.inv(np.dot(Xb.T, Xb))
            w = np.dot(z, np.dot(Xb.T, y_train))
            print('Normal Equation Solution')
            print('Slope: %.3f' % w[1])
            print('Intercept : %.3f' % w[0])
            yhat = np.dot(Xb, w.T)
            print('MSE train: %.3f,' % mean_squared_error(y_train, yhat))
        else:
            print('Not Applicable');
    else:
        print ("No regression available with the given name");


    print("--- Time taken is %s seconds ---" % (time.time() - start_time))
Example #33
def fit_RANSAC(features_train, labels_train, features_pred):
    model = RANSACRegressor()
    model.fit(features_train, labels_train)
    labels_pred = model.predict(features_pred)
    print("RANSAC - coefficient of determination R^2 of the prediction: ", model.score(features_train, labels_train))
    return labels_pred
Example #34
File: errprob.py Project: snowdj/LMP

#	

from sklearn.linear_model import TheilSenRegressor
X = np.reshape(star.temp.array,(-1,1))
y = star.light
reg = TheilSenRegressor(random_state=0).fit(X,y)
plt.scatter(star.temp, star.light)
plt.plot(xr, reg.intercept_ + reg.coef_*xr,'k-')


#	

from sklearn.linear_model import  RANSACRegressor
reg = RANSACRegressor().fit(X,y)
i = reg.inlier_mask_
plt.scatter(star.temp[i], star.light[i])
plt.scatter(star.temp[~i], star.light[~i],marker='x')
plt.plot(xr, reg.estimator_.intercept_ + reg.estimator_.coef_*xr,
    'k-')


# ## Exercises

# ## Packages Used

import sys
import matplotlib
import statsmodels as sm
import seaborn as sns
Example #35
def test_ransac_residual_metric():
    residual_metric1 = lambda dy: np.sum(np.abs(dy), axis=1)
    residual_metric2 = lambda dy: np.sum(dy ** 2, axis=1)

    yyy = np.column_stack([y, y, y])

    base_estimator = LinearRegression()
    ransac_estimator0 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0)
    ransac_estimator1 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0,
                                        residual_metric=residual_metric1)
    ransac_estimator2 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0,
                                        residual_metric=residual_metric2)

    # multi-dimensional
    ransac_estimator0.fit(X, yyy)
    ransac_estimator1.fit(X, yyy)
    ransac_estimator2.fit(X, yyy)
    assert_equal(ransac_estimator0.predict(X), ransac_estimator1.predict(X))
    assert_equal(ransac_estimator0.predict(X), ransac_estimator2.predict(X))

    # one-dimensional
    ransac_estimator0.fit(X, y)
    ransac_estimator2.fit(X, y)
    assert_equal(ransac_estimator0.predict(X), ransac_estimator2.predict(X))
Example #36
    def get_ransac(self, x, y):  # RANSAC regressor
        ransac = RANSACRegressor(LinearRegression(), residual_threshold=5)
        ransac.fit(x, y)
        return ransac
Example #37
    ax.plot(xvals, xvals, 'k--')
    ax.set_xlim([-125, 60])
    ax.set_ylim([-125, 60])
    ax.set_aspect('equal')
    ax.text(0.5,
            1.05,
            'y = {0:0.3f}x + {1:0.3f}'.format(perform_dstBiot['Slope'],
                                              perform_dstBiot['Intercept']),
            horizontalalignment='center',
            transform=ax.transAxes)
    ax.set_xlabel('Sym-H (Observed) [nT]')
    ax.set_ylabel('Sym-H (Modeled) [nT]')
    if False:
        from sklearn.linear_model import TheilSenRegressor, RANSACRegressor
        #tsreg = TheilSenRegressor(random_state=77)
        reg = RANSACRegressor(random_state=77)
        reg.fit(kyotodata['sym-h'][:, np.newaxis], data['dstBiot'][:,
                                                                   np.newaxis])
        y_pred = reg.predict(xvals[:, np.newaxis])
        ax.plot(xvals, y_pred, 'm-', label='RANSAC')
        ax.legend()
    plt.tight_layout()
    plt.savefig('Jan2005_DstCompare_scatter_new.png')

    #now write metrics to log
    logging.info('=================')
    logging.info('===PERFORMANCE===')
    logging.info('=================')
    logging.info('N_points: {}\n'.format(len(data['dstBiot'])))

    if useBiot:
from sklearn import datasets
california = datasets.fetch_california_housing()
X, y = california.data, california.target

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
regressors = [
    LinearRegression(),
    RANSACRegressor(),
    KNeighborsRegressor(),
    KNeighborsRegressor(n_neighbors=9, metric='manhattan'),
    SVR(),
    LinearSVR(),
    SVR(kernel='linear'
        ),  # Cf. LinearSVR: much slower, might be better or worse:
    GaussianProcessRegressor(),
]

from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from time import time

for model in regressors:
    # the loop body is truncated in the source; a plausible completion, given
    # the metric and time imports above, is to time each fit and score the test set
    t0 = time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(type(model).__name__,
          'time: %.2fs' % (time() - t0),
          'EV: %.3f' % explained_variance_score(y_test, y_pred),
          'MAE: %.3f' % mean_absolute_error(y_test, y_pred),
          'R2: %.3f' % r2_score(y_test, y_pred))
def main():
    parser = argparse.ArgumentParser(
        description=
        'Large-scale Point Cloud Semantic Segmentation with Superpoint Graphs')

    parser.add_argument('--ROOT_PATH', default='datasets/airborne_lidar')
    parser.add_argument('--dataset', default='airborne_lidar')
    # parameters
    parser.add_argument(
        '--compute_geof',
        default=1,
        type=int,
        help='compute hand-crafted features of the local geometry')
    parser.add_argument(
        '--k_nn_local',
        default=20,
        type=int,
        help='number of neighbors to describe the local geometry')
    parser.add_argument('--k_nn_adj',
                        default=5,
                        type=int,
                        help='number of neighbors for the adjacency graph')
    parser.add_argument('--voxel_width',
                        default=0.03,
                        type=float,
                        help='voxel size when subsampling (in m)')
    parser.add_argument('--plane_model',
                        default=1,
                        type=int,
                        help='uses a simple plane model to derive elevation')
    parser.add_argument(
        '--use_voronoi',
        default=0.0,
        type=float,
        help=
        'uses the Voronoi graph in combination to knn to build the adjacency graph, '
        'useful for sparse aquisitions. If 0., do not use voronoi. '
        'If >0, then is the upper length limit for an edge to be kept. ')
    parser.add_argument('--ver_batch',
                        default=5000000,
                        type=int,
                        help='batch size for reading large files')
    args = parser.parse_args()

    # path to data
    if args.ROOT_PATH[-1] == '/':
        root = args.ROOT_PATH
    else:
        root = args.ROOT_PATH + '/'

    if not os.path.exists(root + 'features_supervision'):
        os.makedirs(root + 'features_supervision')

    # list of subfolders to be processed
    if args.dataset == 'airborne_lidar':
        folders = ["trn/", "val/", "tst/"]
        n_labels = 4
    else:
        raise ValueError('%s is an unknown data set' % args.dataset)

    pruning = args.voxel_width > 0
    # ------------------------------------------------------------------------------
    for folder in folders:
        print("=================\n   " + folder + "\n=================")
        data_folder = root + folder
        output_folder = root + "features_supervision/" + folder

        if not os.path.isdir(data_folder):
            raise ValueError(f"{folder} does not exist")

        if not os.path.isdir(output_folder):
            os.mkdir(output_folder)

        if args.dataset == 'airborne_lidar':
            files = glob.glob(data_folder + "*.las")

        if len(files) == 0:
            raise ValueError(f"{folder} is empty")

        n_files = len(files)
        i_file = 0
        for file in files:
            file_name = os.path.splitext(os.path.basename(file))[0]
            data_file = f"{data_folder}{file_name}.las"
            str_file = f"{output_folder}{file_name}.h5"
            i_file = i_file + 1
            print(f"{i_file} / {n_files}---> {file_name}")
            if os.path.isfile(str_file):
                print(
                    "    graph structure already computed - delete for update..."
                )
            else:
                # --- build the geometric feature file h5 file ---
                print("    computing graph structure...")
                # --- read the data files and compute the labels---
                if args.dataset == 'airborne_lidar':
                    xyz, nb_return, intensity, label = read_airborne_lidar_format(
                        data_file)

                if args.dataset == 's3dis':
                    xyz, rgb, labels, objects = read_s3dis_format(data_file)
                    if pruning:
                        n_objects = int(objects.max() + 1)
                        xyz, rgb, labels, objects = libply_c.prune(
                            xyz, args.voxel_width, rgb, labels, objects,
                            n_labels, n_objects)
                        # hard_labels = labels.argmax(axis=1)
                        objects = objects[:, 1:].argmax(axis=1) + 1
                    else:
                        # hard_labels = labels
                        objects = objects
                elif args.dataset == 'sema3d':
                    has_labels = (os.path.isfile(label_file))
                    if has_labels:
                        xyz, rgb, labels = read_semantic3d_format(
                            data_file, n_labels, label_file, args.voxel_width,
                            args.ver_batch)
                    else:
                        xyz, rgb = read_semantic3d_format(
                            data_file, 0, '', args.voxel_width, args.ver_batch)
                        labels = np.array([0])
                        objects = np.array([0])
                        is_transition = np.array(False)
                elif args.dataset == 'vkitti':
                    xyz, rgb, labels = read_vkitti_format(data_file)
                    if pruning:
                        xyz, rgb, labels, o = libply_c.prune(
                            xyz.astype('f4'), args.voxel_width,
                            rgb.astype('uint8'), labels.astype('uint8'),
                            np.zeros(1, dtype='uint8'), n_labels, 0)
                    # ---compute nn graph-------
                n_ver = xyz.shape[0]
                print("computing NN structure")
                graph_nn, local_neighbors = compute_graph_nn_2(
                    xyz,
                    args.k_nn_adj,
                    args.k_nn_local,
                    voronoi=args.use_voronoi)

                if args.dataset == 's3dis':
                    is_transition = objects[graph_nn["source"]] != objects[
                        graph_nn["target"]]
                elif args.dataset == 'sema3d' and has_labels:
                    # sema has no object, we make them ourselves with label inpainting
                    hard_labels = np.argmax(labels[:, 1:], 1) + 1
                    no_labels = (labels[:, 1:].sum(1) == 0).nonzero()
                    hard_labels[no_labels] = 0
                    is_transition = hard_labels[graph_nn["source"]] != hard_labels[graph_nn["target"]] * (hard_labels[graph_nn["source"]] != 0) \
                                    * (hard_labels[graph_nn["target"]] != 0)

                    edg_source = graph_nn["source"][(
                        is_transition == 0).nonzero()].astype('uint32')
                    edg_target = graph_nn["target"][(
                        is_transition == 0).nonzero()].astype('uint32')
                    edge_weight = np.ones_like(edg_source).astype('f4')
                    node_weight = np.ones((n_ver, ), dtype='f4')
                    node_weight[no_labels] = 0
                    print("Inpainting labels")
                    dump, objects = libcp.cutpursuit2(
                        np.array(hard_labels).reshape((n_ver, 1)).astype('f4'),
                        edg_source, edg_target, edge_weight, node_weight, 0.01)
                    is_transition = objects[graph_nn["source"]] != objects[
                        graph_nn["target"]]
                elif args.dataset == 'vkitti':
                    # we define the objects as the constant connected components of the labels
                    hard_labels = np.argmax(labels, 1)
                    is_transition = hard_labels[
                        graph_nn["source"]] != hard_labels[graph_nn["target"]]

                    dump, objects = libply_c.connected_comp(
                        n_ver, graph_nn["source"].astype('uint32'),
                        graph_nn["target"].astype('uint32'),
                        (is_transition == 0).astype('uint8'), 0)

                if args.compute_geof:
                    geof = libply_c.compute_geof(
                        xyz, local_neighbors,
                        args.k_nn_local).astype('float32')
                    geof[:, 3] = 2. * geof[:, 3]
                else:
                    geof = 0

                if args.plane_model:  # use a simple plane model to compute the elevation
                    low_points = (xyz[:, 2] - xyz[:, 2].min() <
                                  0.5).nonzero()[0]
                    reg = RANSACRegressor(random_state=0).fit(
                        xyz[low_points, :2], xyz[low_points, 2])
                    elevation = xyz[:, 2] - reg.predict(xyz[:, :2])
                else:
                    elevation = xyz[:, 2] - xyz[:, 2].min()

                # compute the xy normalized position
                ma, mi = np.max(xyz[:, :2], axis=0,
                                keepdims=True), np.min(xyz[:, :2],
                                                       axis=0,
                                                       keepdims=True)
                xyn = (xyz[:, :2] - mi) / (ma - mi + 1e-8)  # global position

                write_structure(
                    str_file, xyz, rgb, graph_nn,
                    local_neighbors.reshape([n_ver, args.k_nn_local]),
                    is_transition, labels, objects, geof, elevation, xyn)
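A minimal, self-contained sketch of the RANSAC ground-plane step above (a synthetic point cloud is assumed, not the author's data): the plane z = f(x, y) is fitted on the lowest points, and the residual to that plane serves as the elevation feature.

import numpy as np
from sklearn.linear_model import RANSACRegressor

rng = np.random.RandomState(0)
xyz = rng.rand(1000, 3).astype('f4')   # hypothetical (n, 3) point cloud
xyz[:, 2] *= 0.2                       # mostly flat ground with some clutter

# fit the plane only on points near the lowest z, then subtract it everywhere
low_points = (xyz[:, 2] - xyz[:, 2].min() < 0.5).nonzero()[0]
reg = RANSACRegressor(random_state=0).fit(xyz[low_points, :2], xyz[low_points, 2])
elevation = xyz[:, 2] - reg.predict(xyz[:, :2])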
Example #40
    def fit(self, idle_engine_speed, on_engine, temperature_derivatives,
            temperatures, *args):
        """
        Calibrates an engine temperature regression model to predict engine
        temperatures.

        This model returns the temperature delta as a function of the
        previous temperature, the acceleration, and the power at the wheels.

        :param idle_engine_speed:
            Engine speed idle median and std [RPM].
        :type idle_engine_speed: (float, float)

        :param on_engine:
            If the engine is on [-].
        :type on_engine: numpy.array

        :param temperature_derivatives:
            Derivative temperature vector [°C].
        :type temperature_derivatives: numpy.array

        :param temperatures:
            Temperature vector [°C].
        :type temperatures: numpy.array

        :return:
            The calibrated engine temperature regression model.
        :rtype: ThermalModel
        """

        spl = _build_samples(temperature_derivatives, temperatures, *args)
        self.thermostat = self._identify_thermostat(spl, idle_engine_speed)

        spl = _filter_samples(spl, on_engine, self.thermostat)
        opt = {
            'random_state': 0,
            'max_depth': 2,
            'n_estimators': int(min(300, 0.25 * (len(spl) - 1))),
            'loss': 'huber',
            'alpha': 0.99
        }
        model = RANSACRegressor(base_estimator=self.base_model(**opt),
                                random_state=0,
                                min_samples=0.85,
                                max_trials=10)

        model = Pipeline([('feature_selection',
                           _SelectFromModel(model,
                                            '0.8*median',
                                            in_mask=(0, 2))),
                          ('classification', model)])
        model.fit(spl[:, :-1], spl[:, -1])

        self.model = model.steps[-1][-1]
        self.mask = np.where(model.steps[0][-1]._get_support_mask())[0]

        self.min_temp = spl[:, 0].min()
        spl = spl[:co2_utl.argmax(self.thermostat <= spl[:, 0])]

        if not spl.any():
            self.min_temp = -float('inf')
            return self
        spl = spl[:co2_utl.argmax(np.percentile(spl[:, 0], 60) <= spl[:, 0])]
        opt = {
            'random_state': 0,
            'max_depth': 2,
            'n_estimators': int(min(300, 0.25 * (len(spl) - 1))),
            'loss': 'huber',
            'alpha': 0.99
        }
        model = self.base_model(**opt)
        model = Pipeline([('feature_selection',
                           _SelectFromModel(model, '0.8*median',
                                            in_mask=(1, ))),
                          ('classification', model)])
        model.fit(spl[:, 1:-1], spl[:, -1])
        self.cold = model.steps[-1][-1]
        self.mask_cold = np.where(
            model.steps[0][-1]._get_support_mask())[0] + 1

        return self
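A standalone, hedged sketch of the pattern above: RANSAC wrapping a non-linear base estimator (gradient boosting with Huber loss), using the older base_estimator keyword that this page's snippets assume (recent scikit-learn renames it to estimator). The data below is synthetic and purely illustrative.

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import RANSACRegressor

rng = np.random.RandomState(0)
X = rng.rand(200, 3)
y = X[:, 0] + 0.5 * X[:, 2] + 0.01 * rng.randn(200)

# non-linear base estimator; RANSAC only needs fit/predict/score
base = GradientBoostingRegressor(random_state=0, max_depth=2,
                                 n_estimators=50, loss='huber', alpha=0.99)
robust = RANSACRegressor(base_estimator=base, random_state=0,
                         min_samples=0.85, max_trials=10)
robust.fit(X, y)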
Example #41
def test_ransac_min_n_samples():
    base_estimator = LinearRegression()
    ransac_estimator1 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5, random_state=0)
    ransac_estimator2 = RANSACRegressor(base_estimator,
                                        min_samples=2. / X.shape[0],
                                        residual_threshold=5, random_state=0)
    ransac_estimator3 = RANSACRegressor(base_estimator, min_samples=-1,
                                        residual_threshold=5, random_state=0)
    ransac_estimator4 = RANSACRegressor(base_estimator, min_samples=5.2,
                                        residual_threshold=5, random_state=0)
    ransac_estimator5 = RANSACRegressor(base_estimator, min_samples=2.0,
                                        residual_threshold=5, random_state=0)
    ransac_estimator6 = RANSACRegressor(base_estimator,
                                        residual_threshold=5, random_state=0)
    ransac_estimator7 = RANSACRegressor(base_estimator,
                                        min_samples=X.shape[0] + 1,
                                        residual_threshold=5, random_state=0)

    ransac_estimator1.fit(X, y)
    ransac_estimator2.fit(X, y)
    ransac_estimator5.fit(X, y)
    ransac_estimator6.fit(X, y)

    assert_array_almost_equal(ransac_estimator1.predict(X),
                              ransac_estimator2.predict(X))
    assert_array_almost_equal(ransac_estimator1.predict(X),
                              ransac_estimator5.predict(X))
    assert_array_almost_equal(ransac_estimator1.predict(X),
                              ransac_estimator6.predict(X))

    assert_raises(ValueError, ransac_estimator3.fit, X, y)
    assert_raises(ValueError, ransac_estimator4.fit, X, y)
    assert_raises(ValueError, ransac_estimator7.fit, X, y)
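The test above pins down the min_samples semantics: values >= 1 (ints or whole-number floats) are an absolute subset size, floats in (0, 1) are a fraction of the dataset, and negative, fractional-above-one, or larger-than-n values raise ValueError. A small sketch on synthetic data (not part of the test suite):

import numpy as np
from sklearn.linear_model import LinearRegression, RANSACRegressor

X = np.arange(20, dtype=float).reshape(-1, 1)
y = 2.0 * X.ravel() + 1.0

by_count = RANSACRegressor(LinearRegression(), min_samples=2,
                           random_state=0).fit(X, y)
by_fraction = RANSACRegressor(LinearRegression(), min_samples=2.0 / len(X),
                              random_state=0).fit(X, y)
# Both draw two-point subsets per trial, so with the same seed they agree.
assert np.allclose(by_count.predict(X), by_fraction.predict(X))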
Example #42
def main():

    # Checks for correct number of arguments
    if len(sys.argv) != 3:
        print(
            'usage: ./troll_identifier.py [TRAIN DATASET] [TEST/DEV DATASET]')
        sys.exit()

    # set up dataset
    data_train = pd.read_csv(sys.argv[1])
    data_test = pd.read_csv(sys.argv[2])

    print('train:\n{}\n'.format(sys.argv[1]))
    print('test:\n{}\n'.format(sys.argv[2]))

    if 'small' in sys.argv[1]:
        size = 'small'
    elif 'medium' in sys.argv[1]:
        size = 'medium'
    else:
        size = 'large'

    x_train = data_train.drop(
        [data_train.columns[0], data_train.columns[1], data_train.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_train = pd.Series(data_train.iloc[:, -1])
    x_test = data_test.drop(
        [data_test.columns[0], data_test.columns[1], data_test.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_test = pd.Series(data_test.iloc[:, -1])

    # type = input('type: [1: supervised, 2: semi-supervised, 3: unsupervised] ')
    # if type == 1:
    parameter = None
    # input() returns a string in Python 3; cast menu choices to int
    method = int(input('select a method: {}: '.format(methods)))
    if method == 1:
        classifier = int(input('select a classifier: {}: '.format(classifiers)))
        if classifier == 1:
            parameter = int(input('criterion: [1: gini, 2: entropy] '))
            if parameter == 1:
                model = DecisionTreeClassifier(criterion='gini')
                parameter = 'gini'
            elif parameter == 2:
                model = DecisionTreeClassifier(criterion='entropy')
                parameter = 'entropy'
            else:
                print('no criterion chosen')
                sys.exit()
        elif classifier == 2:
            model = ExtraTreeClassifier()
        elif classifier == 3:
            model = ExtraTreesClassifier()
        elif classifier == 4:
            parameter = int(input('n: [1: 1, 2: 3, 3: 5] '))
            if parameter == 1:
                model = KNeighborsClassifier(n_neighbors=1)
                parameter = '1'
            elif parameter == 2:
                model = KNeighborsClassifier(n_neighbors=3)
                parameter = '3'
            elif parameter == 3:
                model = KNeighborsClassifier(n_neighbors=5)
                parameter = '5'
            else:
                print('no n chosen')
                sys.exit()
        elif classifier == 5:
            parameter = int(input(
                'version: [1: gaussian, 2: bernoulli, 3: multinomial, 4: complement] '
            ))
            if parameter == 1:
                model = GaussianNB()
                parameter = 'gaussian'
            elif parameter == 2:
                model = BernoulliNB()
                parameter = 'bernoulli'
            elif parameter == 3:
                model = MultinomialNB()
                parameter = 'multinomial'
            elif parameter == 4:
                model = ComplementNB()
                parameter = 'complement'
            else:
                print('no version chosen')
                sys.exit()
        elif classifier == 6:
            model = RadiusNeighborsClassifier(radius=1.0)
        elif classifier == 7:
            model = RandomForestClassifier(n_estimators=50, random_state=1)
        elif classifier == 8:
            model = LinearSVC(multi_class='crammer_singer')  #multi_class='ovr'
        elif classifier == 9:
            model = GradientBoostingClassifier()
        elif classifier == 10:
            model = GaussianProcessClassifier(multi_class='one_vs_one')
        elif classifier == 11:
            model = SGDClassifier()
        elif classifier == 12:
            model = PassiveAggressiveClassifier()
        elif classifier == 13:
            model = NearestCentroid()
        elif classifier == 14:
            model = Perceptron(tol=1e-3, random_state=0)
        elif classifier == 15:
            model = MLPClassifier()
        elif classifier == 16:
            model = AdaBoostClassifier(n_estimators=50)
        elif classifier == 17:
            parameter = int(input(
                'strategy: [1: stratified, 2: most frequent, 3: prior, 4: uniform, 5: constant] '
            ))
            if parameter == 1:
                model = DummyClassifier(strategy='stratified')
                parameter = 'stratified'
            elif parameter == 2:
                model = DummyClassifier(strategy='most_frequent')
                parameter = 'most frequent'
            elif parameter == 3:
                model = DummyClassifier(strategy='prior')
                parameter = 'prior'
            elif parameter == 4:
                model = DummyClassifier(strategy='uniform')
                parameter = 'uniform'
            elif parameter == 5:
                model = DummyClassifier(strategy='constant')
                parameter = 'constant'
            else:
                print('no strategy selected')
                sys.exit()
        else:
            print('no classifier chosen')
            sys.exit()

        import time
        # Start the timer (time.clock() was removed in Python 3.8)
        start = time.perf_counter()

        # train the model using the training sets and check score
        model.fit(x_train, y_train)
        model.score(x_train, y_train)

        # predict output
        predictions = pd.Series(model.predict(x_test))
        report = classification_report(
            y_test,
            predictions,
            target_names=['RightTroll', 'LeftTroll', 'Other'])
        confusion = confusion_matrix(
            y_test, predictions, labels=["RightTroll", "LeftTroll", "Other"])
        if parameter is not None:
            filename = '{},{},{},{}.txt'.format(size, methods[method],
                                                classifiers[classifier],
                                                parameter)
        else:
            filename = '{},{},{}.txt'.format(size, methods[method],
                                             classifiers[classifier])

        # Record the time taken (do not shadow the time module)
        end = time.perf_counter()
        elapsed = str(end - start)

        with open(filename, 'w') as output:
            output.write('method:\n{}\n\n'.format(methods[method]))
            output.write('classifier:\n{}\n\n'.format(classifiers[classifier]))
            output.write('accuracy:\n{:.2f}%\n\n'.format(
                100 * accuracy_score(y_test, predictions)))
            output.write('report:\n{}\n\n'.format(report))
            output.write('confusion:\n{}\n\n'.format(confusion))
            output.write('time:\n{}s\n\n'.format(elapsed))
            output.write('data:\n{:10}\t{:10}\t{:10}\n'.format(
                'actual', 'predict', 'match?'))
            for i in range(len(predictions)):
                output.write('{:10}\t{:10}\t{:10}\n'.format(
                    y_test[i], predictions[i], y_test[i] == predictions[i]))

        print('\nmethod:\n{}\n'.format(methods[method]))
        print('classifier:\n{}\n'.format(classifiers[classifier]))
        print('accuracy:\n{:.2f}%\n'.format(
            100 * accuracy_score(y_test, predictions)))
        print('report:\n{}\n'.format(report))
        print('confusion:\n{}\n'.format(confusion))
        print('time: {}s\n'.format(elapsed))

    elif method == 2:
        # transform into binary classification problem
        # y_train = y_train.apply(lambda x: 0 if x == 'Other' else 1)
        # y_test = y_test.apply(lambda x: 0 if x == 'Other' else 1)

        # transform string labels into integers
        le = LabelEncoder()
        le.fit(
            y_train
        )  # print(le.transform(['LeftTroll', 'Other', 'Other', 'RightTroll'])), print(le.inverse_transform([0, 1, 2, 1]))
        print(le.classes_)

        y_train = le.transform(y_train)
        y_test = le.transform(y_test)

        regressor = int(input('select a regressor: {}: '.format(regressors)))
        if regressor == 1:
            print(method, regressor)
            model = LinearDiscriminantAnalysis()
        elif regressor == 2:
            print(method, regressor)
            model = LogisticRegression(solver='lbfgs',
                                       multi_class='multinomial')  #'newton-cg'
        elif regressor == 3:
            print(method, regressor)
            model = RidgeClassifier()
        elif regressor == 4:
            print(method, regressor)
            model = QuadraticDiscriminantAnalysis()
        elif regressor == 5:
            model = OneVsRestClassifier(LinearRegression())
        elif regressor == 6:
            model = OneVsRestClassifier(DecisionTreeRegressor())
        elif regressor == 7:
            print(method, regressor)
            model = OneVsRestClassifier(Lasso(alpha=0.1))
        elif regressor == 8:
            print(method, regressor)
            model = OneVsRestClassifier(MultiTaskLasso(alpha=0.1))
        elif regressor == 9:
            print(method, regressor)
            model = OneVsRestClassifier(ElasticNet(random_state=0))
        elif regressor == 10:
            print(method, regressor)
            model = OneVsRestClassifier(MultiTaskElasticNet(random_state=0))
        elif regressor == 11:
            print(method, regressor)
            model = OneVsRestClassifier(Lars(n_nonzero_coefs=1))
        elif regressor == 12:
            print(method, regressor)
            model = OneVsRestClassifier(LassoLars(alpha=.1))
        elif regressor == 13:
            print(method, regressor)
            model = OneVsRestClassifier(OrthogonalMatchingPursuit())
        elif regressor == 14:
            print(method, regressor)
            model = OneVsRestClassifier(BayesianRidge())
        elif regressor == 15:
            print(method, regressor)
            model = OneVsRestClassifier(ARDRegression())
        elif regressor == 16:
            print(method, regressor)
            model = OneVsRestClassifier(TheilSenRegressor(random_state=0))
        elif regressor == 17:
            print(method, regressor)
            model = OneVsRestClassifier(HuberRegressor())
        elif regressor == 18:
            print(method, regressor)
            model = OneVsRestClassifier(RANSACRegressor(random_state=0))
        else:
            print('no regressor chosen')
            sys.exit()

        import time
        # Start the timer
        start = time.perf_counter()

        # train the model using the training sets and check score
        model.fit(x_train, y_train)
        model.score(x_train, y_train)

        # y_train = le.inverse_transform(y_train)
        # y_test = le.inverse_transform(y_test)
        # print('coefficient:', model.coef_)
        # print('intercept:', model.intercept_)

        # predict output
        predictions = pd.Series(model.predict(x_test))
        if parameter is not None:
            filename = '{},{},{},{}.txt'.format(size, methods[method],
                                                regressors[regressor],
                                                parameter)
        else:
            filename = '{},{},{}.txt'.format(size, methods[method],
                                             regressors[regressor])

        # Record the time taken
        end = time.perf_counter()
        elapsed = str(end - start)

        with open(filename, 'w') as output:
            output.write('method:\n{}\n\n'.format(methods[method]))
            output.write('regressor:\n{}\n\n'.format(regressors[regressor]))
            output.write('accuracy:\n{:.2f}%\n\n'.format(
                100 * accuracy_score(y_test, predictions)))
            output.write('time:\n{}s\n\n'.format(elapsed))
            output.write('data:\n{:10}\t{:10}\t{:10}\n'.format(
                'actual', 'predict', 'match?'))
            for i in range(len(predictions)):
                output.write('{:10}\t{:10}\t{:10}\n'.format(
                    y_test[i], predictions[i], y_test[i] == predictions[i]))

        print('\nmethod:\n{}\n'.format(methods[method]))
        print('regressor:\n{}\n'.format(regressors[regressor]))
        print('accuracy:\n{:.2f}%\n'.format(
            100 * accuracy_score(y_test, predictions)))
        print('time: {}s\n'.format(elapsed))

    else:
        print('no method chosen')
        sys.exit()
Example #43
# No. of samples: 506
# No. of explanatory variables: 13

df = pd.read_csv('housing.data.txt', header=None, sep=r'\s+')
df.columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT', 'MEDV'
]
#print(df.head())

X = df[['RM']].values
y = df['MEDV'].values

ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,
                         min_samples=50,
                         loss='absolute_loss',
                         residual_threshold=5.0,
                         random_state=0)
ransac.fit(X, y)

# plotting inliers and outliers together with the linear fit
inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)
line_X = np.arange(3, 10, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])
plt.scatter(X[inlier_mask],
            y[inlier_mask],
            c='steelblue',
            edgecolor='white',
            marker='o',
            label='Inliers')
Example #44

    def run(self, trainingDataset, plotting):
        dataset = trainingDataset
        accuracy = 0
        y = dataset['int_rate']
        X = dataset.drop(columns=[
            'int_rate',
        ])
        if plotting:
            X_train, X_test, y_train, y_test = train_test_split(X,
                                                                y,
                                                                test_size=0.2,
                                                                random_state=1)
            clf = RANSACRegressor(random_state=42)
            #clf=self.gridSearch(clf,X_train, y_train)
            clf.fit(X_train, y_train)
            print(
                "###################################RANSACRegressor#############################"
            )
            accuracy = clf.score(X_test, y_test)  # R^2 for a regressor, despite the name
            #pred = clf.predict(X_test)
            #accuracy = np.sqrt(metrics.mean_squared_error(y_test,pred))
            print("score:" + str(accuracy))

        else:
            clf = RANSACRegressor(random_state=42)
            #clf=self.gridSearch(clf,X_train, y_train)
            clf.fit(X, y)
            testData = pd.read_csv(
                "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/CleanedData/SiameseTrainingData.csv"
            )
            predictions = clf.predict(testData)
            np.savetxt(
                "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/OutputFiles/RANSACRegressorPredictions.csv",
                predictions,
                delimiter=",")

            testData = pd.read_csv(
                "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/CleanedData/OverallTestingData.csv"
            )
            predictions = clf.predict(testData)
            np.savetxt(
                "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/OutputFiles/RANSACRegressorPredictionsTestData.csv",
                predictions,
                delimiter=",")

        return accuracy
Example #45

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix

#loading the dataset
train = pd.read_csv("C:/Users/HP/Desktop/train (1).csv")
test = pd.read_csv("C:/Users/HP/Desktop/test (2).csv")
train = train.dropna()
test = test.dropna()
train.head()

X_train = np.array(train.iloc[:, :-1].values)
y_train = np.array(train.iloc[:, 1].values)
X_test = np.array(test.iloc[:, :-1].values)
y_test = np.array(test.iloc[:, 1].values)

#RANSAC Regressor
from sklearn.linear_model import RANSACRegressor
model = RANSACRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = model.score(X_test, y_test)
plt.plot(X_train, model.predict(X_train), color='r')
plt.show()
print(accuracy)
Example #46
def get_base_model():
    return {'ransac_regressor': RANSACRegressor()}
Example #47
import cv2
import numpy as np
from sklearn.linear_model import (
    LinearRegression, TheilSenRegressor, RANSACRegressor, HuberRegressor)
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


np.random.seed(42)



poly = PolynomialFeatures(2)
# LinearRegression takes no transformer argument; the polynomial features
# are applied through the make_pipeline call below instead.
reg = LinearRegression()

ransac_estimator = RANSACRegressor(reg, min_samples=3, random_state=42)
regressor = HuberRegressor()
model = make_pipeline(poly, regressor)

x = np.linspace(0, 5, 200)
y = x * x
model.fit(x.reshape(-1, 1), y)
ransac_estimator.fit(x.reshape(-1, 1), y)
#regressor.warm_start = True
#regressor.fit(x.reshape(-1, 1), y)

#print('oi')
x0 = np.zeros(3)
def fun(x, t, y):
    return x[2] + t * x[1] + t * t * x[0] - y
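Since the snippet above fits a straight line through y = x**2, a hedged variant that lets RANSAC actually model the quadratic is to hand it the polynomial pipeline as the base estimator (names below are illustrative, not from the original):

import numpy as np
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

x = np.linspace(0, 5, 200)
y = x * x

# a Pipeline exposes fit/predict/score, so RANSAC accepts it as base estimator
poly_base = make_pipeline(PolynomialFeatures(2), LinearRegression())
ransac_poly = RANSACRegressor(poly_base, min_samples=10, random_state=42)
ransac_poly.fit(x.reshape(-1, 1), y)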
Example #48
    # Kernel ridge.
    ('KernelRidge', lambda: KernelRidge()),

    # Linear.
    # Way too slow.
    #('ARDRegression', lambda: ARDRegression()),
    ('HuberRegressor', lambda: HuberRegressor()),
    ('LinearRegression', lambda: LinearRegression()),
    # ValueError: Unknown label type: 'continuous'
    #('LogisticRegression', lambda: LogisticRegression()),
    # ValueError: Unknown label type: 'continuous'
    #('LogisticRegressionCV', lambda: LogisticRegressionCV()),
    ('PassiveAggressiveRegressor', lambda: PassiveAggressiveRegressor()),
    # ValueError: Unknown label type: 'continuous'
    #('RandomizedLogisticRegression', lambda: RandomizedLogisticRegression()),
    ('RANSACRegressor', lambda: RANSACRegressor()),
    ('SGDRegressor', lambda: SGDRegressor()),
    # Way too slow.
    #('TheilSenRegressor', lambda: TheilSenRegressor()),

    # Neighbors.
    ('KNeighborsRegressor', lambda: KNeighborsRegressor()),
    # Predicts Nan, infinity or too large of value.
    #('RadiusNeighborsRegressor', lambda: RadiusNeighborsRegressor()),

    # Neural network.
    # Increase max_iter to avoid Warning about non-convergence within max_iter.
    ('MLPRegressor', lambda: MLPRegressor(max_iter=1000)),

    # Support vector machine.
    ('SVR', lambda: SVR()),
Example #49
def get_model_obj(modelType, n_clusters=None, **kwargs):
    if modelType == 'knn':
        from sklearn.neighbors import KNeighborsClassifier
        # 6 seems to give the best trade-off between accuracy and precision
        knn = KNeighborsClassifier(n_neighbors=6, **kwargs)
        return knn
    elif modelType == 'gaussianNB':
        from sklearn.naive_bayes import GaussianNB
        gnb = GaussianNB(**kwargs)
        return gnb

    elif modelType == 'multinomialNB':
        from sklearn.naive_bayes import MultinomialNB
        # TODO: figure out how to configure binomial distribution
        mnb = MultinomialNB(**kwargs)
        return mnb

    elif modelType == 'bernoulliNB':
        from sklearn.naive_bayes import BernoulliNB
        bnb = BernoulliNB(**kwargs)
        return bnb

    elif modelType == 'randomForest':
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier(random_state=234, **kwargs)
        return rfc

    elif modelType == 'svm':
        from sklearn.svm import SVC
        svc = SVC(random_state=0, probability=True, **kwargs)
        return svc

    elif modelType == 'LinearRegression':
        #assert column, "Column name required for building a linear model"
        #assert dataframe[column].shape == target.shape
        from sklearn import linear_model
        l_reg = linear_model.LinearRegression(**kwargs)
        return l_reg

    elif modelType == 'RidgeRegression':
        from sklearn.linear_model import Ridge
        if not kwargs:
            kwargs = {'alpha': 0.5}
        ridge_reg = Ridge(**kwargs)
        return ridge_reg

    elif modelType == 'RidgeRegressionCV':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alphas': [0.1, 1.0, 10.0]}
        ridge_cv_reg = linear_model.RidgeCV(**kwargs)
        return ridge_cv_reg

    elif modelType == 'LassoRegression':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1}
        lasso_reg = linear_model.Lasso(**kwargs)
        return lasso_reg

    elif modelType == 'ElasticNetRegression':
        from sklearn.metrics import r2_score
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1, 'l1_ratio': 0.7}
        enet_reg = linear_model.ElasticNet(**kwargs)
        return enet_reg

    elif modelType == 'LogisticRegression':
        from sklearn.linear_model import LogisticRegression
        log_reg = LogisticRegression(random_state=123, **kwargs)
        return log_reg

    elif modelType == 'RANSACRegression':
        from sklearn.linear_model import LinearRegression, RANSACRegressor
        ransac_model = RANSACRegressor(LinearRegression())
        return ransac_model

    elif modelType == 'kde':
        from sklearn.neighbors import KernelDensity
        kde = KernelDensity(kernel='gaussian', bandwidth=0.2, **kwargs)
        return kde

    elif modelType == 'AR':
        import statsmodels.api as sm
        # fit an AR model and forecast
        ar_fitted = sm.tsa.AR(dataframe).fit(maxlag=9,
                                             method='mle',
                                             disp=-1,
                                             **kwargs)
        #ts_forecast = ar_fitted.predict(start='2008', end='2050')
        return ar_fitted

    elif modelType == 'SARIMAX':
        mod = sm.tsa.statespace.SARIMAX(df.riders,
                                        trend='n',
                                        order=(0, 1, 0),
                                        seasonal_order=(1, 1, 1, 12),
                                        **kwargs)
        return mod

    elif modelType == 'sgd':
        # Online classifiers http://scikit-learn.org/stable/auto_examples/linear_model/plot_sgd_comparison.html
        from sklearn.linear_model import SGDClassifier
        sgd = SGDClassifier(**kwargs)
        return sgd

    elif modelType == 'perceptron':
        from sklearn.linear_model import Perceptron
        perceptron = Perceptron(**kwargs)
        return perceptron

    elif modelType == 'xgboost':
        import xgboost as xgb
        xgbm = xgb.XGBClassifier(**kwargs)
        return xgbm

    elif modelType == 'baseNN':
        from keras.models import Sequential
        from keras.layers import Dense
        # create model
        model = Sequential()
        assert kwargs.get('inputParams', None)
        assert kwargs.get('outputParams', None)
        model.add(Dense(kwargs['inputParams']))
        model.add(Dense(kwargs['outputParams']))
        if kwargs.get('compileParams'):
            # Compile model
            model.compile(
                **kwargs['compileParams']
            )  # e.g. loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']
        return model

    elif modelType == 'lightGBMRegression':
        from pylightgbm.models import GBMRegressor
        lgbm_lreg = GBMRegressor(num_iterations=100,
                                 early_stopping_round=10,
                                 num_leaves=10,
                                 min_data_in_leaf=10)
        return lgbm_lreg

    elif modelType == 'lightGBMBinaryClass':
        from pylightgbm.models import GBMClassifier
        lgbm_bc = GBMClassifier(metric='binary_error', min_data_in_leaf=1)
        return lgbm_bc

    # Clustering models
    elif modelType == 'KMeans':
        assert n_clusters, "Number of clusters argument mandatory"
        cluster_callable = KMeans
        # seed of 10 for reproducibility.
        clusterer = cluster_callable(n_clusters=n_clusters, random_state=10)
        return clusterer

    elif modelType == 'dbscan':
        if not n_clusters:
            logging.warning(
                "Number of clusters irrelevant for cluster type : %s" %
                (modelType))
        cluster_callable = DBSCAN
        clusterer = cluster_callable(eps=0.5)
        return clusterer

    elif modelType == 'affinity_prop':
        if not n_clusters:
            logging.warning(
                "Number of clusters irrelevant for cluster type : %s" %
                (modelType))
        clusterer = AffinityPropagation(damping=.9, preference=-200)
        return clusterer
    elif modelType == 'spectral':
        assert n_clusters, "Number of clusters argument mandatory"
        clusterer = SpectralClustering(n_clusters=n_clusters,
                                       eigen_solver='arpack',
                                       affinity="nearest_neighbors")
        return clusterer
    elif modelType == 'birch':
        if not n_clusters:
            logging.warning(
                "Number of clusters irrelevant for cluster type : %s" %
                (modelType))
        clusterer = Birch(n_clusters=2)
        return clusterer

    elif modelType == 'agglomerativeCluster':
        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(dataframe,
                                        n_neighbors=10,
                                        include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)
        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                                            linkage='ward',
                                            connectivity=connectivity)
        return clusterer

    elif modelType == 'meanShift':
        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(dataframe, quantile=0.3)
        clusterer = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        return clusterer

    elif modelType == 'gmm':
        from sklearn import mixture
        gmm = mixture.GaussianMixture(n_components=5, covariance_type='full')
        return gmm

    elif modelType == 'dgmm':
        from sklearn import mixture
        dgmm = mixture.BayesianGaussianMixture(n_components=5,
                                               covariance_type='full')
        return dgmm

    else:
        raise ValueError('Unknown model type: see utils.py for available')
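A hedged usage sketch for the factory above; kwargs are forwarded on most branches, though the RANSACRegression branch ignores them:

# hypothetical calls against the branches defined above
ransac = get_model_obj('RANSACRegression')
ridge = get_model_obj('RidgeRegression', alpha=0.5)
knn = get_model_obj('knn', weights='distance')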
Example #50

#enr_sts_scores = enr.predict(xt[:, np.newaxis])
enr.fit(x, y)
enr_sts_scores = enr.predict(xt)


# Passive Aggressive Regression
print('passive aggressive')
par = PassiveAggressiveRegressor()
par.fit(x, y)
par_sts_scores = par.predict(xt)
#par.fit(x[:, np.newaxis], y)
#par_sts_scores = par.predict(xt[:, np.newaxis])

# RANSAC Regression
print('ransac')
ransac = RANSACRegressor()
#ransac.fit(x[:, np.newaxis], y)
#ransac_sts_scores = ransac.predict(xt[:, np.newaxis])
ransac.fit(x, y)
ransac_sts_scores = ransac.predict(xt)


# Logistic Regression
print('logistic')
lgr = LogisticRegression()
#lgr.fit(x[:, np.newaxis], y)
#lgr_sts_scores = lgr.predict(xt[:, np.newaxis])
lgr.fit(x, y)
lgr_sts_scores = lgr.predict(xt)

Example #51
def test_ransac_min_n_samples():
    base_estimator = LinearRegression()
    ransac_estimator1 = RANSACRegressor(base_estimator, min_samples=2,
                                        residual_threshold=5,  random_state=0)
    ransac_estimator2 = RANSACRegressor(base_estimator,
                                        min_samples=2. / X.shape[0],
                                        residual_threshold=5, random_state=0)
    ransac_estimator3 = RANSACRegressor(base_estimator, min_samples=-1,
                                        residual_threshold=5, random_state=0)
    ransac_estimator4 = RANSACRegressor(base_estimator, min_samples=5.2,
                                        residual_threshold=5, random_state=0)
    ransac_estimator5 = RANSACRegressor(base_estimator, min_samples=2.0,
                                        residual_threshold=5, random_state=0)
    ransac_estimator6 = RANSACRegressor(base_estimator,
                                        residual_threshold=5, random_state=0)
    ransac_estimator7 = RANSACRegressor(base_estimator,
                                        min_samples=X.shape[0] + 1,
                                        residual_threshold=5, random_state=0)

    ransac_estimator1.fit(X, y)
    ransac_estimator2.fit(X, y)
    ransac_estimator5.fit(X, y)
    ransac_estimator6.fit(X, y)

    assert_equal(ransac_estimator1.predict(X), ransac_estimator2.predict(X))
    assert_equal(ransac_estimator1.predict(X), ransac_estimator5.predict(X))
    assert_equal(ransac_estimator1.predict(X), ransac_estimator6.predict(X))
    assert_raises(ValueError, ransac_estimator3.fit, X, y)
    assert_raises(ValueError, ransac_estimator4.fit, X, y)
    assert_raises(ValueError, ransac_estimator7.fit, X, y)
Example #52
import pickle

import numpy as np
from sklearn.linear_model import (LinearRegression, BayesianRidge, ElasticNet,
                                  PassiveAggressiveRegressor, RANSACRegressor,
                                  LogisticRegression)
from sklearn.svm import SVR

m = 'meteor'

if m == 'asiya':
    x = np.loadtxt('x.asiya.train')
    y = np.loadtxt('y.asiya.train')
elif m == 'meteor':
    x = np.loadtxt('x.meteor.train')[:,np.newaxis]
    y = np.loadtxt('y.meteor.train')
    x_test = np.loadtxt('x.meteor.test')[:,np.newaxis]

regressors = {'lr':LinearRegression(),
'br':BayesianRidge(compute_score=True),
'enr':ElasticNet(),
'par':PassiveAggressiveRegressor(),
'ransac':RANSACRegressor(),
'lgr':LogisticRegression(),
'svr_rbf':SVR(kernel='rbf', C=1e3, gamma=0.1)}
#'svr_lin':SVR(kernel='linear', C=1e3)}
#'svr_poly':SVR(kernel='poly', C=1e3, degree=2)}

def build_regressors(num):
    rgs = regressors[num]
    rgs.fit(x, y)
    with open(num+'.'+m+'.pk', 'wb') as fid:
        pickle.dump(rgs, fid)

'''
x = x_test
lr = pickle.load(open("lr."+m+'.pk', 'rb'))
br = pickle.load(open("br."+m+'.pk', 'rb'))
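Usage sketch for the helper above; despite the parameter name num, the argument is a key string from the regressors dict:

build_regressors('ransac')   # fits RANSACRegressor and pickles it as ransac.meteor.pk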
Example #53
class RansacClass:
    """
    Name      : RANSACRegressor
    Attribute : None
    Method    : predict, predict_by_cv, save_model
    """
    def __init__(self):
        # Algorithm name
        self._name = 'ransac'

        # Base path
        self._f_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir))

        # Suppress warning messages
        warnings.filterwarnings('ignore')

        # Load the raw data
        data = pd.read_csv(self._f_path +
                           "/regression/resource/regression_sample.csv",
                           sep=",",
                           encoding="utf-8")

        # Split into training and test data
        self._x = (data["year"] <= 2017)
        self._y = (data["year"] >= 2018)

        # Prepare the training data
        self._x_train, self._y_train = self.preprocessing(data[self._x])
        # Prepare the test data
        self._x_test, self._y_test = self.preprocessing(data[self._y])

        # Declare the model
        self._model = RANSACRegressor()

        # Train the model
        self._model.fit(self._x_train, self._y_train)

    # Data preprocessing
    def preprocessing(self, data):
        # Features
        x = []
        # Labels
        y = []
        # Window size (7 days)
        base_interval = 7
        # Temperatures
        temps = list(data["temperature"])

        for i in range(len(temps)):
            if i < base_interval:
                continue
            y.append(temps[i])

            xa = []

            for p in range(base_interval):
                d = i + p - base_interval
                xa.append(temps[d])
            x.append(xa)
        return x, y

    # Plain prediction
    def predict(self, save_img=False, show_chart=False):
        # Predict
        y_pred = self._model.predict(self._x_test)

        # Score
        score = r2_score(self._y_test, y_pred)

        # Report
        if hasattr(self._model, 'coef_') and hasattr(self._model,
                                                     'intercept_'):
            print(f'Coef = {self._model.coef_}')
            print(f'intercept = {self._model.intercept_}')

        print(f'Score = {score}')

        # Optionally save the chart image
        if save_img:
            self.save_chart_image(y_pred, show_chart)

        # Predictions & score
        return [list(y_pred), score]

    # Cross-validation (CV) prediction
    def predict_by_cv(self):
        # For regression, implement cross validation as the project requires
        return False

    # GridSearchCV prediction
    def predict_by_gs(self):
        pass

    # Save or refresh the model
    def save_model(self, renew=False):
        # Save the model
        if not renew:
            # First save
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')
        else:
            # Replace the existing model
            if os.path.isfile(self._f_path + f'/model/{self._name}_rg.pkl'):
                os.rename(
                    self._f_path + f'/model/{self._name}_rg.pkl',
                    self._f_path +
                    f'/model/{str(self._name) + str(time.time())}_rg.pkl')
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')

    # Save the regression chart
    def save_chart_image(self, data, show_chart):
        # Figure size
        plt.figure(figsize=(15, 10), dpi=100)

        # Ground truth
        plt.plot(self._y_test, c='r')

        # Predictions
        plt.plot(data, c='b')

        # Save as an image
        plt.savefig('./chart_images/tenki-kion-lr.png')

        # Show the chart (optional)
        if show_chart:
            plt.show()

    def __del__(self):
        del self._x_train, self._x_test, self._y_train, self._y_test, self._x, self._y, self._model
Example #54
def test_ransac_residual_metric():
    residual_metric1 = lambda dy: np.sum(np.abs(dy), axis=1)
    residual_metric2 = lambda dy: np.sum(dy**2, axis=1)

    yyy = np.column_stack([y, y, y])

    base_estimator = LinearRegression()
    ransac_estimator0 = RANSACRegressor(base_estimator,
                                        min_samples=2,
                                        residual_threshold=5,
                                        random_state=0)
    ransac_estimator1 = RANSACRegressor(base_estimator,
                                        min_samples=2,
                                        residual_threshold=5,
                                        random_state=0,
                                        residual_metric=residual_metric1)
    ransac_estimator2 = RANSACRegressor(base_estimator,
                                        min_samples=2,
                                        residual_threshold=5,
                                        random_state=0,
                                        residual_metric=residual_metric2)

    # multi-dimensional
    ransac_estimator0.fit(X, yyy)
    ransac_estimator1.fit(X, yyy)
    ransac_estimator2.fit(X, yyy)
    assert_equal(ransac_estimator0.predict(X), ransac_estimator1.predict(X))
    assert_equal(ransac_estimator0.predict(X), ransac_estimator2.predict(X))

    # one-dimensional
    ransac_estimator0.fit(X, y)
    ransac_estimator2.fit(X, y)
    assert_equal(ransac_estimator0.predict(X), ransac_estimator2.predict(X))
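The residual_metric parameter exercised above was later deprecated in favor of the loss parameter (already used in Example #43); a hedged sketch of the replacement spelling on synthetic data, noting that recent releases rename 'absolute_loss' to 'absolute_error':

import numpy as np
from sklearn.linear_model import LinearRegression, RANSACRegressor

rng = np.random.RandomState(0)
X = rng.randn(100, 1)
y = 3.0 * X.ravel() + rng.randn(100)

ransac = RANSACRegressor(LinearRegression(), min_samples=2,
                         residual_threshold=5,
                         loss='absolute_loss',  # a string or callable(y_true, y_pred)
                         random_state=0)
ransac.fit(X, y)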
Example #55
z = np.linalg.inv(np.dot(Xb.T, Xb))
w = np.dot(z, np.dot(Xb.T, y))

print('Slope: %.3f' % w[1])
print('Intercept: %.3f' % w[0])


# Fitting a robust regression model using RANSAC

ransac = RANSACRegressor(LinearRegression(), 
                         max_trials=100, 
                         min_samples=50, 
                         loss='absolute_loss', 
                         residual_threshold=5.0, 
                         random_state=0)


ransac.fit(X, y)

inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

line_X = np.arange(3, 10, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])
plt.scatter(X[inlier_mask], y[inlier_mask],
            c='steelblue', edgecolor='white', 
            marker='o', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask],
            c='limegreen', edgecolor='white',
            marker='s', label='Outliers')
Example #56
def get_model_from_name(model_name, training_params=None):

    # For Keras
    epochs = 250
    if os.environ.get('is_test_suite',
                      0) == 'True' and model_name[:12] == 'DeepLearning':
        print(
            'Heard that this is the test suite. Limiting number of epochs, which will increase training speed dramatically at the expense of model accuracy'
        )
        epochs = 30

    all_model_params = {
        'LogisticRegression': {
            'n_jobs': -2
        },
        'RandomForestClassifier': {
            'n_jobs': -2
        },
        'ExtraTreesClassifier': {
            'n_jobs': -1
        },
        'AdaBoostClassifier': {
            'n_estimators': 10
        },
        'SGDClassifier': {
            'n_jobs': -1
        },
        'Perceptron': {
            'n_jobs': -1
        },
        'LinearRegression': {
            'n_jobs': -2
        },
        'RandomForestRegressor': {
            'n_jobs': -2
        },
        'ExtraTreesRegressor': {
            'n_jobs': -1
        },
        'MiniBatchKMeans': {
            'n_clusters': 8
        },
        'GradientBoostingRegressor': {
            'presort': False
        },
        'SGDRegressor': {
            'shuffle': False
        },
        'PassiveAggressiveRegressor': {
            'shuffle': False
        },
        'AdaBoostRegressor': {
            'n_estimators': 10
        },
        'XGBRegressor': {
            'nthread': -1,
            'n_estimators': 200
        },
        'XGBClassifier': {
            'nthread': -1,
            'n_estimators': 200
        },
        'LGBMRegressor': {},
        'LGBMClassifier': {},
        'DeepLearningRegressor': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'DeepLearningClassifier': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        }
    }

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print(
            'After overwriting our defaults with your values, here are the final params that will be used to initialize the model:'
        )
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'SGDClassifier': SGDClassifier(),
        'Perceptron': Perceptron(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),
        'SGDRegressor': SGDRegressor(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans()
    }

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if keras_installed:
        model_map['DeepLearningClassifier'] = KerasClassifier(
            build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(
            build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print(
            'It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize'
        )
        raise e
    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
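A short, hypothetical usage sketch for the factory above:

model = get_model_from_name('RANSACRegressor',
                            training_params={'random_state': 0})
# model is a RANSACRegressor with random_state=0 applied via set_params.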