Example #1
import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn.preprocessing import PolynomialFeatures


def test_polynomial_features():
    # Test PolynomialFeatures on single-feature and two-feature inputs.
    X1 = np.arange(6)[:, np.newaxis]
    P1 = np.hstack([np.ones_like(X1), X1, X1**2, X1**3])
    deg1 = 3

    X2 = np.arange(6).reshape((3, 2))
    x1 = X2[:, :1]
    x2 = X2[:, 1:]
    P2 = np.hstack([
        x1**0 * x2**0, x1**1 * x2**0, x1**0 * x2**1, x1**2 * x2**0,
        x1**1 * x2**1, x1**0 * x2**2
    ])
    deg2 = 2

    for (deg, X, P) in [(deg1, X1, P1), (deg2, X2, P2)]:
        P_test = PolynomialFeatures(deg, include_bias=True).fit_transform(X)
        assert_array_almost_equal(P_test, P)

        P_test = PolynomialFeatures(deg, include_bias=False).fit_transform(X)
        assert_array_almost_equal(P_test, P[:, 1:])

    # interaction_only drops the pure powers (x1**2, x2**2) from P2
    interact = PolynomialFeatures(2, interaction_only=True, include_bias=True)
    X_poly = interact.fit_transform(X2)
    assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]])
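For reference, a standalone check of the column ordering asserted above; `get_feature_names_out` is available in recent scikit-learn releases (1.0+):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X = np.arange(6).reshape((3, 2))
poly = PolynomialFeatures(degree=2, include_bias=True)
print(poly.fit_transform(X))         # columns: 1, x1, x2, x1^2, x1*x2, x2^2
print(poly.get_feature_names_out())  # ['1' 'x0' 'x1' 'x0^2' 'x0 x1' 'x1^2']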
Example #2
    def fit(self, data, args):
        # Fit the transformer on the training split and report the elapsed time.
        self.model = PolynomialFeatures()

        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval
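`Timer` is not defined in this example; a minimal sketch of what it presumably looks like (the class name and the `interval` attribute are inferred from the usage above):

import time

class Timer:
    """Context manager that records elapsed wall-clock time in `interval`."""
    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, *exc):
        self.interval = time.perf_counter() - self.start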
Example #3
    def __gen_model(self, model=LinearRegression()):
        # The `model` argument is immediately replaced by a degree-3
        # polynomial pipeline; fit_intercept=False because PolynomialFeatures
        # already contributes the bias column.
        model = Pipeline([('poly', PolynomialFeatures(degree=3)),
                          ('linear', LinearRegression(fit_intercept=False))])
        X_train, y_train, _ = self.getDataSet()
        model.fit(X_train, y_train)
        # print("model coef:", model.named_steps['linear'].coef_)
        self.model = model
Example #4
    def polynomial(self):
        poly = PolynomialFeatures(degree=3)
        # PolynomialFeatures ignores the target passed as a second argument
        # to fit_transform; only the feature matrix drives the expansion.
        self.training_order_start_end_districts_and_time = poly.fit_transform(
            self.training_order_start_end_districts_and_time,
            self.training_number_of_orders)
        predict = poly.transform(
            self.testing_order_start_end_districts_and_time)

        clf = linear_model.LinearRegression()
        clf.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = clf.predict(predict)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(clf.coef_)
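The hand-rolled mean of squared residuals is just the MSE; an equivalent call via scikit-learn's metrics (a sketch, not part of the original):

from sklearn.metrics import mean_squared_error

# equivalent to numpy.mean((predicted - actual) ** 2)
error = mean_squared_error(self.testing_number_of_orders,
                           predicted_number_of_orders)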
Example #5
def test_ols_with_boston_dataset():

    # load boston dataset
    dataset = load_boston()

    # create metamodel
    input_names = list(dataset.inputs)
    response_names = list(dataset.responses)
    metamodel = metamodels.OLSModel(
        preprocessors=[PolynomialFeatures(degree=2)],
        input_names=input_names,
        response_names=response_names)

    # create trainer and fit metamodel to the dataset
    result = Trainer().fit(metamodel, dataset)

    print('score:', result.score)

    # score: 0.682539990982
    assert result.score > 0.68
    assert result.score < 0.69
Example #6
#selecting based on best performance
#    predictors = np.column_stack((NBO[:,0],sa[:,0],sa[:,1],bv[:,0],CoHOMO[:,0],CoHOMO[:,1],CoHOMO[:,2],CoLUMO[:,0],CoLUMO[:,1],ba[:,0],ba[:,1],ba[:,2],ba[:,3],ba[:,4],ba[:,5],lt[:,0],lt[:,1],lt[:,2],lt[:,4],lt[:,5]))
#######Training targets  ###
#    hydricities = CoHOMO[:,1]
#    hyduns = np.column_stack((therm[:,1])).reshape((-1,1))
#    scaler = StandardScaler()
#    hydricities2 = hydricities.reshape((-1,1))
#    hydricities=scale(hydricities2)
#    print(hyd1)

    # compound features
    polyFeatures = PolynomialFeatures(degree=1, interaction_only=False)
    regressor = make_pipeline(polyFeatures, StandardScaler(),
                              LassoCV(max_iter=60000, cv=KFold(n_splits=5, shuffle=True)))
#    regressor = make_pipeline(polyFeatures, LassoCV(max_iter=60000, cv=KFold(n_splits=5, shuffle=True)))
#    regressor = make_pipeline(polyFeatures, StandardScaler(), LassoCV(max_iter=60000))
#    regressor = make_pipeline(polyFeatures, StandardScaler(), LassoCV(max_iter=60000))
#    regressor = make_pipeline(polyFeatures, StandardScaler(), Ridge())
#    regressor = make_pipeline(polyFeatures, StandardScaler(), Lasso(alpha=0.035, max_iter=70000))#, fit_intercept=True))
#    regressor = make_pipeline(polyFeatures, StandardScaler(), Lasso(alpha=0, max_iter=70000))#, fit_intercept=True))
#    regressor = make_pipeline(polyFeatures, StandardScaler(), LinearRegression())
#    regressor = make_pipeline(polyFeatures, LinearRegression())
#    regressor = RandomForestRegressor(oob_score=True,n_estimators=2000)
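The snippet stops after constructing `regressor`; a plausible next step would be to fit and cross-validate it (`predictors` and `hydricities` follow the commented-out naming above and are assumptions):

from sklearn.model_selection import cross_val_score

regressor.fit(predictors, hydricities)
scores = cross_val_score(regressor, predictors, hydricities,
                         cv=KFold(n_splits=5, shuffle=True),
                         scoring='neg_mean_squared_error')
print(scores.mean())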
Example #7
boston = load_boston()
#print(boston)
# The DESCR attribute gives a detailed description of the dataset: 14 columns,
# of which the first 13 are features and the last is the label.
#print(boston.DESCR)
# boston.data and boston.target hold the features and the labels, respectively.
#print(boston.data)
#print(boston.target)
# Split the dataset.
X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                    boston.target,
                                                    test_size=0.2,
                                                    random_state=2)

# Adding polynomial features helps the linear regression model fit the data
# better; raising the degree improves the training-set fit but easily overfits.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)  # reuse the transformer fitted on the training set
# Polynomial linear regression.
# (`normalize=True` was deprecated and later removed in scikit-learn 1.2.)
model2 = LinearRegression(normalize=True)
model2.fit(X_train_poly, y_train)
multiScore = model2.score(X_test_poly, y_test)
print(multiScore)

# Test the model and evaluate the fitted values with the mean squared error (MSE).
y_pred = model2.predict(X_test_poly)
print("MSE:", metrics.mean_squared_error(y_test, y_pred))

# Cross-validation.
# (Note: this refits model2 on the raw features, not the polynomial ones.)
predicted = cross_val_predict(model2, boston.data, boston.target, cv=10)
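The out-of-fold predictions are never scored; one way to evaluate them (a sketch, not part of the original):

from sklearn.metrics import r2_score

print("cross-validated R^2:", r2_score(boston.target, predicted))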
Example #8
    def __init__(self, degree):
        self._poly = PolynomialFeatures(degree=degree)
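The rest of this wrapper class is not shown; a minimal sketch of the delegating methods it presumably exposes (the method names are assumptions):

    def fit_transform(self, X):
        # delegate to the wrapped PolynomialFeatures instance
        return self._poly.fit_transform(X)

    def transform(self, X):
        return self._poly.transform(X)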
Example #9
with open(filename, "r") as filestream:
    for line in filestream:
        current = line.split(",")
        Z.append(current)  # keep every field of the row

#Shuffle Matrix for Cross Validation
Z = np.asarray(Z)
np.random.shuffle(Z)

#Split Matrix in X,Y (splitting twice at the same index makes B an empty
#middle slice; X gets all but the last column, Y gets the last column)
X, B, Y = np.hsplit(Z, [Z.shape[1] - 1, Z.shape[1] - 1])

#Polynomial features (degree=1 only prepends a bias column; raise the degree for non-linear terms)
X = PolynomialFeatures(1).fit_transform(X)

#Get Float Data (np.float was removed in NumPy 1.24; use the builtin float)
X = X.astype(float)
classes = np.unique(Y)
for i in range(0, len(classes)):
    classes[i] = classes[i].strip()
classes = np.unique(classes)

y = []
for i in range(0, len(Y)):
    for j in range(0, len(classes)):
        if (Y[i].item(0).strip() == classes[j]):
            y.append(j)
Y = np.asarray(y)
Y = Y.astype(float)
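For reference, the class-indexing block above (from `classes = np.unique(Y)` through the loop, operating on the original string column Y) can be collapsed with `np.unique(return_inverse=True)`, which also sorts the labels; a behavior-equivalent sketch:

labels = np.array([v.item(0).strip() for v in Y])
classes, indices = np.unique(labels, return_inverse=True)
Y = indices.astype(float)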
Example #10
def validate(params):
    transf_type = params['transf_type']

    if transf_type == 'drop':
        transf = FunctionTransformer(drop_transform, validate=False)
    elif transf_type == 'dr+inp+sc+pca':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
            StandardScaler(),
            PCA(n_components=params['n_pca_components']),
        )
    elif transf_type == 'dr+inp':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
        )
    elif transf_type == 'dr+inp+sc':
        transf = make_pipeline(drop_transform, SimpleImputer(),
                               StandardScaler())
    elif transf_type == 'union':
        transf = create_union_transf(params)
    elif transf_type == 'poly_kbest':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
            StandardScaler(),
            PolynomialFeatures(degree=2, interaction_only=True),
            SelectKBest(f_regression, k=params['best_features']),
        )
    else:
        raise AttributeError(f'unknown transformer type: {transf_type}')

    est_type = params['est_type']

    if est_type == 'xgboost':
        est = create_xgb_est(params)
    elif est_type == 'gblinear':
        est = create_gblinear_est(params)
    elif est_type == 'exttree':
        est = ExtraTreesRegressor(n_estimators=params['n_estimators'],
                                  n_jobs=-1)
    elif est_type == 'gp':
        est = GaussianProcessRegressor()
    elif est_type == 'ridge':
        est = Ridge(alpha=params['alpha'])
    else:
        raise AttributeError(f'unknown estimator type: {est_type}')

    if params['bagging']:
        # the constructed regressor must be kept, otherwise bagging is a no-op
        est = BaggingRegressor(est,
                               n_estimators=params['n_bag_estimators'],
                               max_features=1.,
                               max_samples=1.)

    pl = make_pipeline(transf, est)

    if params['per_group_regr']:
        pl = PerGroupRegressor(estimator=pl,
                               split_condition=['os', 'cpuFreq', 'memSize_MB'],
                               n_jobs=1,
                               verbose=1)

    return cv_test(pl, n_folds=params['n_folds'])
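For illustration, a hypothetical `params` dictionary that would exercise the `poly_kbest` transformer with a ridge estimator (every value below is made up):

params = {
    'transf_type': 'poly_kbest',
    'best_features': 50,
    'est_type': 'ridge',
    'alpha': 1.0,
    'bagging': False,
    'per_group_regr': False,
    'n_folds': 5,
}
score = validate(params)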
Example #11
for i in range(7):  # loop header restored; weekdays assumed to be coded 0-6
    data.loc[data['Weekday'] == i, 'expensive than average weekday'] = data.loc[data['Weekday'] == i, 'Price'] - \
                                                                       data.loc[data['Weekday'] == i, 'Price'].mean()
for i in range(1, 366):
    data.loc[data['Date'] == i, 'expensive than average date'] = data.loc[data['Date'] == i, 'Price'] - \
                                                                      data.loc[data['Date'] == i, 'Price'].mean()
for i in range(2):
    data.loc[data['Apartment'] == i, 'expensive than average apartment'] = data.loc[data['Apartment'] == i, 'Price'] - \
                                                                      data.loc[data['Apartment'] == i, 'Price'].mean()
for i in range(1, 5):
    data.loc[data['Beds'] == i, 'expensive than average bed'] = data.loc[data['Beds'] == i, 'Price'] - \
                                                                data.loc[data['Beds'] == i, 'Price'].mean()
threshold1 = Binarizer(threshold=3.0)
res1 = pd.DataFrame(threshold1.transform(data['Review'].values.reshape(-1, 1)))
threshold2 = Binarizer(threshold=80)
res2 = pd.DataFrame(threshold2.transform(data['Price'].values.reshape(-1, 1)))
pf = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

res3 = pd.DataFrame(
    pf.fit_transform(
        data[['Apartment', 'Beds', 'Review', 'Pic Quality', 'Price']]))

encoder = OneHotEncoder()
data_region1hot = encoder.fit_transform(data['Region'].values.reshape(-1, 1))
data_region = pd.DataFrame(data_region1hot.toarray())
data_weekday1hot = encoder.fit_transform(data['Weekday'].values.reshape(-1, 1))
data_weekday = pd.DataFrame(data_weekday1hot.toarray())
data_reformed = pd.concat(
    [data.drop(columns=['ID']), data_region, data_weekday, res1, res2, res3],
    axis=1)

Seed = 40
Example #12
def create_model(
    model_type,
    feature_scaling=False,
    polynomial_degree=1,
    cross_validation=False,
    alpha=1.0,
    C=None,
    kernel=None,
    svr_epsilon=None,
    svr_degree=None,
    svr_gamma=None,
    svr_coef0=None,
    sparse=False,
):
    """ Creates a new model of the specified type.

    Args:
        model_type (str): The type of model to create. Use one of the MODEL_TYPE_X constants.
        feature_scaling (bool): If feature scaling is to be used.
        polynomial_degree (int): If higher than 1, polynomial feature transformation will be applied.
        cross_validation (bool): If cross validation is to be applied, if applicable to the model type.
        alpha (float): The regularization parameter. Will only be used if applicable to the model type.
        C: The regularization parameter for SVR. Will only be used if applicable to the model type.
        kernel (str): The kernel to use, if applicable to the model type.
        sparse (bool): If a sparse feature matrix is used.
        svr_epsilon (float): Epsilon parameter for SVR. Specifies the epsilon tube. (see sklearn for more info)
        svr_degree (int): Polynomial degree parameter for the SVR kernel 'poly'
        svr_gamma (float): Kernel coefficient for SVR kernels 'rbf', 'poly' and 'sigmoid'
        svr_coef0 (float): Independent term (or bias) for SVR kernels 'poly' and 'sigmoid'

    Returns:
        (sklearn.pipeline.Pipeline) The estimator model.
    """
    assert polynomial_degree > 0, "Polynomial degree must be higher than 0!"
    model_type = model_type.upper()
    logging.debug("Creating model with type %s" % model_type)
    if model_type == MODEL_TYPE_LINREG:
        model = create_linear_regression_model()
    elif model_type == MODEL_TYPE_RIDREG:
        if cross_validation:
            model = create_ridge_cv_model(alpha)
        else:
            model = create_ridge_model(alpha)
    elif model_type == MODEL_TYPE_SVR:
        if cross_validation:
            model = create_svr_cv_model(C, kernel, svr_epsilon, svr_degree,
                                        svr_gamma, svr_coef0)
        else:
            model = create_svr_model(C, kernel, svr_epsilon, svr_degree,
                                     svr_gamma, svr_coef0)
    else:
        raise ValueError("The model type %s is not supported." % model_type)

    steps = []
    if polynomial_degree > 1:
        if not sparse:
            steps.append(
                ("poly", PolynomialFeatures(degree=polynomial_degree)))
        else:
            logging.warning(
                "Polynomial Features for sparse matrices are not supported!")
    if feature_scaling:
        if sparse:
            scaler = SparseScaler()
        else:
            scaler = StandardScaler()
        steps.append(("scale", scaler))
    steps.append((model_type, model))

    return Pipeline(steps)
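A hypothetical call, assuming the `MODEL_TYPE_RIDREG` constant and the factory helpers referenced above are defined elsewhere in the module:

pipeline = create_model(MODEL_TYPE_RIDREG,
                        feature_scaling=True,
                        polynomial_degree=2,
                        cross_validation=True,
                        alpha=0.5)
pipeline.fit(X_train, y_train)  # X_train/y_train are placeholders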