def test_polynomial_features(): """Test Polynomial Features""" X1 = np.arange(6)[:, np.newaxis] P1 = np.hstack([np.ones_like(X1), X1, X1 ** 2, X1 ** 3]) deg1 = 3 X2 = np.arange(6).reshape((3, 2)) x1 = X2[:, :1] x2 = X2[:, 1:] P2 = np.hstack([x1 ** 0 * x2 ** 0, x1 ** 1 * x2 ** 0, x1 ** 0 * x2 ** 1, x1 ** 2 * x2 ** 0, x1 ** 1 * x2 ** 1, x1 ** 0 * x2 ** 2]) deg2 = 2 for (deg, X, P) in [(deg1, X1, P1), (deg2, X2, P2)]: P_test = PolynomialFeatures(deg, include_bias=True).fit_transform(X) assert_array_almost_equal(P_test, P) P_test = PolynomialFeatures(deg, include_bias=False).fit_transform(X) assert_array_almost_equal(P_test, P[:, 1:]) interact = PolynomialFeatures(2, interaction_only=True, include_bias=True) X_poly = interact.fit_transform(X) assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]]) assert_raises(ValueError, interact.transform, X[:, 1:])
class PolyFeatureGenerator(TransformerMixin):
    def __init__(self, degree):
        self._poly = PolynomialFeatures(degree=degree)

    def transform(self, df, *_):
        df_poly = self._poly.transform(df)
        df_poly = pd.DataFrame(df_poly, columns=self._poly.get_feature_names())
        df_poly.index = df.index
        df = pd.concat([df, df_poly], axis=1)
        return df

    def fit(self, df, *_):
        self._poly.fit(df)
        return self
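# A minimal usage sketch for PolyFeatureGenerator above; the DataFrame and its
# column names are illustrative only. Note that get_feature_names() was removed
# in scikit-learn 1.2 (use get_feature_names_out() there).
import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [4.0, 5.0, 6.0]})
gen = PolyFeatureGenerator(degree=2)
out = gen.fit(df).transform(df)
print(out.shape)  # original columns plus the generated polynomial columns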
def polynomial(self):
    poly = PolynomialFeatures(degree=3)
    self.training_order_start_end_districts_and_time = poly.fit_transform(
        self.training_order_start_end_districts_and_time,
        self.training_number_of_orders)
    predict = poly.transform(
        self.testing_order_start_end_districts_and_time)

    clf = linear_model.LinearRegression()
    clf.fit(self.training_order_start_end_districts_and_time,
            self.training_number_of_orders)
    predicted_number_of_orders = clf.predict(predict)

    # Mean squared prediction error on the held-out rides.
    current_ride_prediction_error = numpy.mean(
        (predicted_number_of_orders - self.testing_number_of_orders) ** 2)
    print(current_ride_prediction_error)
    print(clf.coef_)
def __gen_model(self, model=LinearRegression()):
    # NOTE: the `model` argument is currently ignored; it is immediately
    # replaced by the polynomial-regression pipeline below.
    model = Pipeline([('poly', PolynomialFeatures(degree=3)),
                      ('linear', LinearRegression(fit_intercept=False))])
    X_train, y_train, _ = self.getDataSet()
    model.fit(X_train, y_train)
    # print("model coef:", model.named_steps['linear'].coef_)
    self.model = model
class PolynomialFeaturesImpl():
    def __init__(self, degree=2, interaction_only=False, include_bias=True):
        self._hyperparams = {
            'degree': degree,
            'interaction_only': interaction_only,
            'include_bias': include_bias}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        return self._sklearn_model.transform(X)
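# A usage sketch for the wrapper above, assuming SKLModel is an alias for
# sklearn.preprocessing.PolynomialFeatures (as the hyperparameter names suggest):
import numpy as np

X = np.arange(6).reshape(3, 2)
impl = PolynomialFeaturesImpl(degree=2, include_bias=False)
X_poly = impl.fit(X).transform(X)
print(X_poly.shape)  # (3, 5): x1, x2, x1^2, x1*x2, x2^2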
class CreatePolynomialFeatures(CreateModel):
    def fit(self, data, args):
        self.model = PolynomialFeatures()
        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)
        return t.interval

    def test(self, data):
        assert self.model is not None
        return self.model.transform(data.X_test)

    def predict(self, data):
        with Timer() as t:
            self.predictions = self.test(data)
        data.learning_task = LearningTask.REGRESSION
        return t.interval
def test_ols_with_boston_dataset():
    # Load the Boston housing dataset.
    dataset = load_boston()

    # Create the metamodel.
    input_names = list(dataset.inputs)
    response_names = list(dataset.responses)
    metamodel = metamodels.OLSModel(
        preprocessors=[PolynomialFeatures(degree=2)],
        input_names=input_names,
        response_names=response_names)

    # Create a trainer and fit the metamodel to the dataset.
    result = Trainer().fit(metamodel, dataset)
    print('score:', result.score)  # score: 0.682539990982

    assert result.score > 0.68
    assert result.score < 0.69
# NOTE: load_boston and the LinearRegression `normalize` parameter were removed
# in scikit-learn 1.2; this snippet targets older releases.
boston = load_boston()
# print(boston)
# The DESCR attribute gives a detailed description of the dataset: 14 columns,
# the first 13 are features and the last one is the label.
# print(boston.DESCR)
# boston.data and boston.target store the features and labels respectively.
# print(boston.data)
# print(boston.target)

# Split the dataset.
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, test_size=0.2, random_state=2)

# Adding polynomial features lets the linear regression model fit the data
# better. As the number of polynomial terms grows, the training fit keeps
# improving, but this easily leads to overfitting.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
# Only transform the test set; refitting on it would leak test data.
X_test_poly = poly.transform(X_test)

# Polynomial linear regression.
model2 = LinearRegression(normalize=True)
model2.fit(X_train_poly, y_train)
multiScore = model2.score(X_test_poly, y_test)
print(multiScore)

# Evaluate the model on the test set using the mean squared error (MSE).
y_pred = model2.predict(X_test_poly)
print("MSE:", metrics.mean_squared_error(y_test, y_pred))

# Cross-validation.
predicted = cross_val_predict(model2, boston.data, boston.target, cv=10)
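# The fit-on-train / transform-on-test discipline above is less error-prone when
# encoded as a single Pipeline; a minimal sketch reusing the split from above:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# The pipeline fits PolynomialFeatures on the training data only; score() and
# predict() then reuse the already-fitted transformer on the test data.
pipe = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    LinearRegression())
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))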
vectorizer = TfidfVectorizer(analyzer="word",
                             vocabulary=wordslist.keys(),
                             ngram_range=(1, 4),
                             stop_words=stopwordlist)
X_main = vectorizer.fit_transform(corpus)
print("Main Words Shape", X_main.shape)

vectPoly = TfidfVectorizer(analyzer="word",
                           vocabulary=polywords,
                           ngram_range=(1, 4),
                           use_idf=True,
                           stop_words=stopwordlist)
poly = vectPoly.fit_transform(corpus)
print("Poly Words Shape", poly.shape)

polyFeatures = PolynomialFeatures(degree=2, interaction_only=True,
                                  include_bias=False)
X_poly = polyFeatures.fit_transform(poly.todense())
print("Poly Words into Poly Features Shape", X_poly.shape)

X = np.concatenate((X_main.toarray(), X_poly), axis=1)
# X = np.concatenate((X_main.toarray(), power(poly.toarray(), 3)), axis=1)
print("Matrix Shape", X.shape)

'''
outfile = open('OUTPUT_6_articles_TFIDF.txt', 'w')
for item in X:
    temp = str(item)
    outfile.write(temp)
outfile.close()
'''
Z = []
with open(filename, "r") as filestream:
    for line in filestream:
        Z.append(line.split(","))

# Shuffle the matrix for cross-validation.
Z = np.asarray(Z)
np.random.shuffle(Z)

# Split the matrix into X (features) and Y (labels); the middle split is empty.
X, B, Y = np.hsplit(Z, [Z.shape[1] - 1, Z.shape[1] - 1])

# Non-linear features (degree 1 only prepends a bias column).
X = PolynomialFeatures(1).fit_transform(X)

# Cast the features to float (np.float was removed in NumPy 1.24).
X = X.astype(float)

classes = np.unique(Y)
for i in range(0, len(classes)):
    classes[i] = classes[i].strip()
classes = np.unique(classes)

# Map each label string to its class index.
y = []
for i in range(0, len(Y)):
    for j in range(0, len(classes)):
        if Y[i].item(0).strip() == classes[j]:
            y.append(j)
Y = np.asarray(y)
Y = Y.astype(float)
data = np.loadtxt(dataFile, delimiter=',', skiprows=1,
                  usecols=(16, 18, 21, 24, 8, 11, 12, 9, 10))
CoHOMO = np.loadtxt(dataFile, delimiter=',', skiprows=2, usecols=(68, 70))
therm = np.loadtxt(dataFile, delimiter=',', skiprows=2, usecols=(8, 9, 10))

# Single-feature design matrix. Note the trailing comma: without it,
# np.column_stack iterates over the scalars and returns a one-row array.
predictors = np.column_stack((CoHOMO[:, 1],))
# predictors = np.column_stack((data[:, 0], data[:, 2], data[:, 3], data[:, 4]))  # LowdwinH2, Buried, VBuried, pka, r2=0.70951(hyd), r2(h2)=0.2226
# predictors = np.column_stack((data[:, -4], data[:, -3], data[:, 4]))  # tau, tau, pka r2=0.7004262
# hydricities = data[:, -1]  # actually for H2 binding
hydricities = therm[:, 1]

# Compound features: degree=1 adds only a bias column, so this pipeline is
# effectively plain linear regression.
polyFeatures = PolynomialFeatures(degree=1, interaction_only=True)
regressor = make_pipeline(polyFeatures, LinearRegression())
# regressor = LinearRegression()
regressor.fit(predictors, hydricities)
predictions = regressor.predict(predictors)
print('R^2: ', regressor.score(predictors, hydricities))
scatterPlot.plotScatterPlot(hydricities, predictions,
                            (Path.home() / 'Desktop' / 'ianPredictions'))
# print(regressor.coef_, regressor.intercept_)
def validate(params):
    transf_type = params['transf_type']

    if transf_type == 'drop':
        transf = FunctionTransformer(drop_transform, validate=False)
    elif transf_type == 'dr+inp+sc+pca':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
            StandardScaler(),
            PCA(n_components=params['n_pca_components']),
        )
    elif transf_type == 'dr+inp':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
        )
    elif transf_type == 'dr+inp+sc':
        transf = make_pipeline(drop_transform, SimpleImputer(), StandardScaler())
    elif transf_type == 'union':
        transf = create_union_transf(params)
    elif transf_type == 'poly_kbest':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
            StandardScaler(),
            PolynomialFeatures(degree=2, interaction_only=True),
            SelectKBest(f_regression, k=params['best_features']),
        )
    else:
        raise AttributeError(f'unknown transformer type: {transf_type}')

    est_type = params['est_type']
    if est_type == 'xgboost':
        est = create_xgb_est(params)
    elif est_type == 'gblinear':
        est = create_gblinear_est(params)
    elif est_type == 'exttree':
        est = ExtraTreesRegressor(n_estimators=params['n_estimators'], n_jobs=-1)
    elif est_type == 'gp':
        est = GaussianProcessRegressor()
    elif est_type == 'ridge':
        est = Ridge(alpha=params['alpha'])
    else:
        raise AttributeError(f'unknown estimator type: {est_type}')

    if params['bagging']:
        # Keep the bagged estimator (the original constructed it but discarded it).
        est = BaggingRegressor(est, n_estimators=params['n_bag_estimators'],
                               max_features=1., max_samples=1.)

    pl = make_pipeline(transf, est)

    if params['per_group_regr']:
        pl = PerGroupRegressor(
            estimator=pl,
            split_condition=['os', 'cpuFreq', 'memSize_MB'],
            n_jobs=1,
            verbose=1)

    return cv_test(pl, n_folds=params['n_folds'])
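# A hypothetical params dict exercising the poly_kbest/ridge path of validate()
# above; every key is inferred from the lookups in the function body.
params = {
    # Transformer branch: drop -> impute -> scale -> poly -> k-best.
    'transf_type': 'poly_kbest',
    'best_features': 50,
    # Estimator branch.
    'est_type': 'ridge',
    'alpha': 1.0,
    # Wrappers and evaluation.
    'bagging': False,
    'per_group_regr': False,
    'n_folds': 5,
}
score = validate(params)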
for i in range(7):  # loop header truncated in the source; weekday range assumed 0-6
    data.loc[data['Weekday'] == i, 'expensive than average weekday'] = \
        data.loc[data['Weekday'] == i, 'Price'] - \
        data.loc[data['Weekday'] == i, 'Price'].mean()

for i in range(1, 366):
    data.loc[data['Date'] == i, 'expensive than average date'] = \
        data.loc[data['Date'] == i, 'Price'] - \
        data.loc[data['Date'] == i, 'Price'].mean()

for i in range(2):
    data.loc[data['Apartment'] == i, 'expensive than average apartment'] = \
        data.loc[data['Apartment'] == i, 'Price'] - \
        data.loc[data['Apartment'] == i, 'Price'].mean()

for i in range(1, 5):
    data.loc[data['Beds'] == i, 'expensive than average bed'] = \
        data.loc[data['Beds'] == i, 'Price'] - \
        data.loc[data['Beds'] == i, 'Price'].mean()

# Binary indicator features.
threshold1 = Binarizer(threshold=3.0)
res1 = pd.DataFrame(threshold1.transform(data['Review'].values.reshape(-1, 1)))
threshold2 = Binarizer(threshold=80)
res2 = pd.DataFrame(threshold2.transform(data['Price'].values.reshape(-1, 1)))

# Pairwise interaction features (no squared terms, no bias column).
pf = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
res3 = pd.DataFrame(pf.fit_transform(
    data[['Apartment', 'Beds', 'Review', 'Pic Quality', 'Price']]))

# One-hot encode the categorical columns.
encoder = OneHotEncoder()
data_region1hot = encoder.fit_transform(data['Region'].values.reshape(-1, 1))
data_region = pd.DataFrame(data_region1hot.toarray())
data_weekday1hot = encoder.fit_transform(data['Weekday'].values.reshape(-1, 1))
data_weekday = pd.DataFrame(data_weekday1hot.toarray())

data_reformed = pd.concat(
    [data.drop(columns=['ID']), data_region, data_weekday, res1, res2, res3],
    axis=1)

Seed = 40
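# To keep the interaction columns in res3 identifiable, the transformed frame
# can carry the generated feature names; a sketch assuming scikit-learn >= 1.0
# for get_feature_names_out():
cols = ['Apartment', 'Beds', 'Review', 'Pic Quality', 'Price']
pf = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
res3 = pd.DataFrame(pf.fit_transform(data[cols]),
                    columns=pf.get_feature_names_out(cols),
                    index=data.index)
# Columns read e.g. 'Apartment', ..., 'Apartment Beds', 'Beds Review', ...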
# Selecting based on best performance:
# predictors = np.column_stack((NBO[:, 0], sa[:, 0], sa[:, 1], bv[:, 0],
#                               CoHOMO[:, 0], CoHOMO[:, 1], CoHOMO[:, 2],
#                               CoLUMO[:, 0], CoLUMO[:, 1],
#                               ba[:, 0], ba[:, 1], ba[:, 2], ba[:, 3], ba[:, 4], ba[:, 5],
#                               lt[:, 0], lt[:, 1], lt[:, 2], lt[:, 4], lt[:, 5]))

####### Training targets #######
# hydricities = CoHOMO[:, 1]
# hyduns = np.column_stack((therm[:, 1])).reshape((-1, 1))
# scaler = StandardScaler()
# hydricities2 = hydricities.reshape((-1, 1))
# hydricities = scale(hydricities2)
# print(hyd1)

# Compound features.
polyFeatures = PolynomialFeatures(degree=1, interaction_only=False)
regressor = make_pipeline(polyFeatures, StandardScaler(),
                          LassoCV(max_iter=60000,
                                  cv=KFold(n_splits=5, shuffle=True)))
# Alternatives tried:
# regressor = make_pipeline(polyFeatures, LassoCV(max_iter=60000, cv=KFold(n_splits=5, shuffle=True)))
# regressor = make_pipeline(polyFeatures, StandardScaler(), LassoCV(max_iter=60000))
# regressor = make_pipeline(polyFeatures, StandardScaler(), Ridge())
# regressor = make_pipeline(polyFeatures, StandardScaler(), Lasso(alpha=0.035, max_iter=70000))  # , fit_intercept=True))
# regressor = make_pipeline(polyFeatures, StandardScaler(), Lasso(alpha=0, max_iter=70000))  # , fit_intercept=True))
# regressor = make_pipeline(polyFeatures, StandardScaler(), LinearRegression())
# regressor = make_pipeline(polyFeatures, LinearRegression())
# regressor = RandomForestRegressor(oob_score=True, n_estimators=2000)
def __init__(self, degree=2, interaction_only=False, include_bias=True):
    self._hyperparams = {
        'degree': degree,
        'interaction_only': interaction_only,
        'include_bias': include_bias}
    self._wrapped_model = SKLModel(**self._hyperparams)
def create_model(
    model_type,
    feature_scaling=False,
    polynomial_degree=1,
    cross_validation=False,
    alpha=1.0,
    C=None,
    kernel=None,
    svr_epsilon=None,
    svr_degree=None,
    svr_gamma=None,
    svr_coef0=None,
    sparse=False,
):
    """
    Creates a new model of the specified type.

    Args:
        model_type (str): The type of model to create. Use one of the MODEL_TYPE_X constants.
        feature_scaling (bool): Whether feature scaling is to be used.
        polynomial_degree (int): If higher than 1, polynomial feature transformation will be applied.
        cross_validation (bool): Whether cross-validation is to be applied, if applicable to the model type.
        alpha (float): The regularization parameter. Only used if applicable to the model type.
        C: The regularization parameter for SVR. Only used if applicable to the model type.
        kernel (str): The kernel to use, if applicable to the model type.
        sparse (bool): Whether a sparse feature matrix is used.
        svr_epsilon (float): Epsilon parameter for SVR; specifies the epsilon tube (see sklearn for more info).
        svr_degree (int): Polynomial degree parameter for the SVR kernel 'poly'.
        svr_gamma (float): Kernel coefficient for SVR kernels 'rbf', 'poly' and 'sigmoid'.
        svr_coef0 (float): Independent term (or bias) for SVR kernels 'poly' and 'sigmoid'.

    Returns:
        (sklearn.pipeline.Pipeline) The estimator model.
    """
    assert polynomial_degree > 0, "Polynomial degree must be higher than 0!"
    model_type = model_type.upper()
    logging.debug("Creating model with type %s" % model_type)

    if model_type == MODEL_TYPE_LINREG:
        model = create_linear_regression_model()
    elif model_type == MODEL_TYPE_RIDREG:
        if cross_validation:
            model = create_ridge_cv_model(alpha)
        else:
            model = create_ridge_model(alpha)
    elif model_type == MODEL_TYPE_SVR:
        if cross_validation:
            model = create_svr_cv_model(C, kernel, svr_epsilon, svr_degree,
                                        svr_gamma, svr_coef0)
        else:
            model = create_svr_model(C, kernel, svr_epsilon, svr_degree,
                                     svr_gamma, svr_coef0)
    else:
        raise ValueError("The model type %s is not supported." % model_type)

    steps = []

    if polynomial_degree > 1:
        if not sparse:
            steps.append(("poly", PolynomialFeatures(degree=polynomial_degree)))
        else:
            logging.warning(
                "Polynomial Features for sparse matrices are not supported!")

    if feature_scaling:
        if sparse:
            scaler = SparseScaler()
        else:
            scaler = StandardScaler()
        steps.append(("scale", scaler))

    steps.append((model_type, model))

    return Pipeline(steps)
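# A hypothetical invocation of create_model() above, assuming MODEL_TYPE_RIDREG
# selects the ridge branch and X_train/y_train/X_test already exist:
pipeline = create_model(
    MODEL_TYPE_RIDREG,       # ridge regression branch
    feature_scaling=True,    # appends a StandardScaler step
    polynomial_degree=2,     # appends a PolynomialFeatures step
    cross_validation=True,   # uses create_ridge_cv_model(alpha)
    alpha=0.5,
)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)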