def learning_curve(classifier, X, y, cv, sample_sizes, degree=1, pickle_path=None, verbose=True):
    """Compute one learning curve per cross-validation split.

    For every ``(train_index, test_index)`` pair in ``cv`` the classifier is
    refitted on the first ``sample`` training rows for each ``sample`` in
    ``sample_sizes`` and evaluated on the whole test fold; the score is
    ``balanced_accuracy_expected`` applied to the confusion matrix.

    :param classifier: estimator exposing ``fit(X, y)`` and ``predict(X)``
    :param X: feature array, indexable with the index arrays yielded by ``cv``
    :param y: target array aligned with ``X``
    :param cv: iterable of ``(train_index, test_index)`` pairs
    :param sample_sizes: iterable of training-set sizes to evaluate
    :param degree: when greater than 1 the features are expanded with
        ``PolynomialFeatures(degree)``, fitted on the training fold only
    :param pickle_path: optional path; when given the curves are pickled there
    :param verbose: print the fold number after each fold is done
    :return: list with one entry per fold, each a list of scores
        (one score per entry of ``sample_sizes``)
    """
    learning_curves = []
    for i, (train_index, test_index) in enumerate(cv):
        X_train = X[train_index]
        X_test = X[test_index]
        y_train = y[train_index]
        y_test = y[test_index]

        if degree > 1:
            poly = PolynomialFeatures(degree=degree, interaction_only=False, include_bias=True)
            X_train = poly.fit_transform(X_train)
            X_test = poly.transform(X_test)

        lc = []
        for sample in sample_sizes:
            classifier.fit(X_train[:sample], y_train[:sample])

            # apply classifier on test set
            y_pred = classifier.predict(X_test)
            confusion = metrics.confusion_matrix(y_test, y_pred)
            lc.append(balanced_accuracy_expected(confusion))

        learning_curves.append(lc)
        if verbose:
            print(i, end=' ')

    # pickle learning curve
    if pickle_path:
        with open(pickle_path, 'wb') as f:
            pickle.dump(learning_curves, f, protocol=4)

    if verbose:
        print()

    # Previously the result was only reachable through the pickle file.
    return learning_curves
def get_polynomial_features(df, interaction_sign=' x ', **kwargs):
    """
    Expand a data frame into labelled polynomial features.

    :param df: DataFrame to create new features from
    :param interaction_sign: separator handed to ``_get_polynomial_features``
        when it builds the feature labels
    :param kwargs: Arguments for PolynomialFeatures
    :return: DataFrame with the transformed values and the generated labels
        as column names
    """
    transformer = PolynomialFeatures(**kwargs)
    source_names = df.columns.tolist()
    fitted = transformer.fit(df)
    labels = _get_polynomial_features(source_names, fitted, interaction_sign=interaction_sign)
    values = transformer.transform(df)
    return pd.DataFrame(values, columns=labels)
class PolyFeatures(object):
    """Thin wrapper around a ``PolynomialFeatures`` transformer of fixed degree."""

    def __init__(self, degree):
        """Remember ``degree`` and build the wrapped transformer from it."""
        self.degree = degree
        self.poly = PolynomialFeatures(self.degree)

    def fit_transform(self, X):
        """Fit the wrapped transformer on ``X[:, :]`` and return its output."""
        all_rows_and_columns = X[:, :]
        return self.poly.fit_transform(all_rows_and_columns)

    def transform(self, x):
        """Apply the already fitted transformer to ``x[:, :]``."""
        all_rows_and_columns = x[:, :]
        return self.poly.transform(all_rows_and_columns)
def polynomialRegression():
    """Compare a linear and a quadratic regression on a small pizza data set.

    Fits ``LinearRegression`` on the raw diameter and on degree-2 polynomial
    features, plots both prediction curves together with the training points,
    then prints the raw/expanded feature matrices and the R-squared score of
    each model on the test data.  Returns nothing.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures

    X_train = [[6], [8], [10], [14], [18]]
    y_train = [[7], [9], [13], [17.5], [18]]
    X_test = [[6], [8], [11], [16]]
    y_test = [[8], [12], [15], [18]]

    # Straight-line model, evaluated on 100 evenly spaced points in [0, 26]
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    xx = np.linspace(0, 26, 100)
    yy = regressor.predict(xx.reshape(xx.shape[0], 1))
    plt.plot(xx, yy)

    # Quadratic model: the featurizer is fitted on the training data only
    quadratic_featurizer = PolynomialFeatures(degree=2)
    X_train_quadratic = quadratic_featurizer.fit_transform(X_train)
    X_test_quadratic = quadratic_featurizer.transform(X_test)
    regressor_quadratic = LinearRegression()
    regressor_quadratic.fit(X_train_quadratic, y_train)
    xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))
    plt.plot(xx, regressor_quadratic.predict(xx_quadratic), c='r', linestyle='--')

    plt.title("pizza on diameter")
    plt.xlabel("pizza in inch")
    plt.ylabel("px in usd")
    plt.axis([0, 25, 0, 25])
    plt.grid(True)
    plt.scatter(X_train, y_train)
    plt.show()

    # print() calls instead of Python 2 print statements, which are a
    # SyntaxError on Python 3.
    print(X_train)
    print(X_train_quadratic)
    print(X_test)
    print(X_test_quadratic)
    print("simple reg r-squared", regressor.score(X_test, y_test))
    print("Quadratic regression r-squared", regressor_quadratic.score(X_test_quadratic, y_test))
def test_polynomialfeatures_vs_sklearn():
    """Check that both PolynomialFeatures classes agree on the first trajectory."""
    # Compare msmbuilder.preprocessing.PolynomialFeatures
    # with sklearn.preprocessing.PolynomialFeatures

    # Reference transformer: fitted on all trajectories stacked into one array
    reference = PolynomialFeaturesR()
    reference.fit(np.concatenate(trajs))

    # Transformer under test: fitted on the list of trajectories directly
    candidate = PolynomialFeatures()
    candidate.fit(trajs)

    expected = reference.transform(trajs[0])
    actual = candidate.transform(trajs)[0]

    np.testing.assert_array_almost_equal(expected, actual)
def _polynomial_features(self, input_df):
    """Uses Scikit-learn's PolynomialFeatures to construct new degree-2 polynomial features from the existing feature set

    Parameters
    ----------
    input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
        Input DataFrame to expand

    Returns
    -------
    modified_df: pandas.DataFrame {n_samples, n_constructed_features + ['guess', 'group', 'class']}
        Returns a DataFrame containing the constructed features.  When there
        are no feature columns, or more than 700 of them, an unmodified copy
        of ``input_df`` is returned instead.

    """
    non_feature_columns = ['class', 'group', 'guess']
    training_features = input_df.loc[input_df['group'] == 'training'].drop(non_feature_columns, axis=1)

    n_features = len(training_features.columns.values)
    # Nothing to expand, or too many features to produce - skip this operator
    if n_features == 0 or n_features > 700:
        return input_df.copy()

    # The feature constructor must be fit on only the training data
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly.fit(training_features.values.astype(np.float64))
    all_features = input_df.drop(non_feature_columns, axis=1).values.astype(np.float64)
    constructed_features = poly.transform(all_features)

    modified_df = pd.DataFrame(data=constructed_features)
    for column in non_feature_columns:
        modified_df[column] = input_df[column].values

    # Constructed columns carry integer labels; turn them into zero-padded
    # strings so that every column name is a str.
    new_col_names = {
        column: str(column).zfill(10)
        for column in modified_df.columns.values
        if not isinstance(column, str)
    }
    modified_df.rename(columns=new_col_names, inplace=True)

    return modified_df.copy()
# Libraries used by this script
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Load the data: every column but the last is a feature, the last one is the target
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Hold out 20% of the rows as the test set (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Degree-4 polynomial regression, fitted on the training set
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
regressor.fit(X_poly, y_train)

# Predict the test set and print predictions next to the true values
X_test_poly = poly_reg.transform(X_test)
y_pred = regressor.predict(X_test_poly)
np.set_printoptions(precision=2)
predicted_column = y_pred.reshape(len(y_pred), 1)
actual_column = y_test.reshape(len(y_test), 1)
print(np.concatenate((predicted_column, actual_column), 1))

# R-squared of the predictions on the test set
r2 = r2_score(y_test, y_pred)
print(r2)
# Print the table header
for header_line in (
    ' Regressao Simples | Regressao RIDGE | Regressao LASSO ',
    ' ',
    ' Grau Erro IN Erro OUT | Erro IN Erro OUT | Erro IN Erro OUT',
    ' ---- ------- -------- | ------- -------- | ------- --------',
):
    print(header_line)

# Train and run the models for each degree and fill in the table
for degree in range(1, 5):

    # Turn the original attributes into polynomial attributes
    pf = PolynomialFeatures(degree)
    X_train_poly = pf.fit_transform(X_train_trans)
    X_test_poly = pf.transform(X_test_trans)

    # Train the plain polynomial regressor (fit() returns the estimator)
    lr = LinearRegression().fit(X_train_poly, y_train)

    # Train the polynomial regressor with Ridge regularisation, alpha = 90
    # (original author's note: 4.E+1 for Boston)
    lr_ridge = Ridge(alpha=90, max_iter=1000000).fit(X_train_poly, y_train)
# Degree 1: predictions of the already fitted linear model over the grid xx
yy = regression.predict(xx)

import matplotlib.pyplot as plt
plt.scatter(X_train, y_train)
plt1, = plt.plot(xx, yy, label='Degree=1')

# print() calls replace the Python 2 print statements (SyntaxError on
# Python 3); on Python 3 the printed text is unchanged.
print('The R-squared value of Linear Regression performing on the training data is ',
      regression.score(X_train, y_train))

from sklearn.preprocessing import PolynomialFeatures

# Degree 2: expand the training features, fit, and draw the curve over xx
poly2 = PolynomialFeatures(degree=2)
X_train_poly2 = poly2.fit_transform(X_train)
regression_poly2 = LinearRegression()
regression_poly2.fit(X_train_poly2, y_train)
xx_poly2 = poly2.transform(xx)
yy_poly2 = regression_poly2.predict(xx_poly2)
plt2, = plt.plot(xx, yy_poly2, label='Degree=2')
print('The R-squared value of Polynominal Regressor(Degree=2) performing on the training data is ',
      regression_poly2.score(X_train_poly2, y_train))

# Degree 4: same procedure
poly4 = PolynomialFeatures(degree=4)
X_train_poly4 = poly4.fit_transform(X_train)
regression_poly4 = LinearRegression()
regression_poly4.fit(X_train_poly4, y_train)
xx_poly4 = poly4.transform(xx)
yy_poly4 = regression_poly4.predict(xx_poly4)
data_set.loc[np.round(data_set['p_i'], 7) == np.round(p, 7)], ignore_index=False)  # tail of a statement that starts before this excerpt
# Test targets: the rows of `target` that share their index with X_test
y_test = target.loc[X_test.index]

#%% Train, test one-step ahead mode - Linear regression
# fit_intercept=0 is falsy, i.e. the model is fitted without an intercept term
Ridge_model = Ridge(alpha=1E-8, fit_intercept=0)
# Select the six lag columns in a fixed order for both training and test data
X_tr = X_train.reindex(
    ['p_f^(n-2)', 'p^(n-2)', 'p_f^(n-1)', 'p^(n-1)', 'p_f^(n)', 'p^(n)'], axis=1)
X_te = X_test.reindex(
    ['p_f^(n-2)', 'p^(n-2)', 'p_f^(n-1)', 'p^(n-1)', 'p_f^(n)', 'p^(n)'], axis=1)
# Degree-2 expansion without the bias column, fitted on the training data only
poly = PolynomialFeatures(2, include_bias=False).fit(X_tr)
## poly features
# NOTE(review): get_feature_names is deprecated/removed in recent scikit-learn
# releases in favour of get_feature_names_out - confirm the targeted version.
X_tr_poly = pd.DataFrame(poly.transform(X_tr),
                         columns=poly.get_feature_names(X_tr.columns))
X_te_poly = pd.DataFrame(poly.transform(X_te),
                         columns=poly.get_feature_names(X_te.columns))
## scalers
# Both scalers are fitted on the training data and then applied to the test data
sc_x = StandardScaler()
X_trp_scaled = sc_x.fit_transform(X_tr_poly)
X_tep_scaled = sc_x.transform(X_te_poly)
sc_y = StandardScaler()
# The targets are reshaped to a single column before scaling
y_tr_scaled = sc_y.fit_transform(y_train.to_numpy().reshape((-1, 1)))
y_te_scaled = sc_y.transform(y_test.to_numpy().reshape((-1, 1)))
# Keep both fitted scalers together under fixed keys
scaler = {'sc_x': sc_x, 'sc_y': sc_y}
## fitting
Ridge_model.fit(X_trp_scaled, y_tr_scaled)
Y = datas[names[4]]

# np.float was only an alias of the builtin float and has been removed from
# NumPy; the builtin yields the same float64 result.
X = X.astype(float)
Y = Y.astype(float)

# Split the data set into a training set and a test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# 1. Polynomial expansion of the features
# degree: order of the expansion, i.e. the highest power after the transform
poly = PolynomialFeatures(degree=3)
# fit_transform: learn the transformation from the given data, then apply it
# to that same data; the test set is only transformed.
X_train = poly.fit_transform(X_train, Y_train)
X_test = poly.transform(X_test)

# 2. Linear regression
# fit_intercept: whether to fit the intercept term (True by default)
algo = LinearRegression(fit_intercept=True)

# 7. Train the model
algo.fit(X_train, Y_train)

# 7.1 Inspect the fitted parameters
print("线性回归的各个特征属性对应的权重参数θ:{}".format(algo.coef_))
print("线性回归的截距项的值:{}".format(algo.intercept_))

# 8. Evaluate the model
y_hat = algo.predict(X_test)
print("在训练集上的模型效果(回归算法中为R2):{}".format(algo.score(X_train, Y_train)))
t_start = time.time()

# =============================================================================
# 0  Load the original features
# =============================================================================
print("0 读取原特征")
features_path = '../feature_file/data_w_tfidf(lda+lsa)+doc2vec.pkl'
# NOTE: unpickling executes code stored in the file - only load trusted files.
# The with-block closes the handle even when loading fails.
with open(features_path, 'rb') as f:
    x_train, y_train, x_test = pickle.load(f)

# =============================================================================
# 1  Build additional features with a polynomial expansion
# =============================================================================
print("1 使用多项式方法构造出更多的特征")
# degree controls the highest order; interaction_only keeps only products of
# distinct features and include_bias=False drops the constant column.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
x_train_new = poly.fit_transform(x_train)
x_test_new = poly.transform(x_test)

# =============================================================================
# 2  Save the constructed features locally
# =============================================================================
print("2 将构造好的特征保存至本地")
data = (x_train_new, y_train, x_test_new)
# The output goes to the current directory: input file name plus '_constr.pkl'
features_constr_path = features_path.split('/')[-1] + '_constr.pkl'
with open(features_constr_path, 'wb') as f_data:
    pickle.dump(data, f_data)

t_end = time.time()
print("构造特征完成,共耗时:{}min".format((t_end - t_start) / 60))
# Order-1 model: print its equation, score it and plot it against the test points
print("y1= {0} + {1} x".format(lr_model.intercept_[0], lr_model.coef_[0][0]))
xx = np.linspace(0, 26, 100)
yy = lr_model.predict(xx.reshape(xx.shape[0], 1))
lr_score = lr_model.score(X_test, y_test)
print("Linear regression (order 1) model score is: {0}".format(lr_score))
plt.plot(xx, yy)
plt.plot(X_test, y_test, "o")
plt.title("Linear regression (order 1) result")
plt.show()

# Order-5 model on polynomial features fitted on the training data
poly = PolynomialFeatures(degree=5)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
lr_5_model = LinearRegression()
lr_5_model.fit(X_train_poly, y_train)

# PolynomialFeatures keeps its default bias column, so for the single input
# feature used here the expanded columns are [1, x, x^2, ..., x^5]:
# coef_[0][0] belongs to the constant column and the coefficient of x^k is
# coef_[0][k].  The previous indices 0..4 shifted every term by one power.
print("y2= {0} + {1} x + {2} x*x + {3} x*x*x + {4} x*x*x*x +{5} x*x*x*x*x".format(
    lr_5_model.intercept_[0],
    lr_5_model.coef_[0][1],
    lr_5_model.coef_[0][2],
    lr_5_model.coef_[0][3],
    lr_5_model.coef_[0][4],
    lr_5_model.coef_[0][5]))

xx_poly = poly.transform(xx.reshape(xx.shape[0], 1))
yy_poly = lr_5_model.predict(xx_poly)
print("Linear regression (order 5) score is: {0}".format(lr_5_model.score(X_test_poly, y_test)))
plt.plot(xx, yy_poly)
plt.plot(X_test, y_test, "o")
runplt()
plt.plot(y_test, 'k-')

# Baseline: plain linear regression, its test-set predictions drawn in yellow
regressor = LinearRegression()
regressor.fit(X_train, y_train)
yy = regressor.predict(X_test)
plt.plot(yy, 'y-')

# Degree 2: featurizer fitted on the training data, predictions drawn in red
quadratic_featurizer = PolynomialFeatures(degree=2)
X_train_quadratic = quadratic_featurizer.fit_transform(X_train)
X_test_quadratic = quadratic_featurizer.transform(X_test)
regressor_quadratic = LinearRegression()
regressor_quadratic.fit(X_train_quadratic, y_train)
xx_quadratic = quadratic_featurizer.transform(X_test)
quadratic_prediction = regressor_quadratic.predict(xx_quadratic)
plt.plot(quadratic_prediction, 'r-')

# Degree 3: same procedure, predictions drawn in green
cubic_featurizer = PolynomialFeatures(degree=3)
X_train_cubic = cubic_featurizer.fit_transform(X_train)
X_test_cubic = cubic_featurizer.transform(X_test)
regressor_cubic = LinearRegression()
regressor_cubic.fit(X_train_cubic, y_train)
xx_cubic = cubic_featurizer.transform(X_test)
cubic_prediction = regressor_cubic.predict(xx_cubic)
plt.plot(cubic_prediction, 'g')

# Degree 7 featurizer
seventh_featurizer = PolynomialFeatures(degree=7)
# Toy data: one feature per sample, one target per sample
x_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]
x_test = [[6], [8], [11], [16]]
y_test = [[8], [12], [15], [18]]

# Straight-line fit, drawn over 100 evenly spaced points in [0, 26]
regressor = LinearRegression()
regressor.fit(x_train, y_train)
xx = np.linspace(0, 26, 100)
xx_column = xx.reshape(xx.shape[0], 1)
yy = regressor.predict(xx_column)
plt.plot(xx, yy)

# Quadratic fit: the featurizer is fitted on the training data only
quadratic_featurizer = PolynomialFeatures(degree=2)
x_train_quadratic = quadratic_featurizer.fit_transform(x_train)
x_test_quadratic = quadratic_featurizer.transform(x_test)
regressor_quadratic = LinearRegression()
regressor_quadratic.fit(x_train_quadratic, y_train)
xx_quadratic = quadratic_featurizer.transform(xx_column)

# Show the raw and expanded feature matrices, then both test scores
for shown in (x_train, x_train_quadratic, x_test, x_test_quadratic):
    print(shown)
print('Simple linear regression r-squared', regressor.score(x_test, y_test))
print('Quadratic regression r-squared', regressor_quadratic.score(x_test_quadratic, y_test))
# Batch gradient descent: apply the full-gradient update `iter` times
for iteration in range(iter):
    residuals = X_b.dot(theta) - y
    gradients = 2/m * X_b.T.dot(residuals)
    theta = theta - eta * gradients
print("BatchGD Thetas:", theta)

# A linear model doesn't seem appropriate. Try Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 expansion without the constant column
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X)
print('Feature (before):', X[0], ', features (after):', X_poly[0])

# fit() returns the estimator itself, so construction and fitting can be chained
lin_reg = linear_model.LinearRegression().fit(X_poly, y)
print('lin_reg Thetas:', lin_reg.intercept_, lin_reg.coef_)

# Plot our new quadratic model over 100 points in [0, 31]
X_new = np.linspace(0, 31, 100).reshape(100, 1)
X_new_poly = poly_features.transform(X_new)
y_new = lin_reg.predict(X_new_poly)
plt.plot(X_new, y_new, "m-", linewidth=2, label="PolynomialSGD")

# Now make predictions with each model and compare
pred = [[15]]

from sklearn.linear_model import SGDRegressor
# SGD with an l2 penalty, i.e. the stochastic counterpart of Ridge
sgd_reg = SGDRegressor(penalty="l2", random_state=42, max_iter=50, tol=1e-3)
sgd_reg.fit(X, y)  # original author left a commented-out `.ravel()` on y here
y_new = sgd_reg.predict(X_new)
plt.plot(X_new, y_new, "b-", linewidth=2, label="RidgeSGD")
print("sgd:", sgd_reg.predict(pred))

from sklearn.linear_model import Ridge
# try making alpha very big or very small and compare with Ridge equation
# Feature columns: all but the last column of train / all but the first of test
features = train.columns[:-1]
test_features = test.columns[1:]

for ind, (train_index, test_index) in enumerate(folds):
    print()
    print('Fold:', ind)
    # Positional split of the training frame for this fold
    train_train, train_test = train.iloc[train_index], train.iloc[test_index]
    print('train shape:', train_train.shape)
    print('test shape:', train_test.shape)

    # Polynomial expansion fitted on the fold's training part only
    poly = PolynomialFeatures(include_bias=False)
    train_train_poly = poly.fit_transform(train_train[features])
    train_test_poly = poly.transform(train_test[features])

    # Reduce the expanded features to two principal components
    pca = PCA(n_components=2)
    x_train_train_poly = pca.fit_transform(train_train_poly)
    x_train_test_poly = pca.transform(train_test_poly)

    # Add both components to each part of the fold, always at column position 1
    for fold_frame, components in ((train_train, x_train_train_poly),
                                   (train_test, x_train_test_poly)):
        fold_frame.insert(1, 'pca_poly1', components[:, 0])
        fold_frame.insert(1, 'pca_poly2', components[:, 1])

    # NOTE(review): this is recomputed from `train`, which did not receive the
    # pca_poly columns inserted above - confirm whether that is intended.
    features = train.columns[:-1]

    dtrain_train = xgb.DMatrix(train_train[features], train_train.target.values, silent=True)
fig2 = plt.figure()
ax2 = fig2.add_subplot(1, 1, 1)
# 21 evenly spaced x values in [0, 100]; y is columns_pre_sum scaled by 1e-5
x = np.linspace(0, 100, 21).reshape((21, 1))
y = np.array(columns_pre_sum).reshape((21, 1)) / 100000
ax2.plot(x, y, 'ko-', label="line chart")

# create cubic equation
featurizer = PolynomialFeatures(degree=3)
x_featurizer = featurizer.fit_transform(x)
regressor_featurizer = linear_model.LinearRegression()
regressor_featurizer.fit(x_featurizer, y)

# Fitting curve, evaluated on 1000 points in [0, 100]
xx = np.linspace(0, 100, 1000)
xx_featurizer = featurizer.transform(xx.reshape((1000, 1)))
yy_featurizer = regressor_featurizer.predict(xx_featurizer)
ax2.plot(xx, yy_featurizer, "g-", label="Curve fitting")
ax2.legend(loc=1)
ax2.set_xlabel("The ratio of self-driving cars (%)")
ax2.set_ylabel("Daily traffic flow in country of KING$(1e5)$")
fig2.savefig(("fitting.png"), dpi=200)

# print() calls replace the Python 2 print statements (SyntaxError on
# Python 3); each passes one %-formatted string, so the output is unchanged.
R1_featurizer = regressor_featurizer.score(x_featurizer, y)
print("R^2: %f " % R1_featurizer)
print("Polynomial coefficients: %s" % regressor_featurizer.coef_)
print("Polynomials-intercept: %s" % regressor_featurizer.intercept_)

# calculate the ratio of unmanned vehicles when the vehicle is the largest:
# map every fitted y value back to the x value that produced it
yy = list(yy_featurizer.reshape((1, 1000))[0])
xx_yy_fea_dict = dict(zip(yy, list(xx)))
def test():
    """Fit polynomial regressions of close change on volume change for one stock.

    Loads the data of stock '000002', derives the relative volume change and
    the percentage close change, filters the rows, and pairs each volume
    change with the close change of the following row.  Regressions of
    degree 1, 2, 3 and 7 are fitted on the first part of the data, plotted
    over the second part, and their scores on that second part are printed.
    """
    df_all = base.getOneStockData('000002')
    df_all['volume_diff'] = df_all.volume.pct_change()
    # Rescale negative changes d to d / (1 + d)
    for index, row in df_all[df_all.volume_diff < 0].iterrows():
        df_all.loc[index, 'volume_diff'] = row['volume_diff'] / (1 + row['volume_diff'])
    df_all['close_diff'] = df_all.close.pct_change() * 100
    df_all = df_all.dropna(subset=['volume_diff', 'close_diff'])
    # Keep close changes strictly inside (-11, 11) and volume changes above 1 in magnitude
    df_all = df_all[df_all.close_diff < 11]
    df_all = df_all[df_all.close_diff > -11]
    df_all = df_all[abs(df_all.volume_diff) > 1]

    dfx = df_all[['volume_diff']]
    dfy = df_all[['close_diff']]
    print(df_all.shape)

    # Floor division keeps `count` an int on Python 3 as well; it is used as
    # a slice bound below.
    count = dfx.shape[0] // 2 - 3
    # Targets are shifted by one row relative to the features
    X_train = dfx[:count]
    y_train = dfy[1:count + 1]
    X_test = dfx[count:count + count]
    y_test = dfy[count + 1:count + count + 1]

    runplt(X_train, y_train, X_test, y_test)
    plt.plot(X_train, y_train, 'k.')

    # Plain linear regression
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    yy = regressor.predict(X_test)
    # NOTE(review): this draws the test predictions against y_train, while the
    # other curves use X_test on the x axis - confirm this is intended.
    plt.plot(y_train, yy, 'y-')

    # Polynomial regressions: featurizer fitted on the training data, curve
    # drawn over the test data.
    polynomial_fits = []
    for degree, line_style in ((2, 'r-'), (3, 'g'), (7, 'b')):
        featurizer = PolynomialFeatures(degree=degree)
        X_train_poly = featurizer.fit_transform(X_train)
        X_test_poly = featurizer.transform(X_test)
        model = LinearRegression()
        model.fit(X_train_poly, y_train)
        plt.plot(X_test, model.predict(X_test_poly), line_style)
        polynomial_fits.append((degree, model, X_test_poly))

    plt.plot(X_test, y_test, 'm+')
    plt.show()

    print('1 r-liner', regressor.score(X_test, y_test))
    for degree, model, X_test_poly in polynomial_fits:
        print('{0} r-squared'.format(degree), model.score(X_test_poly, y_test))
def model_it(mode, training_fraction, polynomial_degree, interactions_only):
    """Train one regression per traffic metric on lagged, polynomially expanded features.

    Rows of ``integrated_data`` whose ``type`` equals ``mode`` are sorted by
    date and hour, enriched with rolling sums, lagged copies of the six
    target metrics, cumulative sums and four shifted sine waves of the hour.
    Rows with missing values are dropped; the first ``training_fraction`` of
    what remains is used for training.  For every target a model is fitted
    (via ``fit_formula``) on the expanded features that do not mention any of
    the other targets, and predictions for all rows are stored in
    ``predicted_<target>`` columns.

    :param mode: value of ``integrated_data.type`` to model
    :param training_fraction: fraction of the usable rows used for training
    :param polynomial_degree: ``degree`` passed to ``PolynomialFeatures``
    :param interactions_only: ``interaction_only`` passed to ``PolynomialFeatures``
    :return: list ``[model_data, (model, frame), ...]`` with the pairs ordered
        PL_QualStart, PL_Submit, registrations, sessions, SLR_QualStart,
        SLR_Submit; each frame is the feature frame the model predicted from
    """
    target_vars = ['sessions', 'registrations', 'PL_QualStart', 'PL_Submit', 'SLR_QualStart', 'SLR_Submit']

    base_set = integrated_data[integrated_data.type == mode].sort_values(by=['date', 'hits_hour'], ascending=True)
    base_set['row_num'] = pd.Series(range(0, base_set.shape[0]), index=base_set.index)
    # Rolling sums over 168 and 2*168 rows
    base_set['cumsum_7_tv_dur'] = pd.Series(base_set['tv_duration_secs'].rolling(window=168, center=False).sum())
    base_set['cumsum_7_radio_dur'] = pd.Series(base_set['radio_duration_secs'].rolling(window=168, center=False).sum())
    base_set['cumsum_14_tv_dur'] = pd.Series(base_set['tv_duration_secs'].rolling(window=2*168, center=False).sum())
    base_set['cumsum_14_radio_dur'] = pd.Series(base_set['radio_duration_secs'].rolling(window=2*168, center=False).sum())

    name_list = base_set.columns.values.tolist()
    lag_names = ['sessions_lag', 'registrations_lag', 'PL_QualStart_lag', 'PL_Submit_lag', 'SLR_QualStart_lag',
                 'SLR_Submit_lag']
    # range() is not a list on Python 3, so build the list explicitly
    lags = list(range(1, 7)) + [24, 48, 72, 96, 120, 144, 168]
    names = [t + "_" + str(l) for t in lag_names for l in lags]
    name_list += names

    # One shifted copy of every target per lag, appended in the same order as `names`
    new_cols = pd.DataFrame()
    for var in target_vars:
        for lag in lags:
            new_data = base_set[var].shift(lag)
            new_cols = pd.concat([new_cols, new_data], axis=1, )
    enhanced_set = pd.concat([base_set, new_cols], axis=1)
    enhanced_set.columns = name_list

    enhanced_set['tv_dur_cumsum'] = enhanced_set['tv_duration_secs'].cumsum()
    enhanced_set['radio_dur_cumsum'] = enhanced_set['radio_duration_secs'].cumsum()

    # Create some sine wave bounded from 0 to 1 for hourly predictions since we know traffic is at a low point at 12 AM
    for position, shift in enumerate((0, 3, 6, 9), start=1):
        wave = [((np.sin(((int(x) + shift) / 12.0) * np.pi - np.pi / 2) + 1) / 2) for x in enhanced_set['hits_hour']]
        enhanced_set['sines{0}'.format(position)] = pd.Series(data=wave, index=enhanced_set.index)

    model_data = enhanced_set.dropna()
    training_limit = int(math.ceil(model_data.shape[0] * training_fraction))

    if SinusoidModel:
        model_cols = names + ['sines1', 'sines2', 'sines3', 'sines4', 'tv_dur_cumsum', 'radio_dur_cumsum',
                              'cumsum_7_tv_dur', 'cumsum_14_tv_dur', 'cumsum_7_radio_dur',
                              'cumsum_14_radio_dur'] + shows
    else:
        model_cols = names + shows

    explanatory_vars = model_data[model_cols]
    explained_vars = model_data[target_vars]
    training_xvar = explanatory_vars[0:training_limit]
    training_yvar = explained_vars[0:training_limit]

    poly = PolynomialFeatures(degree=polynomial_degree, interaction_only=interactions_only, include_bias=False)
    transformed_xvar = poly.fit_transform(training_xvar)
    # Label every output feature as "col^power" terms joined by 'x', skipping
    # zero powers.  Training and full frames share their columns, so one list
    # serves both.
    target_feature_names = [
        'x'.join('{}^{}'.format(column, power)
                 for column, power in zip(explanatory_vars.columns, powers) if power != 0)
        for powers in poly.powers_
    ]
    transformed_xvar = pd.DataFrame(transformed_xvar, columns=target_feature_names)
    transformed_explanatory = pd.DataFrame(poly.transform(explanatory_vars), columns=target_feature_names)

    print("data formed, training model using {0} observations and {1} features".format(training_xvar.shape[0],
                                                                                       training_xvar.shape[1]))

    ############################
    #
    # Training
    #
    ############################

    # For each target keep only the features that mention none of the other targets
    feature_cols = {}
    fitted = {}
    for target in target_vars:
        others = re.compile('|'.join(t for t in target_vars if t != target))
        feature_cols[target] = [col for col in transformed_xvar.columns.values if not others.search(col)]
        fitted[target] = fit_formula(x_vars=transformed_xvar[feature_cols[target]], y=training_yvar[target])

    print("{0} traffic ARIMA models trained".format(mode))

    # Predict for every usable row; this order fixes the order of the new columns
    prediction_frames = {}
    for target in ['sessions', 'PL_QualStart', 'PL_Submit', 'SLR_QualStart', 'SLR_Submit', 'registrations']:
        prediction_frames[target] = transformed_explanatory[feature_cols[target]]
        model_data.loc[:, 'predicted_' + target] = pd.Series(
            data=fitted[target].predict(prediction_frames[target]), index=model_data.index)

    # Model statistics
    model_data.loc[:, 'error_ARIMA'] = model_data['predicted_sessions'] - model_data['sessions']

    print("{0} Data metrics".format(mode))
    for target in target_vars:
        print(metrics.r2_score(y_true=model_data[target], y_pred=model_data['predicted_' + target]))

    return [model_data,
            (fitted['PL_QualStart'], prediction_frames['PL_QualStart']),
            (fitted['PL_Submit'], prediction_frames['PL_Submit']),
            (fitted['registrations'], prediction_frames['registrations']),
            (fitted['sessions'], prediction_frames['sessions']),
            (fitted['SLR_QualStart'], prediction_frames['SLR_QualStart']),
            (fitted['SLR_Submit'], prediction_frames['SLR_Submit'])
            ]
def _impute_age(df, features, poly, pca, lr):
    """Fill ``Age_`` of rows with ``HasAge == 0`` with the model prediction, limited to [0, 80]."""
    expanded = pca.transform(poly.transform(df[features]))
    predicted = pd.Series(lr.predict(expanded), index=df.index).clip(lower=0, upper=80)
    missing = df['HasAge'] == 0
    df.loc[missing, 'Age_'] = predicted[missing]


def _add_age_fare_features(df):
    """Add the IsChild, Fare_b, Age_b and AgeCat columns derived from ``Age_`` and ``Fare_``."""
    df['IsChild'] = df['Age_'].map(lambda x: 1 if x < 16 else 0)
    df['Fare_b'] = np.digitize(df['Fare_'], [0, 10, 20, 30, 40])
    df['Age_b'] = np.digitize(df['Age_'], [0, 5, 10, 15, 20, 25, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70])
    # Start from an object column so the string labels below are not written
    # into a float column (an incompatible-dtype upcast in pandas).
    df['AgeCat'] = df['Age_'].astype(object)
    df.loc[(df.Age_ <= 14), 'AgeCat'] = 'child'
    df.loc[(df.Age_ > 60), 'AgeCat'] = 'aged'
    df.loc[(df.Age_ > 14) & (df.Age_ <= 30), 'AgeCat'] = 'adult'
    df.loc[(df.Age_ > 30) & (df.Age_ <= 60), 'AgeCat'] = 'senior'


def feature_engineering(df_train, df_test):
    """Impute missing ages and derive age/fare features for train and test data.

    Both frames first go through ``feature_engineering_step1``.  A linear
    regression on PCA-reduced degree-6 polynomial features is fitted on every
    row (train and test) that has an age, and used to fill ``Age_`` where
    ``HasAge`` is 0.  The derived columns are then added and both frames are
    dummy-encoded.

    :param df_train: raw training frame
    :param df_test: raw test frame
    :return: tuple ``(train, test)`` of dummy-encoded frames
    """
    df_d_train = feature_engineering_step1(df_train)
    df_d_test = feature_engineering_step1(df_test)

    # Age model: fitted on the rows of both frames that do have an age
    df_d_HasAge = pd.concat([df_d_train[df_d_train['HasAge'] == 1],
                             df_d_test[df_d_test['HasAge'] == 1]])
    features_age = ['Sex_', 'Sex_female', 'Sex_male', 'Title_Age_s', 'Cabin_s', 'Embarked__C', 'Embarked__Q',
                    'Embarked__S', 'SibSp_', 'Parch_', 'Fare_', 'Pclass']
    X_train = df_d_HasAge[features_age]
    y_train = df_d_HasAge['Age_']

    pca = PCA(n_components=50)
    poly = PolynomialFeatures(degree=6)
    lr = LinearRegression(n_jobs=-1)
    X_train_poly = pca.fit_transform(poly.fit_transform(X_train))
    lr.fit(X_train_poly, y_train)

    # Same treatment for both frames
    for frame in (df_d_train, df_d_test):
        _impute_age(frame, features_age, poly, pca, lr)
        _add_age_fare_features(frame)

    return pd.get_dummies(df_d_train), pd.get_dummies(df_d_test)
# Part 4 - Cross validation - Approach 2
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Older scikit-learn releases only ship the cross_validation module
    from sklearn.cross_validation import cross_val_score

train_valid_shuffled = pd.read_csv('data/wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict)
test = pd.read_csv('data/wk3_kc_house_test_data.csv', dtype=dtype_dict)

# 13 penalties, logarithmically spaced between 1e3 and 1e9
l2_set = np.logspace(3, 9, num=13)

# Degree-15 expansion of the single sqft_living feature.  Series.reshape no
# longer exists in pandas, so reshape the underlying array instead.
poly15 = PolynomialFeatures(degree=15)
X_train = poly15.fit_transform(train_valid_shuffled['sqft_living'].values.reshape(-1, 1))
X_test = poly15.transform(test['sqft_living'].values.reshape(-1, 1))
y_train = train_valid_shuffled['price']

# NOTE(review): Ridge's `normalize` argument has been removed from recent
# scikit-learn releases - confirm the targeted version before upgrading.
for l2 in l2_set:
    model = linear_model.Ridge(alpha=l2, normalize=True)
    scores = cross_val_score(model, X_train, y_train, cv=10)
    print("Using L2 of ", l2, "| Mean score: ", scores.mean())
# -0.000600028584951

# Train on the full training data, then evaluate on the test set:
model = linear_model.Ridge(alpha=3.16227766e+03, normalize=True)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Residual sum of squares on the test set
sum((y_pred - test['price']) ** 2)
# 284682323929148
from sklearn.preprocessing import PolynomialFeatures

X_train = [[6], [8], [10], [14], [18]]
X_test = [[6], [8], [11], [16]]

# Degree-2 expansion: fitted on the training data, then applied to both sets
featurizer = PolynomialFeatures(degree=2)
X_train = featurizer.fit_transform(X_train)
X_test = featurizer.transform(X_test)

# print() calls instead of Python 2 print statements (SyntaxError on Python 3)
print(X_train)
print(X_test)
# raw_input only exists on Python 2; fall back to input on Python 3
try:
    read_line = raw_input
except NameError:
    read_line = input

# First line: number of features m and number of training rows n
m, n = (int(token) for token in read_line().split())

# Each training row holds m feature values followed by the target value
X = []
Y = []
for _ in range(n):
    data = [float(token) for token in read_line().split()]
    X.append(data[:m])
    Y.append(data[m])

# Linear regression on degree-3 polynomial features.
# Plain ndarrays are used instead of np.matrix, which recent scikit-learn
# releases no longer accept as input.
poly = PolynomialFeatures(degree=3)
X = np.array(X)
X = poly.fit_transform(X)
clf = linear_model.LinearRegression()
clf.fit(X, Y)

# Normal-equation alternative kept from the original author:
#   Xt = np.matrix.transpose(X)
#   Fin = (linalg.inv(Xt.dot(X)).dot(Xt)).dot(Y)

# Next line: number of query rows, each holding the feature values only
n = int(read_line())
for _ in range(n):
    data = np.array([[float(token) for token in read_line().split()]])
    data = poly.transform(data)
    print(clf.predict(data)[0])
plt.close('all')
# Figure 1: the x / y currently in scope, coloured by label
plt.figure(1)
plt.scatter(x[:, 0], x[:, 1], c=y)

# Figure 2: the two-moons data set
x, y = make_moons()
plt.figure(2)
plt.scatter(x[:, 0], x[:, 1], c=y)
# plt.show()

from sklearn.preprocessing import PolynomialFeatures

# Data Preprocessing routines
# A plain ndarray is used instead of np.matrix, which recent scikit-learn
# releases no longer accept as input; shape and printed form are the same.
x = np.asarray([[1, 2], [2, 4]])
poly = PolynomialFeatures(degree=2)
poly.fit(x)
x_poly = poly.transform(x)

# print() calls instead of Python 2 print statements (SyntaxError on Python 3)
print("Original x variable shape", x.shape)
print(x)
print()
print("Transformed x variables", x_poly.shape)
print(x_poly)

# alternatively
x_poly = poly.fit_transform(x)

from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

data = load_iris()
x = data['data']
y = data['target']
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import PolynomialFeatures

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(
    tpot_data.index,
    stratify=tpot_data['class'].values,
    train_size=0.75,
    test_size=0.25,
)

result1 = tpot_data.copy()

# Use Scikit-learn's PolynomialFeatures to construct new features from the existing feature set
training_features = result1.loc[training_indices].drop('class', axis=1)

if len(training_features.columns.values) > 0 and len(training_features.columns.values) <= 700:
    # The feature constructor must be fit on only the training data
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly.fit(training_features.values.astype(np.float64))
    constructed_features = poly.transform(result1.drop('class', axis=1).values.astype(np.float64))

    # Capture the labels and the row index BEFORE result1 is replaced: the
    # rebuilt frame has neither a 'class' column nor the original index, and
    # the .loc[training_indices] lookups below rely on that index.
    class_values = result1['class'].values
    row_index = result1.index
    result1 = pd.DataFrame(data=constructed_features, index=row_index)
    result1['class'] = class_values
else:
    result1 = result1.copy()

result2 = result1.copy()

# Perform classification with an Ada Boost classifier
adab2 = AdaBoostClassifier(learning_rate=0.15, n_estimators=500, random_state=42)
adab2.fit(result2.loc[training_indices].drop('class', axis=1).values,
          result2.loc[training_indices, 'class'].values)
result2['adab2-classification'] = adab2.predict(result2.drop('class', axis=1).values)
# Pull the labels out first, then strip the non-feature columns.
labels = train.target.values
train = train.drop(["id", "target"], axis=1)
test = test.drop("id", axis=1)

# Re-weight the raw counts as TF-IDF; the weighting is fitted on train only.
tfidf = feature_extraction.text.TfidfTransformer()
train = tfidf.fit_transform(train).toarray()
test = tfidf.transform(test).toarray()

# Expand with polynomial features (PolynomialFeatures defaults).
poly = PolynomialFeatures()
train = poly.fit_transform(train)
test = poly.transform(test)
#train = np.hstack((train, poly_train))
#test = np.hstack((test, poly_test))

# Encode the labels.
lbl_enc = LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# Hold-out split for evaluation.
x_train, x_test, y_train, y_test = train_test_split(train, labels)

# Train a DBN classifier; the input layer width follows the expanded features.
layer_sizes = [train.shape[1], 8000, 9]
clf = DBN(
    layer_sizes,
    learn_rates=0.3,
    learn_rate_decays=0.9,
    epochs=50,
    verbose=1,
)  # l2_costs = 0.0001,
clf.fit(x_train, y_train)
class Features:
    """Per-row feature extraction helpers built around a training DataFrame.

    Categorical columns are label-encoded to integers and then one-hot
    encoded (for example "Embarked C" --> 1 --> [0, 1, 0]).

    NOTE(review): the methods read ``self.train`` and ``self.gender_func``,
    neither of which is defined in this part of the class; they are presumably
    declared further down the file.
    """

    # Columns treated as categorical.
    categories = [
        "Pclass",
        "Embarked"
    ]

    def __init__(self):
        self._train = None
        self._test = None
        self.scaler = StandardScaler()
        # Cache of fitted LabelEncoder objects, keyed by column name.
        self._labels = {}
        self._raw_features = {}
        # OneHotEncoder turns the label-encoded integers into binary arrays.
        self.enc = OneHotEncoder(sparse=False)
        self.poly = PolynomialFeatures(2)
        self._is_scaled = False
        self._is_encoded = False
        # Cache of per-column means, keyed by column name.
        self._means = {}

    def category_labels(self):
        """Return one "<column> <class>" label per class of every categorical column."""
        labels = []
        for category in self.categories:
            for j in self.labelencoder(category).classes_:
                labels.append("{:s} {}".format(category, j))
        return labels

    def feature_labels(self):
        """Return the names of the numeric features followed by the category labels."""
        return ["gender", "age", "siblings and spouses", "parents and children",
                "fare"] + self.category_labels()

    @property
    def feature_funcs(self):
        """List of callables, each mapping a row to a list of feature values."""
        return [self.gender_func,
                self.float_col("Age"),
                self.float_col("SibSp"),
                self.float_col("Parch"),
                self.float_col("Fare"),
                self.category_cols,
                self.poly_age_class
                ]

    def _encode(self):
        """Label-encode the categorical training columns in place and fit the one-hot encoder (once)."""
        if not self._is_encoded:
            self._is_encoded = True
            for cat in self.categories:
                self.train[cat] = self.labelencoder(cat).transform(self.train[cat].values)
            self.enc.fit(self.train[self.categories].values)

    def labelencoder(self, col):
        """Return the LabelEncoder for ``col``, fitting it on the training column on first use."""
        if col not in self._labels:
            self._labels[col] = LabelEncoder().fit(self.train[col].values)
        return self._labels[col]

    def label_col(self, row, col):
        """Return the integer label of ``row[col]``."""
        # LabelEncoder.transform expects a 1-D sequence; wrap the scalar and
        # unwrap the single result so callers still receive one value.
        return self.labelencoder(col).transform([row[col]])[0]

    def mean_col(self, col):
        """Return the (cached) mean of the truthy values of ``col`` over the training rows."""
        if col not in self._means:
            self._means[col] = numpy.mean(
                [float(j[col]) for (idx, j) in self.train.iterrows() if j[col]])
        return self._means[col]

    def float_col(self, col):
        """Return a function mapping a row to ``[float(row[col])]``.

        When the value cannot be parsed as a float (ValueError), the column
        mean from ``mean_col`` is used instead.
        """
        def func(row):
            try:
                return [float(row[col])]
            except ValueError:
                return [self.mean_col(col)]
        return func

    def poly_age_class(self, row):
        '''
        poly_age_class takes in the age and class of the passenger and
        creates a list of degree-2 polynomial features
        '''
        klass = self.label_col(row, 'Pclass')
        age = row['Age']
        # The transformer needs a 2-D (n_samples, n_features) input, and
        # self.poly is never fitted in the visible code. fit_transform on the
        # single row yields the same expansion a prior fit on two columns
        # would, because the fit only records the number of input features.
        return [x for x in self.poly.fit_transform([[klass, age]])[0]]

    def category_cols(self, row):
        """Return the one-hot encoding of the row's categorical columns as a flat list.

        Prints the error and the offending row, then exits the process with
        status 1, when a ValueError is raised while encoding.
        """
        self._encode()
        try:
            val = [self.label_col(row, cat) for cat in self.categories]
            return self.enc.transform([val]).tolist()[0]
        except ValueError as e:
            # Single pre-formatted strings keep the output identical to the
            # original Python 2 print statements on both Python 2 and 3.
            print('\n\n*** ERROR: caught value error {} ***\n\n'.format(e))
            print('row:\n{}'.format(row))
            sys.exit(1)
# Visualising the plain linear fit.
plt.scatter(features, labels, color='red')
plt.plot(features, lin_reg_1.predict(features), color='blue')
plt.title('Linear Regression')
plt.xlabel('Year')
plt.ylabel('Claims Paid')
plt.show()

# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures

poly_object = PolynomialFeatures(degree=5)
features_poly = poly_object.fit_transform(features)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(features_poly, labels)

print("Predicting result with Polynomial Regression")
# transform() needs a 2-D (n_samples, n_features) array, not a bare scalar.
print(lin_reg_2.predict(poly_object.transform([[1981]])))

# Visualising the Polynomial Regression results
plt.scatter(features, labels, color='red')
# Reuse the expansion computed above instead of refitting the transformer.
plt.plot(features, lin_reg_2.predict(features_poly), color='blue')
plt.title('Polynomial Regression')
plt.xlabel('Year')
plt.ylabel('Claims Paid')
plt.show()

""" https://towardsdatascience.com/polynomial-regression-bbe8b9d97491 """
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# One input feature as a column vector, and the matching targets.
x = np.array([5, 15, 25, 35, 45, 55]).reshape((-1, 1))
y = np.array([15, 11, 2, 8, 25, 32])

# Degree-2 expansion without the constant column; fit() returns the
# transformer itself, so the fit and the transform can be chained.
transformer = PolynomialFeatures(degree=2, include_bias=False)
x_ = transformer.fit(x).transform(x)
# x_ = PolynomialFeatures(degree=2, include_bias=False).fit_transform(x)
print(x_)

# Ordinary least squares on the expanded inputs.
model = LinearRegression()
model.fit(x_, y)

r_sq = model.score(x_, y)
intercept = model.intercept_
coefficients = model.coef_

y_pred = model.predict(x_)
print(y_pred)
# Straight-line fit on the x / y that are in scope before this section.
clf = LinearRegression()
clf.fit(x, y)

pre = clf.predict([[12]])[0]
print(u'预测直径为12英寸的价格: $%.2f' % pre)

# Evaluate the line at a few diameters for plotting.
x2 = [[0], [12], [15], [25]]
y2 = clf.predict(x2)

import matplotlib.pyplot as plt
import numpy as np

plt.figure()
plt.axis([0, 25, 0, 25])
plt.scatter(x, y, marker="s", s=20)
plt.plot(x2, y2, "g-")

# Bring in the polynomial feature expansion.
from sklearn.preprocessing import PolynomialFeatures

# 100 evenly spaced points from 0 to 25.
xx = np.linspace(0, 25, 100)

# Degree-2 expansion fitted on the training inputs and reused for x2.
quadratic_featurizer = PolynomialFeatures(degree=2)
x_train_quadratic = quadratic_featurizer.fit_transform(x)
X_test_quadratic = quadratic_featurizer.transform(x2)

regressor_quadratic = LinearRegression()
regressor_quadratic.fit(x_train_quadratic, y)

# Apply the fitted expansion to the dense grid to draw the curve.
xx_column = xx.reshape(xx.shape[0], 1)
xx_quadratic = quadratic_featurizer.transform(xx_column)
plt.plot(
    xx,
    regressor_quadratic.predict(xx_quadratic),
    label="$y = ax^2 + bx + c$",
    linewidth=2,
    color="r",
)
plt.legend()
plt.show()
def prep4WnD(data, label = None):
    """Build the continuous / categorical / crossed inputs from a broadcast table.

    The function merges the module-level lookup tables (``scale_time*``,
    ``volume_v*``, ``rate_v*``) into ``data``, label-encodes the columns in
    ``CATEGORICAL_COLUMNS`` and then branches on ``args.dataset``:

    * ``'train'``: one-hot encodes ``label``, makes an 80/20 split, fits a
      ``MinMaxScaler`` on the training continuous columns, saves it to disk,
      and returns ``(X, data4train, data4valid)``.
    * ``'test'``: loads the saved scaler and returns ``(X, data4test)``.

    Any other value of ``args.dataset`` falls through and returns ``None``.

    :param data: DataFrame of broadcasts. NOTE: the three time columns are
        cast to int on the caller's object before it is rebound locally.
    :param label: targets; only read in the ``'train'`` branch.

    NOTE(review): a fresh LabelEncoder is fitted on every call, so the integer
    codes used at train and test time are only the same if both tables contain
    the same set of values - confirm with the caller.
    """
    data[['방송월', '방송시간(시간)', '방송시간(분)']] = data[['방송월', '방송시간(시간)', '방송시간(분)']].astype(int)

    # Left-join the lookup tables; rows with no match are filled with 0.
    data = data.merge(scale_timeS, on = ['방송월', '방송일'], how = 'left').fillna(0)
    data = data.merge(scale_timeY, on = ['방송월', '방송일', '방송시간(시간)'], how = 'left').fillna(0)
    data = data.merge(scale_timeR, on = ['방송시간(시간)'], how = 'left').fillna(0)
    data = data.merge(volume_v3, on = '방송시간(시간)', how = 'left')
    data = data.merge(volume_v4, on = ['방송월', '방송일', '방송시간(시간)'], how = 'left')
    data = data.fillna(0)
    data = data.merge(rate_v1[['방송월', '방송일', '일별평균시청률']], on = ['방송월', '방송일'], how = 'left')

    # Per (month, day, hour) values: rate_v3/4/5 are selected by month and
    # day on the rows and by the hour value as the column label.
    data['일별시간별최대시청률'] = None
    data['일별시간별평균시청률'] = None
    data['일별시간별중간시청률'] = None
    for m, d, h in tqdm(data[['방송월', '방송일', '방송시간(시간)']].drop_duplicates().values):
        max_r = rate_v3.loc[(rate_v3['방송월'] == m) & (rate_v3['방송일'] == d), h].values[0]
        # Written to the second target column below (presumably the average
        # rating - the content of rate_v4 is not visible here).
        avg_r = rate_v4.loc[(rate_v4['방송월'] == m) & (rate_v4['방송일'] == d), h].values[0]
        med_r = rate_v5.loc[(rate_v5['방송월'] == m) & (rate_v5['방송일'] == d), h].values[0]
        data.loc[(data['방송월'] == m) & (data['방송일'] == d) & (data['방송시간(시간)'] == h),
                 ['일별시간별최대시청률', '일별시간별평균시청률', '일별시간별중간시청률']] = [max_r, avg_r, med_r]

    # Per (month, hour) values: rate_v6/7/8 are selected by hour on the rows
    # and by the month value as the column label.
    data['시간별월별최대시청률'] = None
    data['시간별월별평균시청률'] = None
    data['시간별월별중간시청률'] = None
    for m, h in tqdm(data[['방송월', '방송시간(시간)']].drop_duplicates().values):
        max_r = rate_v6.loc[(rate_v6['방송시간(시간)'] == h), m].values[0]
        avg_r = rate_v7.loc[(rate_v7['방송시간(시간)'] == h), m].values[0]
        med_r = rate_v8.loc[(rate_v8['방송시간(시간)'] == h), m].values[0]
        data.loc[(data['방송월'] == m) & (data['방송시간(시간)'] == h),
                 ['시간별월별최대시청률', '시간별월별평균시청률', '시간별월별중간시청률']] = [max_r, avg_r, med_r]

    data = data.merge(volume_v1, on = ['방송월', '방송시간(시간)'], how = 'left')
    data = data.merge(volume_v2, on = ['방송시간(시간)'], how = 'left')

    # Work on an explicit copy so the encoded columns are written to X itself
    # and not to a slice of `data` (chained-assignment hazard).
    X = data[COLUMNS].copy()
    for c in CATEGORICAL_COLUMNS:
        le = LabelEncoder()
        X[c] = le.fit_transform(X[c])

    if args.dataset == 'train':
        label = pd.get_dummies(label).values

        from sklearn.model_selection import train_test_split
        x_train, x_valid, y_train, y_valid = train_test_split(X, label, test_size = 0.2, random_state = 42)

        x_train_category = np.array(x_train[CATEGORICAL_COLUMNS])
        x_valid_category = np.array(x_valid[CATEGORICAL_COLUMNS])
        x_train_continue = np.array(x_train[CONTINUOUS_COLUMNS], dtype = 'float64')
        x_valid_continue = np.array(x_valid[CONTINUOUS_COLUMNS], dtype = 'float64')

        # The scaler is fitted on the training rows only.
        scaler = MinMaxScaler()
        x_train_continue = scaler.fit_transform(x_train_continue)
        x_valid_continue = scaler.transform(x_valid_continue)

        # Pairwise interaction terms of the categorical codes.
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        x_train_category_poly = poly.fit_transform(x_train_category)
        x_valid_category_poly = poly.transform(x_valid_category)

        # Persist the fitted scaler for the 'test' branch.
        joblib.dump(scaler, os.path.join('..', 'data', '04_임시데이터', 'scaler4rec.pkl'))

        data4train = (x_train_continue, x_train_category, x_train_category_poly, y_train)
        data4valid = (x_valid_continue, x_valid_category, x_valid_category_poly, y_valid)
        return X, data4train, data4valid

    elif args.dataset == 'test':
        X_category = np.array(X[CATEGORICAL_COLUMNS])
        X_continue = np.array(X[CONTINUOUS_COLUMNS], dtype = 'float64')

        scaler = joblib.load(os.path.join('..', 'data', '04_임시데이터', 'scaler4rec.pkl'))
        # Apply the scaling learned at training time. Refitting here would
        # discard the loaded parameters and scale test data differently.
        X_continue = scaler.transform(X_continue)

        # The interaction expansion learns nothing from the data beyond the
        # number of columns, so fitting it again here is harmless.
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        X_category_poly = poly.fit_transform(X_category)

        data4test = (X_continue, X_category, X_category_poly)
        return X, data4test
import matplotlib.pyplot as plt
import numpy as np
# Both estimators are used below but were missing from this import section.
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Make the mglearn module importable from the parent directory.
import sys
sys.path.append("../")
import mglearn

X, y = mglearn.datasets.make_wave(n_samples=100)
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)

# include polynomials up to x ** 10:
# the default "include_bias=True" adds a feature that's constantly 1
poly = PolynomialFeatures(degree=10, include_bias=False)
poly.fit(X)
X_poly = poly.transform(X)

print("X_poly.shape: {}".format(X_poly.shape))
print("Entries of X:\n{}".format(X[:5]))
print("Entries of X_poly:\n{}".format(X_poly[:5]))

# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use whichever the installed version provides.
if hasattr(poly, "get_feature_names_out"):
    feature_names = poly.get_feature_names_out()
else:
    feature_names = poly.get_feature_names()
print("Polynomial feature names:\n{}".format(feature_names))

reg = LinearRegression().fit(X_poly, y)

# Evaluate the fitted model on the dense grid to draw the curve.
line_poly = poly.transform(line)
plt.plot(line, reg.predict(line_poly), label='polynomial linear regression')
plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression output")
plt.xlabel("Input feature")
plt.legend(loc="best")
plt.show()
# The target column must not be part of the expanded features.
poly_features = poly_features.drop(columns=['TARGET'])

# Fill missing values; the imputer is fitted on the training frame only.
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

from sklearn.preprocessing import PolynomialFeatures

# Degree-3 expansion, fitted on the training features.
poly_transformer = PolynomialFeatures(degree=3)
poly_transformer.fit(poly_features)

# Expand both the training and the test features.
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)
print('Polynomial Features shape: ', poly_features.shape)

# Names of the expanded columns, derived from the four input column names.
input_names = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
expanded_names = poly_transformer.get_feature_names(input_features=input_names)
expanded_names[:15]

poly_features = pd.DataFrame(poly_features, columns=expanded_names)

# Put the target back so it can be correlated with the new features.
poly_features['TARGET'] = poly_target

# Correlation of every column with the target, sorted ascending.
poly_corrs = poly_features.corr()['TARGET'].sort_values()

# Display most negative and most positive
print(f'PolynomialRegressor save not found. Training...') regr = LinearRegression() regr.fit(X_poly_train, y_train) pickle.dump(regr, open(PATH, 'wb')) #### fit hardcoded model ##### # degree=1 | MSE is 0.124031 # degree=2 | MSE is 0.118514 # degree=3 | MSE is 0.134984 | 5000 random features # X_poly_train = np.array(X_poly_train) # size = X_poly_train.shape[1] # idx = np.random.random_integers(0, size, 5000) # X_poly_train = X_poly_train[:, idx] # print(f'X_poly_train {X_poly_train.shape}') y_pred = regr.predict(poly_reg.transform(X_test)) # [:, idx] mse = mean_squared_error(y_test, y_pred) print(f'MSE for PolynomialRegressor is {mse:.12f}\n') def visualization(start, end, X_test=X_test, y_test=y_test): y_pred = regr.predict(poly_reg.transform(X_test[start:end])) y_test = np.array(y_test) # print(y_test[start:end]) print(y_test[start:end, 0]) plt.figure(figsize=(10, 10)) plt.plot(y_pred[start:end,0], y_pred[start:end,1], color='r', label='x,y of y_pred') plt.plot(y_test[start:end,0], y_test[start:end,1], color='k', label='x,y of X_test') plt.show() # print(f'y_test\n{np.array(y_test)[-100:]}') #### Visualization ####
# Feature matrix: one row per entry, one column per listed field of `race`.
feature_columns = ["馬番", "斤量", "単勝", "人気", "前着順", "前馬番", "前人気"]
X = np.array([race[column] for column in feature_columns]).T
Y = np.array(race["着順"])
Z = np.array(race["単勝オッズ"])

import sklearn.model_selection
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

result = 0

# Rows to predict, min-max scaled.
# NOTE(review): X is not scaled anywhere in this section while `today` is -
# confirm the training features were scaled the same way upstream.
today = pd.read_csv('kikka.csv', index_col=0)
mm = preprocessing.MinMaxScaler()  # create the scaler instance
today_seiki = mm.fit_transform(today)

# Random split; features, targets and odds are split together.
split = sklearn.model_selection.train_test_split(X, Y, Z)
X_train, X_test, Y_train, Y_test, Z_train, Z_test = split

# Degree-2 expansion fitted on the training rows only.
poly = PolynomialFeatures(degree=2)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)
today_poly = poly.transform(today_seiki)

ridge = Ridge()
ridge.fit(X_train_poly, Y_train)

print(f"X_train_poly.shape : {X_train_poly.shape}")
print(f'Score with polynomial features : {ridge.score(X_test_poly, Y_test):.3f}')
print('-'*50)
print(ridge.predict(today_poly))
def homework():
    """Run the grant-status logistic-regression exercise.

    Loads the course data set, builds several versions of the design matrix
    (zero / mean imputation, scaling, stratified split, polynomial numeric
    features) and defines one nested ``task_N`` function per exercise step
    plus two plotting examples. Only ``task_6`` is actually invoked; the other
    calls are left commented out. Results are passed to the ``write_answer_N``
    helpers, which are defined outside this function.
    """
    df = pd.read_csv(utils.PATH.COURSE_FILE(2, 'data.csv'))
    print(df.shape)
    #print(df.head())
    #print(df.info())

    X = df.drop('Grant.Status', axis=1)
    y = df['Grant.Status']

    numeric_cols = ['RFCD.Percentage.1', 'RFCD.Percentage.2', 'RFCD.Percentage.3',
                    'RFCD.Percentage.4', 'RFCD.Percentage.5',
                    'SEO.Percentage.1', 'SEO.Percentage.2', 'SEO.Percentage.3',
                    'SEO.Percentage.4', 'SEO.Percentage.5',
                    'Year.of.Birth.1', 'Number.of.Successful.Grant.1', 'Number.of.Unsuccessful.Grant.1']
    categorical_cols = list(set(X.columns.values.tolist()) - set(numeric_cols))

    # Two imputations of the numeric block: zeros and column means.
    X_real_zeros = X[numeric_cols].fillna(0.0)
    # Means are taken over the numeric columns only; calling mean() on the
    # whole frame would also visit the categorical (non-numeric) columns.
    X_real_means = X[numeric_cols].fillna(X[numeric_cols].mean())
    X_cat = X[categorical_cols].fillna('NA').applymap(str)

    # One-hot encode the categorical block.
    encoder = DictVectorizer(sparse=False)
    X_cat_oh = encoder.fit_transform(X_cat.T.to_dict().values())
    print(X_cat_oh.shape)

    # Identical test_size / random_state on every call, so the three blocks
    # are split on the same rows.
    X_train_real_zeros, X_test_real_zeros, \
        y_train, y_test = train_test_split(X_real_zeros, y, test_size=0.3, random_state=0)
    X_train_real_means, X_test_real_means, \
        y_train, y_test = train_test_split(X_real_means, y, test_size=0.3, random_state=0)
    X_train_cat_oh, X_test_cat_oh, \
        y_train, y_test = train_test_split(X_cat_oh, y, test_size=0.3, random_state=0)

    X_train_zeros = np.hstack([X_train_real_zeros.values, X_train_cat_oh])
    X_test_zeros = np.hstack([X_test_real_zeros.values, X_test_cat_oh])
    X_train_means = np.hstack([X_train_real_means.values, X_train_cat_oh])
    X_test_means = np.hstack([X_test_real_means.values, X_test_cat_oh])

    def task_1():
        # Compare zero imputation with mean imputation.
        alg = LogisticRegression()
        params_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        grid = GridSearchCV(alg, params_grid, cv=3, n_jobs=-1)

        grid.fit(X_train_zeros, y_train)
        print(grid.best_params_, grid.best_score_)
        y_pred = grid.predict_proba(X_test_zeros)[:, 1]
        auc_1 = roc_auc_score(y_test, y_pred)
        print('ROC AUC on zeroes:', auc_1)
        #plot_scores(grid)

        grid.fit(X_train_means, y_train)
        print(grid.best_params_, grid.best_score_)
        y_pred = grid.predict_proba(X_test_means)[:, 1]
        auc_2 = roc_auc_score(y_test, y_pred)
        # This score belongs to the mean-imputed data (was labelled "zeroes").
        print('ROC AUC on means:', auc_2)
        #plot_scores(grid)

        write_answer_1(auc_2, auc_1)
    #task_1()

    ##### Scaling #####
    scaler = StandardScaler()
    X_train_real_scaled = scaler.fit_transform(X_train_real_zeros)
    X_test_real_scaled = scaler.transform(X_test_real_zeros)

    #data_numeric = pd.DataFrame(X_train_real_scaled, columns=numeric_cols)
    #list_cols = ['Number.of.Successful.Grant.1', 'SEO.Percentage.2', 'Year.of.Birth.1']
    #scatter_matrix(data_numeric[list_cols], alpha=0.5, figsize=(10, 10))
    #plt.show()

    X_train_scaled = np.hstack([X_train_real_scaled, X_train_cat_oh])
    X_test_scaled = np.hstack([X_test_real_scaled, X_test_cat_oh])

    def task_2():
        # Same model as task_1, on the scaled zero-imputed data.
        alg = LogisticRegression()
        params_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        grid = GridSearchCV(alg, params_grid, cv=3, n_jobs=-1)

        grid.fit(X_train_scaled, y_train)
        print(grid.best_params_, grid.best_score_)
        y_pred = grid.predict_proba(X_test_scaled)[:, 1]
        auc = roc_auc_score(y_test, y_pred)
        print('ROC AUC on scaled zeroes:', auc)
        #plot_scores(grid)

        # Report the score computed above; `auc_2` does not exist in this
        # scope and raised NameError.
        write_answer_2(auc)
        return
    #task_2()

    def example():
        # Two-Gaussian toy problem: decision regions with and without class weights.
        np.random.seed(0)
        param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        cv = 3

        """Сэмплируем данные из первой гауссианы"""  # sample the first Gaussian
        data_0 = np.random.multivariate_normal([0,0], [[0.5,0],[0,0.5]], size=40)
        """И из второй"""  # and the second one
        data_1 = np.random.multivariate_normal([0,1], [[0.5,0],[0,0.5]], size=40)
        """На обучение берём 20 объектов из первого класса и 10 из второго"""  # train: 20 + 10
        example_data_train = np.vstack([data_0[:20,:], data_1[:10,:]])
        example_labels_train = np.concatenate([np.zeros((20)), np.ones((10))])
        """На тест - 20 из первого и 30 из второго"""  # test: 20 + 30
        example_data_test = np.vstack([data_0[20:,:], data_1[10:,:]])
        example_labels_test = np.concatenate([np.zeros((20)), np.ones((30))])
        """Задаём координатную сетку, на которой будем вычислять область классификации"""  # plotting grid
        xx, yy = np.meshgrid(np.arange(-3, 3, 0.02), np.arange(-3, 3, 0.02))
        """Обучаем регрессию без балансировки по классам"""  # fit without class balancing
        optimizer = GridSearchCV(LogisticRegression(), param_grid, cv=cv, n_jobs=-1)
        optimizer.fit(example_data_train, example_labels_train)
        """Строим предсказания регрессии для сетки"""  # predict over the grid
        Z = optimizer.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel2)
        plt.scatter(data_0[:,0], data_0[:,1], color='red')
        plt.scatter(data_1[:,0], data_1[:,1], color='blue')
        """Считаем AUC"""  # compute AUC
        auc_wo_class_weights = roc_auc_score(example_labels_test, optimizer.predict_proba(example_data_test)[:,1])
        plt.title('Without class weights')
        plt.show()
        print('AUC: %f'%auc_wo_class_weights)

        """Для второй регрессии в LogisticRegression передаём параметр class_weight='balanced'"""  # second fit: balanced
        optimizer = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid, cv=cv, n_jobs=-1)
        optimizer.fit(example_data_train, example_labels_train)
        Z = optimizer.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel2)
        plt.scatter(data_0[:,0], data_0[:,1], color='red')
        plt.scatter(data_1[:,0], data_1[:,1], color='blue')
        auc_w_class_weights = roc_auc_score(example_labels_test, optimizer.predict_proba(example_data_test)[:,1])
        plt.title('With class weights')
        plt.show()
        print('AUC: %f'%auc_w_class_weights)
    #example()

    def task_3():
        # Class weights versus manual oversampling of class 1.
        alg = LogisticRegression(class_weight='balanced')
        params_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        grid = GridSearchCV(alg, params_grid, cv=3, n_jobs=-1)

        grid.fit(X_train_scaled, y_train)
        print(grid.best_params_, grid.best_score_)
        y_pred = grid.predict_proba(X_test_scaled)[:, 1]
        auc_1 = roc_auc_score(y_test, y_pred)
        print('ROC AUC on scaled zeroes:', auc_1)
        #plot_scores(grid)

        ## Balanced
        np.random.seed(0)
        n0 = sum(y_train==0)
        n1 = sum(y_train==1)
        print(n0, n1)
        # Positions of the class-1 rows; n0 - n1 of them are re-drawn with
        # replacement and appended to the training set.
        y_less = np.nonzero(y_train)[0]
        indices_to_add = y_less[np.random.randint(0, len(y_less), n0 - n1)]
        X_train_to_add = X_train_scaled[indices_to_add, :]
        y_train_to_add = y_train.values[indices_to_add]
        X_train_balanced = np.vstack([X_train_scaled, X_train_to_add])
        y_train_balanced = np.hstack([y_train, y_train_to_add])

        alg = LogisticRegression()
        params_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        grid = GridSearchCV(alg, params_grid, cv=3, n_jobs=-1)

        grid.fit(X_train_balanced, y_train_balanced)
        print(grid.best_params_, grid.best_score_)
        y_pred = grid.predict_proba(X_test_scaled)[:, 1]
        auc_2 = roc_auc_score(y_test, y_pred)
        print('ROC AUC on scaled balanced zeroes:', auc_2)

        write_answer_3(auc_1, auc_2)
        return
    #task_3()

    # Re-split with stratification on y and rebuild the scaled matrices; the
    # tasks below use these rebound names.
    X_train_real_zeros, X_test_real_zeros, \
        y_train, y_test = train_test_split(X_real_zeros, y, test_size=0.3, random_state=0, stratify=y)
    X_train_cat_oh, X_test_cat_oh, \
        y_train, y_test = train_test_split(X_cat_oh, y, test_size=0.3, random_state=0, stratify=y)

    scaler = StandardScaler()
    X_train_real_scaled = scaler.fit_transform(X_train_real_zeros)
    X_test_real_scaled = scaler.transform(X_test_real_zeros)

    X_train_scaled = np.hstack([X_train_real_scaled, X_train_cat_oh])
    X_test_scaled = np.hstack([X_test_real_scaled, X_test_cat_oh])

    def task_4():
        # Balanced model on the stratified split.
        alg = LogisticRegression(class_weight='balanced')
        params_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        grid = GridSearchCV(alg, params_grid, cv=3, n_jobs=-1)

        grid.fit(X_train_scaled, y_train)
        print(grid.best_params_, grid.best_score_)
        y_pred = grid.predict_proba(X_test_scaled)[:, 1]
        auc = roc_auc_score(y_test, y_pred)
        print('ROC AUC on scaled y-stratified zeroes:', auc)

        write_answer_4(auc)
        return
    #task_4()

    def example_2():
        # Decision regions with degree-2 and degree-15 polynomial features.
        param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        cv = 3

        """Инициализируем класс, который выполняет преобразование"""  # create the transformer
        transform = PolynomialFeatures(2)

        """Сэмплируем данные из первой гауссианы"""  # sample the first Gaussian
        data_0 = np.random.multivariate_normal([0,0], [[0.5,0],[0,0.5]], size=40)
        """И из второй"""  # and the second one
        data_1 = np.random.multivariate_normal([0,1], [[0.5,0],[0,0.5]], size=40)
        """На обучение берём 20 объектов из первого класса и 10 из второго"""  # train: 20 + 10
        example_data_train = np.vstack([data_0[:20,:], data_1[:10,:]])
        example_labels_train = np.concatenate([np.zeros((20)), np.ones((10))])
        """На тест - 20 из первого и 30 из второго"""  # test: 20 + 30
        example_data_test = np.vstack([data_0[20:,:], data_1[10:,:]])

        """Обучаем преобразование на обучающей выборке, применяем его к тестовой"""  # fit on train, apply to test
        example_data_train_poly = transform.fit_transform(example_data_train)
        example_data_test_poly = transform.transform(example_data_test)
        example_labels_test = np.concatenate([np.zeros((20)), np.ones((30))])

        """Обращаем внимание на параметр fit_intercept=False"""  # note fit_intercept=False
        optimizer = GridSearchCV(LogisticRegression(class_weight='balanced', fit_intercept=False), param_grid, cv=cv, n_jobs=-1)
        optimizer.fit(example_data_train_poly, example_labels_train)

        """Задаём координатную сетку, на которой будем вычислять область классификации"""  # plotting grid
        xx, yy = np.meshgrid(np.arange(-3, 3, 0.02), np.arange(-3, 3, 0.02))

        Z = optimizer.predict(transform.transform(np.c_[xx.ravel(), yy.ravel()])).reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel2)
        plt.scatter(data_0[:,0], data_0[:,1], color='red')
        plt.scatter(data_1[:,0], data_1[:,1], color='blue')
        plt.title('With class weights')
        plt.show()

        transform = PolynomialFeatures(15)
        example_data_train_poly = transform.fit_transform(example_data_train)
        example_data_test_poly = transform.transform(example_data_test)
        optimizer = GridSearchCV(LogisticRegression(class_weight='balanced', fit_intercept=False), param_grid, cv=cv, n_jobs=-1)
        optimizer.fit(example_data_train_poly, example_labels_train)
        Z = optimizer.predict(transform.transform(np.c_[xx.ravel(), yy.ravel()])).reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel2)
        plt.scatter(data_0[:,0], data_0[:,1], color='red')
        plt.scatter(data_1[:,0], data_1[:,1], color='blue')
        plt.title('Corrected class weights')
        plt.show()
        return
    #example_2()

    # Degree-2 polynomial expansion of the numeric block, then scaling. The
    # expanded test matrix gets its own name instead of overwriting
    # X_test_real_zeros.
    poly = PolynomialFeatures(2)
    X_train_real_zeros_poly = poly.fit_transform(X_train_real_zeros)
    X_test_real_zeros_poly = poly.transform(X_test_real_zeros)

    scaler = StandardScaler()
    X_train_real_poly_scaled = scaler.fit_transform(X_train_real_zeros_poly)
    X_test_real_poly_scaled = scaler.transform(X_test_real_zeros_poly)

    X_train_poly_scaled = np.hstack([X_train_real_poly_scaled, X_train_cat_oh])
    X_test_poly_scaled = np.hstack([X_test_real_poly_scaled, X_test_cat_oh])

    def task_5():
        # Balanced model without intercept on the polynomial features.
        alg = LogisticRegression(class_weight='balanced', fit_intercept=False)
        params_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        grid = GridSearchCV(alg, params_grid, cv=3, n_jobs=-1)

        grid.fit(X_train_poly_scaled, y_train)
        print(grid.best_params_, grid.best_score_)
        y_pred = grid.predict_proba(X_test_poly_scaled)[:, 1]
        auc = roc_auc_score(y_test, y_pred)
        print('ROC AUC on scaled y-stratified zeroes:', auc)

        write_answer_5(auc)
        return
    #task_5()

    def task_6():
        # L1-regularised model; report which numeric coefficients are exactly zero.
        # The solver is named explicitly because the default solver of current
        # scikit-learn releases does not support the L1 penalty.
        alg = LogisticRegression(class_weight='balanced', penalty='l1', solver='liblinear')
        params_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        grid = GridSearchCV(alg, params_grid, cv=3, n_jobs=-1)

        grid.fit(X_train_scaled, y_train)
        print(grid.best_params_, grid.best_score_)
        alg = grid.best_estimator_
        # The numeric features occupy the leading columns of X_train_scaled.
        zero_ids = np.where(alg.coef_[0,:X_train_real_scaled.shape[1]] == 0)[0]

        write_answer_6(zero_ids)
        return
    task_6()

    return
linreg1 = LinearRegression().fit(polytrain1 , y_train) x_pred1 = poly1.transform(x_pred) y_pred1 = linreg1.predict(x_pred1) final_array = np.array(y_pred1, ndmin=2) ''' # 6 9 x_pred = np.linspace(0, 10, 100).reshape(-1, 1) final_array = np.empty(shape=(4, 100)) for index, value in enumerate([1, 3, 6, 9]): poly = PolynomialFeatures(degree=value) poly_train = poly.fit_transform(X_train.reshape(-1, 1)) linreg = LinearRegression().fit(poly_train, y_train) x_pred_poly = poly.transform(x_pred) y_pred = linreg.predict(x_pred_poly) final_array[index] = y_pred plt.figure() plt.scatter(X_train, y_train) plt.scatter(X_test, y_test) plt.plot(x_pred, final_array[0], lw=3) plt.plot(x_pred, final_array[1], lw=3) plt.plot(x_pred, final_array[2], lw=3) plt.plot(x_pred, final_array[3], lw=3) plt.legend(['a', 'b', 'c', 'd', 'e']) # ANSWER 2 from sklearn.metrics.regression import r2_score
def runplt():
    """Open a new figure with the shared title, axis labels, limits and grid; return ``plt``."""
    plt.figure()
    plt.title('匹萨价格与直径数据', fontproperties=font)
    plt.xlabel('直径(英寸)', fontproperties=font)
    plt.ylabel('价格(美元)', fontproperties=font)
    plt.axis([0, 25, 0, 25])
    plt.grid(True)
    return plt


plt = runplt()
plt.plot(X_train, y_train, 'k.')

# The dense grid reshaped to one column; both curves are drawn over it.
xx_column = xx.reshape(xx.shape[0], 1)

# Degree-2 fit.
quadratic_featurizer = PolynomialFeatures(degree=2)
X_train_quadratic = quadratic_featurizer.fit_transform(X_train)
X_test_quadratic = quadratic_featurizer.transform(X_test)
regressor_quadratic = LinearRegression()
regressor_quadratic.fit(X_train_quadratic, y_train)
xx_quadratic = quadratic_featurizer.transform(xx_column)
plt.plot(xx, regressor_quadratic.predict(xx_quadratic), 'r-')

# Degree-7 fit.
seventh_featurizer = PolynomialFeatures(degree=7)
X_train_seventh = seventh_featurizer.fit_transform(X_train)
X_test_seventh = seventh_featurizer.transform(X_test)
regressor_seventh = LinearRegression()
regressor_seventh.fit(X_train_seventh, y_train)
xx_seventh = seventh_featurizer.transform(xx_column)
plt.plot(xx, regressor_seventh.predict(xx_seventh))

plt.show()

# Held-out R-squared of each fit.
print('二次回归 r-squared', regressor_quadratic.score(X_test_quadratic, y_test))
print('七次回归 r-squared', regressor_seventh.score(X_test_seventh, y_test))
Xtest = scaler.transform(xtest.reshape(-1, 1))

# Degrees 1..20; results for degree d are stored at index d - 1.
degs = np.arange(1, 21, 1)
ndegs = np.max(degs)
mse_train = np.empty(ndegs)
mse_test = np.empty(ndegs)
ytest_pred_stored = np.empty(ndegs, dtype=np.ndarray)
ytrain_pred_stored = np.empty(ndegs, dtype=np.ndarray)

for deg in degs:
    # Expand both sets with the same transformer, fitted on the training inputs.
    poly_features = PolynomialFeatures(degree=deg, include_bias=False)
    Xtrain_poly = poly_features.fit_transform(Xtrain)
    Xtest_poly = poly_features.transform(Xtest)

    model = LinearRegression()
    model.fit(Xtrain_poly, ytrain)

    ytrain_pred = model.predict(Xtrain_poly)
    ytest_pred = model.predict(Xtest_poly)

    ytrain_pred_stored[deg - 1] = ytrain_pred
    ytest_pred_stored[deg - 1] = ytest_pred
    mse_train[deg - 1] = mse(ytrain_pred, ytrain)
    mse_test[deg - 1] = mse(ytest_pred, ytest)

# Plot MSE vs degree, restricted to degrees up to 15.
fig, ax = plt.subplots()
mask = degs <= 15
ax.plot(degs[mask], mse_test[mask], color='r', marker='x', label='test')
ax.plot(degs[mask], mse_train[mask], color='b', marker='s', label='train')
ax.legend(loc='upper right', shadow=True)
plt.xlabel('degree')
plt.ylabel('mse')
pml.savefig('polyfitVsDegree.pdf')
plt.show()
# Feature Scaling: statistics come from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.values)
X_val_scaled, X_test_scaled = (
    scaler.transform(X_val.values),
    scaler.transform(X_test.values),
)

# Model 2
lm_reg = Ridge(alpha=1)

# Feature Transform: degree-2 expansion of the unscaled features.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train.values)
X_val_poly, X_test_poly = (
    poly.transform(X_val.values),
    poly.transform(X_test.values),
)

# Model 3
lm_poly = LinearRegression()

############### 5.Choose Model ###############

# Each model is fitted on its own version of the training features and
# scored on the matching validation features.
lm.fit(X_train, y_train)
print(f'Linear Regression val R^2: {lm.score(X_val, y_val):.3f}')

lm_reg.fit(X_train_scaled, y_train)
print(f'Ridge Regression val R^2: {lm_reg.score(X_val_scaled, y_val):.3f}')

lm_poly.fit(X_train_poly, y_train)
# polynomial model (degree=2); the estimator has no intercept of its own
poly2 = PolynomialFeatures(degree=2)
X_poly2 = poly2.fit_transform(X)
poly2_fit = LinearRegression(fit_intercept=False)

# polynomial model (degree=10)
poly10 = PolynomialFeatures(degree=10)
X_poly10 = poly10.fit_transform(X)
poly10_fit = LinearRegression(fit_intercept=False)

# option 1: one loop for everything (faster)

# for plotting purposes: 100 points spanning the range of X, as one column
x_linspace = np.linspace(np.min(X), np.max(X), 100)
X_linspace = x_linspace.reshape(-1, 1)
X_linspace_poly2 = poly2.transform(X_linspace)
X_linspace_poly10 = poly10.transform(X_linspace)

# One score list per model.
scores_linreg_fit = []
scores_poly2_fit = []
scores_poly10_fit = []

kf = KFold(n_splits=5)
for train, test in kf.split(X):
    # Rows of this fold.
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]

    # subset training data
    X_lin_train = X[train]
    X_poly2_train = X_poly2[train]
    X_poly10_train = X_poly10[train]
y1 = np.array(temp_start_China).reshape(114, 1) clf.fit(x, y) clf1.fit(x1, y1) # 一次分析 year_set_China = year_set = list(range(2014, 2036)) # 将预测的年份 predict_temp = clf.predict([[i] for i in year_set]) # 按线性回归方程估计计算未来温度 predict_temp_China = clf1.predict([[i] for i in year_set_China ]) # 按线性回归方程估计计算中国未来温度 predict_temp_list = [i[0] for i in predict_temp] # 化为一维列表 predict_temp_China_list = [i[0] for i in predict_temp_China] # 化为一维列表 ploy = PolynomialFeatures(degree=2) # 设置为2次项,多项式预测 x_ploy = ploy.fit_transform(x) clf_ = LinearRegression() # 设置二次线性实例 clf_.fit(x_ploy, y) x_ployed = ploy.transform(x) y_predict = clf_.predict(x_ployed) predict_temp_two_list = [i[0] for i in y_predict] # 化为一维列表 x, x1, x_dimensional = [], [], [] # 空列表存温度 for i in range(0, 22): x.append( opts.LineItem(name=year_set[i], value=round(predict_temp_list[i], 2), itemstyle_opts=opts.ItemStyleOpts(color='purple'))) for i in range(0, 22): x1.append( opts.LineItem(name=year_set_China[i], value=round(predict_temp_China_list[i], 2), itemstyle_opts=opts.ItemStyleOpts(color='blue'))) for i in range(0, 22):
# One feature per sample, one target per sample.
x_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]
x_test = [[6], [8], [11], [16]]
y_test = [[8], [12], [15], [18]]

# Straight-line fit, drawn over 100 points between 0 and 26.
regressor = LinearRegression()
regressor.fit(x_train, y_train)
xx = np.linspace(0, 26, 100)
xx_column = xx.reshape(xx.shape[0], 1)
yy = regressor.predict(xx_column)
plt.plot(xx, yy)

# Degree-2 fit; the expansion is fitted on the training inputs only.
quadratic_featurizer = PolynomialFeatures(degree=2)
x_train_quadratic = quadratic_featurizer.fit_transform(x_train)
x_test_quadratic = quadratic_featurizer.transform(x_test)
regressor_quadratic = LinearRegression()
regressor_quadratic.fit(x_train_quadratic, y_train)
xx_quadratic = quadratic_featurizer.transform(xx_column)

# Raw and expanded inputs, in the same order as before.
for matrix in (x_train, x_train_quadratic, x_test, x_test_quadratic):
    print(matrix)

print('Simple linear regression r-squared', regressor.score(x_test, y_test))
print('Quadratic regression r-squared', regressor_quadratic.score(x_test_quadratic, y_test))
# 多元线性回归: model = LinearRegression() model.fit(X, y) X_test = [[8, 2], [9, 0], [11, 2], [16, 2], [12, 0]] y_test = [[11], [8.5], [15], [18], [11]] predictions = model.predict(X_test) for i, prediction in enumerate(predictions): print('Predicted: %s, Target: %s' % (prediction, y_test[i])) print('R-squared: %.2f' % model.score(X_test, y_test)) # 多元多项式回归 X_train = [[6, 2], [8, 1], [10, 0], [14, 2], [18, 0]] y_train = [[7], [9], [13], [17.5], [18]] quadratic_featurizer = PolynomialFeatures(degree=2) X_train_quadratic = quadratic_featurizer.fit_transform(X_train) regressor_quadratic = LinearRegression() # 训练 regressor_quadratic.fit(X_train_quadratic, y_train) xx_quadratic = quadratic_featurizer.transform([[6, 3]]) print("预测结果: {}".format(regressor_quadratic.predict(xx_quadratic))) def main(): pass if __name__ == '__main__': main()
X2 = [[0], [10], [14], [25]] model = LinearRegression() model.fit(X, y) print 'A 12" pizza should cost: $%.2f' % model.predict([12])[0] y2 = model.predict(X2) plt.scatter(X, y) # plt.plot(X2, y2, 'r-') import numpy as np poly = PolynomialFeatures(degree=9) X_p = poly.fit_transform(X) print len(X) xx = np.linspace(0, 26, 1000) regressor_p = LinearRegression() regressor_p.fit(X_p, y) print xx.shape xx_p = poly.transform(xx.reshape(xx.shape[0], 1)) plt.plot(xx, regressor_p.predict(xx_p), c='r') ################# Sample 3 ################# """ >>> import numpy as np >>> print 'Residual sum of squares: %.2f' % np.mean((model.predict(X) - y) ** 2) Residual sum of squares: 1.75 """ import numpy as np print 'Residual sum of squares: %.2f' % np.mean((model.predict(X) - y) ** 2) ################# Sample 4 ################# """
# Excerpt of a flat script: standardises the feature columns, splits into
# train/test, then fits a logistic regression on polynomial expansions of
# each degree in `deg` and prints scores and predictions.
# dfdata, FeatureCols, faildatacols, deg, C1, miter, idx, df and plot_graph
# are defined before this excerpt.
# NOTE(review): `idx` suggests this code sits inside an outer loop that is not
# visible here; the nesting below is reconstructed - confirm.
dftagdata = dfdata[FeatureCols]
dffaildata = dfdata[faildatacols]
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling - confirm this is acceptable.
scaler = StandardScaler().fit(dftagdata)
dftagdata = scaler.transform(dftagdata)
x_train, x_test, y_train, y_test = train_test_split(dftagdata, dffaildata, random_state=0)
y_train = np.array(y_train)
y_test = np.array(y_test)
for d in deg:
    print('Degree - ' + str(d))
    # expansion is fitted on the training split and reused for the test split
    poly = PolynomialFeatures(degree=d)
    x_train_poly = poly.fit_transform(x_train)
    x_test_poly = poly.transform(x_test)
    pln = LogisticRegression(C=C1, max_iter=miter).fit(x_train_poly, y_train)
    print(idx + ' Train Score - ' + str(pln.score(x_train_poly, y_train)) + ' Test Score - ' + str(pln.score(x_test_poly, y_test)))
    # predictions followed by the flattened ground truth, for train then test
    print(pln.predict(x_train_poly))
    print(y_train.reshape(1, -1)[0])
    print(pln.predict(x_test_poly))
    print(y_test.reshape(1, -1)[0])
plot_graph(df, tag=FeatureCols, version=['std'])
# Disabled plot_graph calls for other tag sets, kept as found:
#plot_graph(df3, tag = ['5PT1B2', '5PT3B2', '5TT1B2'],version=['max', 'min'])
#plot_graph(df3, tag = ['5PT1B2', '5PT3B2', '5PT2C1', '5PT3C1', '5PT2G1','5PT3G1', '5PT1H1', '5PT4H1', '5TT1B2', '5TT3B2', '5TT2C1', '5TT3C1','5TT2G1', '5TT3G1', '5TT1H1', '5TT4H1'],version=['max', 'min'])
#plot_graph(df3, tag = ['20PICP2Choke', '20PICP1Choke','20PT17FLDC', '20PT18FLDC', '20PT27FLDC', '20PT28FLDC', '20PT214FLLA', '20PT224FLRE', '20TT115FLT1', '20TT125FLTS', '20TT215T2FLL', '20TT225T2FLL', '20ZT114SSFL', '20ZT124SSFL', '20ZT214T2FL', '20ZT224T2FL', '21FQI10518NR', '21FT40518D', '21FT40518GVFR', '21HY10535OFL', '21HY40534OTSL', '21LIC10516SP', '21LIC10620CVH', '21LIC10620SPH', '21LIC40516SPTA', '21LT10515PVPSO', '21LT10516PVPSO', '21LT10618PVPSO2', '21LT10620PVPSO2', '21LT40515PVTA', '21LT40516PVTA', '21LY10516OPSO2', '21LY10616OPSO2', '21LY10620OSH2', '21LY11516OTT', '21LY40516OUT', '21PT10505PVPS', '21PT10605PVPS2', '21PT40505PVTA', '21TT10508PVPSO', '21TT10608PVPSO2', '21TT11616PVOTHO', '30FT19107PVSH2', '30FT19108PV', '30FT29108PV', '30FT69521PVFCP', '30LIC69516CVFCO', '30LIC69518CVFCP', '30LT69514PVFC', '30LT69515PVFC', '30LT69516PVFC', '30LT69518PVFC', '30LY69518OFCP', '30PDIC19104SPPHO', '30PDT19104PVSH2', '30PDT19104PVSHS2', '30PDT19104PVSHD2', '30PDY19104OSPH2', '30PT69503PVFC', '30PT69512PV', '30PY69503OFCO', '37PT62301PVCS' ],version=['max', 'min'])
#
if __name__ == "__main__":
    # Script entry point: train a model on pairwise-interaction features and
    # report its quality on the training and dev splits.
    # get_data, build_model, model_worth and view_model are defined elsewhere
    # in this file.
    x, y = get_data()

    # Divide the data into Train, dev and test
    # (70% train; the remaining 30% is split again 70/30 into dev and test)
    x_train, x_test_all, y_train, y_test_all = train_test_split(
        x, y, test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(
        x_test_all, y_test_all, test_size=0.3, random_state=9)

    # Prepare some polynomial features
    # (interaction_only=True keeps cross terms and drops pure powers);
    # the expansion is fitted on the training split only.
    poly_features = PolynomialFeatures(2, interaction_only=True)
    poly_features.fit(x_train)
    x_train_poly = poly_features.transform(x_train)
    x_dev_poly = poly_features.transform(x_dev)

    # Build model with polynomial features
    model_poly = build_model(x_train_poly, y_train)
    predicted_y = model_poly.predict(x_train_poly)
    # print(...) with one string argument prints the same text on Python 2
    # and Python 3, unlike the print statement it replaces.
    print("\n Model Performance in Training set (Polynomial features)\n")
    model_worth(y_train, predicted_y)

    # View model details
    view_model(model_poly)

    # Apply the model on dev set
    predicted_y = model_poly.predict(x_dev_poly)
    print("\n Model Performance in Dev set (Polynomial features)\n")
    model_worth(y_dev, predicted_y)
# Excerpt of a flat script: for every sales series, fit a cubic polynomial
# regression of sale on year and print R2 plus the prediction error for one
# known hold-out point. `list`, double11, Thanksgiving and Blackfriday are
# defined before this excerpt.
# NOTE(review): the name `list` shadows the builtin; it is defined outside this
# excerpt, so it is left as is.
for l in list:
    t = pd.DataFrame(l, columns=['year', 'sale'])
    X = t['year'].values.reshape(-1, 1)  # single feature as a column vector
    y = t['sale']
    pf = PolynomialFeatures(degree=3)
    # NOTE(review): this fit() is redundant, fit_transform() below refits.
    pf.fit(X)
    Xp = pf.fit_transform(X)
    lr = LinearRegression()
    lr.fit(Xp, y)
    # The branches pick the hold-out point by comparing the current series
    # with the named series.
    if l == double11:
        test = [[11, 2684]]
        tt = pd.DataFrame(test, columns=['year', 'sale'])
        Xt = tt['year'].values.reshape(-1, 1)
        pred = lr.predict(pf.transform(Xt))
        # printed R2 is computed on the training data (Xp, y)
        print('{}{}{}{}{}{}'.format('双十一R2:', lr.score(Xp, y), ' 预测 ', pred, ' 实际:2684 误差: ', (pred - 2684) / 2684))
    if l == Thanksgiving:
        test = [[10, 1572]]
        tt = pd.DataFrame(test, columns=['year', 'sale'])
        Xt = tt['year'].values.reshape(-1, 1)
        pred = lr.predict(pf.transform(Xt))
        print('{}{}{}{}{}{}'.format('感恩节R2:', lr.score(Xp, y), ' 预测 ', pred, ' 实际:1572 误差: ', (pred - 1572) / 1572))
    if l == Blackfriday:
        test = [[10, 2360]]
        tt = pd.DataFrame(test, columns=['year', 'sale'])
        Xt = tt['year'].values.reshape(-1, 1)
        pred = lr.predict(pf.transform(Xt))
        # NOTE(review): the statement below is cut off at the end of this
        # excerpt; its remaining arguments lie beyond the visible text.
        print('{}{}{}{}{}{}'.format('黑五节R2:', lr.score(Xp, y), ' 预测 ', pred,
for epoch in range(n_epochs): for batch_index in range(n_batches): X_batch, y_batch = random_batch(X_train, y_train, batch_size) sess.run(train_op, feed_dict={X: X_train, y:y_train}) if epoch %100 == 0: loss_val = loss.eval({X: X_train, y:y_train}) loss_val1 = loss1.eval({X: X_train, y:y_train}) print("Epoch:", epoch, "\tLoss:", loss_val, "\tLoss1:", loss_val1) best_w = w.eval() return best_w w_tf = logistic_regression(X_train_tf, y_train_tf) def log_prob(X,w): score = np.dot(X,w) prob = 1/(1 + np.exp(-score)) return prob y_prob_tf = log_prob(X_test_tf, w_tf) y_pred_tf = (y_prob_tf > 0.5).astype(int) print(accuracy_score(y_test, y_pred_tf)) from sklearn.preprocessing import PolynomialFeatures pf = PolynomialFeatures(degree=3) X_train_tf2 = pf.fit_transform(X_train) X_test_tf2 = pf.transform(X_test) w_tf2 = logistic_regression(X_train_tf2, y_train_tf) y_prob_tf2 = log_prob(X_test_tf2, w_tf2) y_pred_tf2 = (y_prob_tf2 > 0.5).astype(int) print(accuracy_score(y_test, y_pred_tf2))
# Excerpt of a flat script (features, labels, plt and LinearRegression come
# from earlier in the file). The first two lines finish a previous figure.
plt.ylabel('Claims Paid')
plt.show()

# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures
poly_object = PolynomialFeatures(degree=5)
features_poly = poly_object.fit_transform(features)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(features_poly, labels)

# A raw year cannot be passed to lin_reg_2 directly: it has to be expanded to
# the same degree-5 feature space first.
print("Predicting result with Polynomial Regression")
# Fixes: the call was missing its closing parenthesis (a SyntaxError), and the
# sample is now 2-D ([[1981]] = one sample, one feature) as scikit-learn requires.
print(lin_reg_2.predict(poly_object.transform([[1981]])))

# Visualising the Polynomial Regression results
plt.scatter(features, labels, color='red')
# reuse the already-computed expansion instead of refitting the featurizer
plt.plot(features, lin_reg_2.predict(features_poly), color='blue')
plt.title('Polynomial Regression')
plt.xlabel('Year')
plt.ylabel('Claims Paid')
plt.show()

""" https://towardsdatascience.com/polynomial-regression-bbe8b9d97491 """
# Excerpt of a flat script: 10-fold CV of two Ridge models (one per target)
# on degree-3 interaction features; the fold prediction is the sum of both
# clipped predictions, scored with rmsle().
# X, y, CASUAL, REGISTERED, COUNT and rmsle are defined before this excerpt.

# Define targets
targets = [CASUAL, REGISTERED]

# CV
n_folds = 10
rmsle_fold = np.zeros(n_folds)
# NOTE(review): KFold(n, n_folds, ...) iterated directly is the legacy
# sklearn.cross_validation API, and Ridge(normalize=True) was removed from
# recent scikit-learn releases - confirm the pinned scikit-learn version.
skf = KFold(y.shape[0], n_folds, shuffle=True, random_state=0)
i = 0
for train, test in skf:
    X_train, X_test = X.loc[train, :], X.loc[test, :]
    y_train, y_test = y.loc[train], y.loc[test]

    # Transform polynomial base functions
    # (fitted on the training fold, then applied to the test fold)
    poly = PolynomialFeatures(degree=3, interaction_only=True, include_bias=True)
    X_train = poly.fit_transform(X_train)
    X_test = poly.transform(X_test)

    # Train models
    model = {}
    for target in targets:
        # clf[target] = Lasso(random_state=0, alpha=1.0, normalize=True, max_iter=1000, tol=0.0001, positive=False,
        #                     selection='cyclic')
        model[target] = Ridge(random_state=0, alpha=1.0, normalize=True, max_iter=None, tol=0.001, solver='auto')
        model[target].fit(X_train, y_train[target])

    # Predict and clip
    # (each component is clipped at zero before summing)
    y_pred = model[CASUAL].predict(X_test).clip(min=0) + model[REGISTERED].predict(X_test).clip(min=0)

    # Evaluate
    rmsle_fold[i] = rmsle(y_test[COUNT], y_pred)
    # print(...) around one pre-formatted string is valid on Python 2 and 3.
    print('Fold %d/%d, RMSLE = %f' % (i + 1, n_folds, rmsle_fold[i]))
    # NOTE(review): no `i += 1` is visible before the excerpt ends; without it
    # every fold overwrites rmsle_fold[0] - confirm it follows.
feat_train = sc.fit_transform(feat_train)
feat_test = sc.transform(feat_test)
'''
'''
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(feat_train , labels_train)
labels_pred = regressor.predict(feat_test)
'''
# NOTE(review): everything above appears to be disabled code inside ''' ... '''
# string blocks, the first of which opens before this excerpt - confirm. The
# live code starts here.
# Degree-2 polynomial regression on feat_train / labels_train (both defined
# before this excerpt).
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
poln_object = PolynomialFeatures(degree=2)
feat_train_poln = poln_object.fit_transform(feat_train)
feat_test_poln = poln_object.transform(feat_test)
# one hand-written 12-feature sample, reshaped to a single row and expanded
feat_test_poln_abs = poln_object.transform(
    np.array([24, 23, 91, 1007, 5, 19, 33, 25, 55, 999, 3, 6]).reshape(1, -1))
lin_reg_2 = LinearRegression()
lin_reg_2.fit(feat_train_poln, labels_train)
import numpy as np
lin_2_reg_pred = lin_reg_2.predict(feat_test_poln)
lin_2_reg_pred2 = lin_reg_2.predict(feat_test_poln_abs)
score = lin_reg_2.score(feat_test_poln, labels_test)
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the fitted estimator on the expanded training
# data. NOTE(review): for a regressor the default cross_val_score metric is
# R^2, not a classification accuracy as the printed label suggests.
accuracies = cross_val_score(estimator=lin_reg_2, X=feat_train_poln, y=labels_train, cv=10)
print("mean accuracy is", accuracies.mean())
# Excerpt of a flat script (X, y, f_y, x1, x2, np, plt, Pipeline and
# LogisticRegression come from earlier in the file). The first line closes a
# previous figure.
plt.show()

# Degree-6 polynomial logistic regression, fitted through a pipeline. The
# pipeline fits the step objects in place, so `pf` and `reg` are usable
# directly afterwards.
pf = PolynomialFeatures(degree=6)
reg = LogisticRegression(C=10)
pipeline = Pipeline([("polynomial_features", pf), ("logistic_regression", reg)])
pipeline.fit(X, f_y)
theta = reg.coef_.T

# Evaluate the decision function on a 50 x 50 grid.
u = np.linspace(-1, 1.5, 50)
v = np.linspace(-1, 1.5, 50)
X1, X2 = np.meshgrid(u, v)
X1, X2 = X1.reshape(-1, 1), X2.reshape(-1, 1)
temp_X = pf.transform(np.hstack((X1, X2)))
# Fixes: the fitted intercept is part of the decision function, so it is added
# to X.theta (otherwise every contour level is shifted by a constant); and
# meshgrid(u, v) yields arrays of shape (len(v), len(u)), which is also the
# shape plt.contour(u, v, z) expects (identical here because both are 50).
z = (np.dot(temp_X, theta) + reg.intercept_).reshape(len(v), len(u))

plt.plot(x1[f_y == 0], x2[f_y == 0], 'yo')
plt.plot(x1[f_y == 1], x2[f_y == 1], 'bx')
CS = plt.contour(u, v, z)
plt.clabel(CS, inline=1, fontsize=10)
plt.show()

# Training accuracy for several regularisation strengths (C = 1 / lambda).
for lambda_coef in (0.03, 0.3, 0.1, 1, 3, 10):
    c = 1.0/lambda_coef
    pipeline.set_params(logistic_regression__C=c)
    pipeline.fit(X, y.ravel())
    # print(...) around one pre-formatted string is valid on Python 2 and 3.
    print('lambda = {}, Train Accuracy: {}'.format(lambda_coef, pipeline.score(X, y)*100))
class Config:
    """Base class holding one learner configuration plus training helpers.

    Subclasses are expected to be named ``Config<Learner>`` and to provide
    ``init_model(...)``; this base class adds optional polynomial
    (interaction-only) feature expansion, optional recursive feature
    elimination, and multi-step forecasting.
    """

    def __init__(self, adict, pc):
        """Store hyper-parameters and pipeline settings.

        :param adict: dict of hyper-parameters; every key becomes an attribute
        :param pc: pipeline configuration dict; keys read in this class are
            ``poly_degree``, ``feature_selection``, ``rfe_step``, ``horizon``
            and ``lags``
        :raises Exception: if a non-linear SVR learner is combined with
            feature selection
        """
        for k, v in adict.items():
            setattr(self, k, v)
        # Built before any other attribute is set, so the name lists exactly
        # the hyper-parameters from `adict`.
        self.name = ", ".join(["%s: %s" % x for x in self.__dict__.items()])
        self.datetime = str(datetime.now())
        self.vals = adict
        self.learner = self.__class__.__name__.replace("Config", "")
        self.pc = pc
        # for SVR, allow feature selection only for linear models
        # Fix: the guard used to test `self.name` (the "key: value, ..."
        # string above), which can never equal a learner name, so it never
        # fired. The learner name is the class name without "Config".
        if (self.learner in ["SVRpoly", "SVRrbf", "SVRsigmoid"]
                and self.pc['feature_selection'] > 0):
            raise Exception(
                "For non-linear SVR, cannot use feature selection!")
        if pc['poly_degree'] > 0:
            self.poly_features = PolynomialFeatures(pc['poly_degree'],
                                                    interaction_only=True)
        else:
            self.poly_features = None

    def __str__(self):
        return self.name

    def train(self, data):
        """Fit a model, using RFE when both RFE settings are enabled."""
        if self.pc['rfe_step'] != 0 and self.pc['feature_selection'] > 0:
            model = self.rfe_fit(data)
        else:
            model = self.fit(data)
        return model

    def fit(self, data):
        """Fit and return a model on ``data.trainX`` / ``data.trainY``.

        With early stopping (GB / XGBoost configs only) the validation split
        is appended to the training data and the model is told how many
        leading rows are training rows.
        """
        if isinstance(self, (ConfigGB, ConfigXGBoost)) and self.early_stopping:
            model = self.init_model(early_stopping=self.early_stopping,
                                    num_train=data.trainX.shape[0])
            x = np.concatenate((data.trainX, data.valX))
            y = np.concatenate((data.trainY, data.valY))
        else:
            model = self.init_model()
            x, y = data.trainX, data.trainY
        if self.poly_features:
            LOGGER.info("Fitting polynomial features ...")
            model.fit(self.poly_features.fit_transform(x), y)
        else:
            model.fit(x, y)
        return model

    def rfe_fit(self, data):
        """Recursive feature elimination

        Returns the fitted ``RFE`` selector wrapping the model.

        :raises Exception: if the configured fraction selects no feature
        """
        # NOTE(review): unlike fit(), this path never fits
        # self.poly_features, while forecast() still applies it when set -
        # confirm that both options are not meant to be combined.
        if isinstance(self, (ConfigGB, ConfigXGBoost)):
            model = self.init_model(early_stopping=self.early_stopping,
                                    num_train=data.trainX.shape[0])
        else:
            model = self.init_model()
        num = int(data.trainX.shape[1] * self.pc['feature_selection'])
        if num < 1:
            raise Exception(f"There will be {num} after selection!, " +
                            "change the feature_selection setting")
        LOGGER.debug("RFE will select %d features", num)
        step = self.pc['rfe_step']
        selector = RFE(model, n_features_to_select=num, step=step)
        if isinstance(self, (ConfigGB, ConfigXGBoost)) and self.early_stopping:
            x = np.concatenate((data.trainX, data.valX))
            y = np.concatenate((data.trainY, data.valY))
            return selector.fit(x, y)
        return selector.fit(data.trainX, data.trainY)

    def forecast(self, model, testX):
        """Return forecasts for all horizons up to `horizon`

        For every start index the model is applied ``horizon`` times; from
        the second step on, the leading ``lags`` entries of the instance are
        overwritten with the predictions made so far. A NaN prediction is
        counted as an error and replaced by 0.5.

        :return: column vector (``reshape(-1, 1)``) of collected predictions
        """
        results = []
        horizon = self.pc['horizon']
        lags = self.pc['lags']
        n_forecast_success = 0
        n_forecast_errors = 0
        first_forecast_error = None
        for i in range(len(testX) - horizon + 1):
            for j in range(horizon):
                instance = testX[i + j]
                if j == 0:
                    buf = []
                else:
                    # insert the buffer into instance
                    buf_len = len(buf)
                    if buf_len > lags:
                        # keep only the most recent `lags` predictions
                        buf = buf[-lags:]
                        buf_len = lags
                    start = lags - buf_len
                    end = start + buf_len
                    instance = np.concatenate(
                        (instance[:start], buf, instance[end:]))
                if self.poly_features:
                    poly_instance = self.poly_features.transform(
                        instance.reshape(1, -1))
                    pred_val = model.predict(poly_instance)[0]
                else:
                    pred_val = model.predict(instance.reshape(1, -1))[0]
                if np.isnan(pred_val):
                    if first_forecast_error is None:
                        first_forecast_error = instance.tolist()
                    n_forecast_errors += 1
                    pred_val = 0.5
                else:
                    n_forecast_success += 1
                buf.append(pred_val)
            # NOTE(review): the original layout was lost; the prediction is
            # recorded once per start index (the final-horizon value) -
            # confirm it was not meant to be recorded on every step.
            results.append(pred_val)
        if first_forecast_error is not None:
            # Fix: the message was passed as two f-strings, so the second one
            # became a stray %-argument and logging failed to format the
            # record.
            LOGGER.info("%d successes and %d errors during forecasting",
                        n_forecast_success, n_forecast_errors)
            LOGGER.debug("First instance with error: %s", first_forecast_error)
        return np.array(results).reshape(-1, 1)
def learning_curve_old(data, feature_cols, target_col, classifier, train_sizes, test_sizes=200000,
                       random_state=None, balanced=True, normalise=True, degree=1, pickle_path=None):
    """ Compute the learning curve of a classifier.

        Parameters
        ----------
        data : DataFrame
            The DataFrame containing all the data.

        feature_cols : array
            A list of column names in data that are used as features.

        target_col : str
            The column name of the target.

        classifier : Classifier object
            A classifier object that will be used to train and test the data.
            It should have the same interface as scikit-learn classifiers.

        train_sizes : array
            The list of the sample sizes that the classifier will be trained on.

        test_sizes : int or list of ints
            The sizes of the test set. A single int is reused for every
            training size.

        random_state : int or None
            The value of the Random State (used for reproducibility).

        balanced : boolean
            Whether the split is made with balanced_train_test_split instead
            of train_test_split.

        normalise : boolean
            Whether we should first normalise the data to zero mean and unit
            variance.

        degree : int
            If greater than 1, the data will first be polynomially transformed
            with the given degree.

        pickle_path : str
            The path where the values of the learning curve will be saved.

        Returns
        -------
        lc_accuracy_test : array
            The list of balanced accuracy scores for the given sample sizes.
    """

    lc_accuracy_test = []

    if isinstance(test_sizes, int):
        test_sizes = [test_sizes] * len(train_sizes)

    # Distinct seeds for the two shuffles. Fix: `random_state * 2` raised a
    # TypeError for the default random_state=None.
    train_seed = None if random_state is None else random_state * 2
    test_seed = None if random_state is None else random_state * 3

    for i, j in zip(train_sizes, test_sizes):
        gc.collect()
        # split data into test set and training set
        if balanced:
            # Fix: this branch referenced the undefined names `feature_names`
            # and `class_name`, so the default balanced=True path raised a
            # NameError.
            X_train, X_test, y_train, y_test = balanced_train_test_split(
                data[feature_cols], data[target_col], train_size=i, test_size=j,
                random_state=random_state)
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                np.array(data[feature_cols]), np.array(data[target_col]),
                train_size=i, test_size=j, random_state=random_state)

        X_train, y_train = shuffle(X_train, y_train, random_state=train_seed)
        X_test, y_test = shuffle(X_test, y_test, random_state=test_seed)

        if normalise:
            # scaler statistics come from the training split only
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        if degree > 1:
            poly_features = PolynomialFeatures(degree=degree, interaction_only=False,
                                               include_bias=True)
            X_train = poly_features.fit_transform(X_train)
            X_test = poly_features.transform(X_test)

        # train the classifier
        classifier.fit(X_train, y_train)

        # apply classifier on test set
        y_pred_test = classifier.predict(X_test)
        confusion_test = metrics.confusion_matrix(y_test, y_pred_test)
        lc_accuracy_test.append(balanced_accuracy_expected(confusion_test))

    # pickle learning curve
    if pickle_path:
        with open(pickle_path, 'wb') as f:
            pickle.dump(lc_accuracy_test, f, protocol=4)

    return lc_accuracy_test
class Classifier:
    """Wrapper around a probabilistic model with shared preprocessing.

    Preprocessing consists of label encoding, optional cluster features,
    polynomial expansion and optional standard scaling. ``self.model`` is not
    created here; it has to be set by other code before any evaluate/predict
    method is called.
    """

    def __init__(self, **params):
        """Consume the wrapper's own settings from ``params``.

        Required keys: ``val_size``, ``feature_degree``, ``feature_scaling``,
        ``including_classes``, ``add_cluster_features``, ``shuffle`` and
        ``random_state``. Whatever remains is kept as ``self.model_params``.

        :raises KeyError: if one of the required keys is missing
        """
        self.val_size = params.pop('val_size')
        self.feature_degree = params.pop('feature_degree')
        self.feature_scaling = params.pop('feature_scaling')
        self.including_classes = params.pop('including_classes')
        self.add_cluster_features = params.pop('add_cluster_features')
        # shuffling is only enabled together with cluster features
        self.shuffle = params.pop('shuffle') and self.add_cluster_features
        self.random_state = params.pop('random_state')
        self.model_params = params
        self.his = {
            'acc': None,
            'loss': None,
            'val_acc': None,
            'val_loss': None
        }
        self.model = None
        self.cm = None
        self.score = 0

    def preprocess_data(self, X, y):
        """Encode labels, split, and fit/apply the feature transforms.

        The polynomial expansion and the scaler are fitted on the training
        split only and then applied to the validation split when it is
        non-empty.

        :return: ``(X_train, X_val, y_train, y_val)``
        """
        y = y.astype('int')
        self.le = LabelEncoder()
        y = self.le.fit_transform(y)
        self.num_labels = max(y) + 1
        self.labels = list(range(self.num_labels))
        self.labels_origin = self.le.inverse_transform(self.labels).tolist()

        X_train, X_val, y_train, y_val = split_data(
            X, y, self.val_size, self.shuffle, self.random_state)

        if self.add_cluster_features:
            X_train, y_train = add_cluster_features(X_train, y_train)
            X_val, y_val = add_cluster_features(X_val, y_val)

        self.poly = PolynomialFeatures(self.feature_degree, include_bias=False)
        X_train = self.poly.fit_transform(X_train)
        if len(X_val) > 0:
            X_val = self.poly.transform(X_val)

        self.num_samples, self.num_features = X_train.shape

        self.sc = StandardScaler()
        if self.feature_scaling:
            X_train = self.sc.fit_transform(X_train)
            if len(X_val) > 0:
                X_val = self.sc.transform(X_val)

        return X_train, X_val, y_train, y_val

    def preprocess_X(self, X):
        """Apply the already-fitted transforms to new samples."""
        if self.add_cluster_features:
            # NOTE(review): preprocess_data() uses add_cluster_features(X, y)
            # while this path calls add_features(X) - confirm both helpers
            # exist and produce the same columns.
            X = add_features(X)
        X = self.poly.transform(X)
        if self.feature_scaling:
            X = self.sc.transform(X)
        return X

    def evaluate_helper(self, X, y, radius, verbose):
        """Score preprocessed ``X`` against encoded labels ``y``.

        Stores the confusion matrix of the smoothened predictions in
        ``self.cm``.

        :return: ``(accuracy after smoothening, log loss or -1)``
        """
        prob = self.model.predict_proba(X)
        try:
            loss = log_loss(y, prob)
        except Exception:
            # The loss is informational only: keep the -1 sentinel instead of
            # failing, without swallowing KeyboardInterrupt/SystemExit as the
            # former bare `except:` did.
            loss = -1

        pred = self.model.predict(X=X)
        accuracy = np.count_nonzero(y == pred) / len(y)
        if verbose:
            print('Accuracy: ', accuracy * 100, ' Loss: ', loss)

        pred = smoothen(pred, radius)
        accuracy = np.count_nonzero(y == pred) / len(y)
        if verbose and radius > 0:
            print('Accuracy after smoothening with radius =', radius, ': ',
                  accuracy * 100)

        self.cm = confusion_matrix(y, pred, labels=self.labels)
        return accuracy, loss

    def evaluate_test(self, radius=0, verbose=False):
        """Evaluate on ``self.X_val`` / ``self.y_val`` and store the score.

        NOTE(review): ``X_val`` / ``y_val`` are not assigned anywhere in this
        class; presumably set by a subclass or caller - confirm.
        """
        if len(self.X_val) > 0:
            if verbose:
                print('\nEvaluating on test set...')
            X = self.X_val
            y = self.y_val
            accuracy, loss = self.evaluate_helper(X, y, radius, verbose)
            self.score = accuracy * 100

    def evaluate(self, X, y, radius=0, verbose=False):
        """Filter, preprocess and score raw data.

        :return: ``{'acc': accuracy, 'loss': loss}``
        """
        X, y = filt_data(X, y, self.including_classes)
        X = self.preprocess_X(X)
        y = self.le.transform(y)
        accuracy, loss = self.evaluate_helper(X, y, radius, verbose)
        return {'acc': accuracy, 'loss': loss}

    def judge(self, X, y, radius=0, verbose=False, threshold=0.8):
        """Return a confusion matrix with an extra column for rejections.

        Predictions whose confidence is below ``threshold`` are replaced by
        the module-level ``judge`` helper; ``-9999`` is appended to the label
        list so those rejected predictions are counted.
        """
        X, y = filt_data(X, y, self.including_classes)
        X = self.preprocess_X(X)
        y = self.le.transform(y)
        prob = self.model.predict_proba(X)
        pred = prob.argmax(axis=1)
        confidence = np.max(prob, axis=1)
        # Fix: pass the per-sample confidence, as predict() and get_result()
        # do. The full probability matrix was passed here while `confidence`
        # was computed and never used.
        pred = judge(pred, confidence, threshold=threshold)
        pred = smoothen(pred, radius)
        cm = confusion_matrix(y, pred, labels=self.labels + [-9999])
        return cm

    def probability(self, X):
        """Return class probabilities for raw samples ``X``."""
        X = self.preprocess_X(X)
        return self.model.predict_proba(X)

    def _labelled_predictions(self, X, radius, threshold):
        """Return (smoothened original-label predictions, probabilities)."""
        X = self.preprocess_X(X)
        prob = self.model.predict_proba(X)
        pred = prob.argmax(axis=1)
        confidence = np.max(prob, axis=1)
        pred = self.le.inverse_transform(pred)
        pred = judge(pred, confidence, threshold=threshold, null_type=None)
        pred = smoothen(pred, radius)
        return pred, prob

    def predict(self, X=None, radius=0, threshold=0.0):
        """Predict original labels; low-confidence ones become ``None``."""
        pred, _ = self._labelled_predictions(X, radius, threshold)
        return pred

    def get_result(self, X, radius=0, threshold=0.0):
        """Return predictions and cumulated probabilities as plain lists.

        :return: ``dict(target=[...], prob=[...])``
        """
        pred, prob = self._labelled_predictions(X, radius, threshold)
        cum_prob = cumulate_prob(prob)
        return dict(target=pred.tolist(),
                    prob=cum_prob.tolist())

    def get_cm_data_url(self, id):
        """Render the stored confusion matrix and return it as a data URL.

        The image is written to ``<id>.png``, converted and removed again.
        Returns ``None`` when no confusion matrix has been computed yet.
        """
        if self.cm is None:
            return None
        draw_confusion_matrix(self.cm, self.labels_origin + [''])
        img = id + '.png'
        plt.savefig(img)
        data_url = image_to_data_url(img)
        os.remove(img)
        return data_url

    @classmethod
    def load(cls, file_name):
        """Load a previously dumped classifier with joblib."""
        return joblib.load(file_name)
# # Теперь рассмотрим способы преобразования признаков. Существует достаточно много различных способов трансформации признаков, которые позволяют при помощи линейных методов получать более сложные разделяющие поверхности. Самым базовым является полиномиальное преобразование признаков. Его идея заключается в том, что помимо самих признаков вы дополнительно включаете набор все полиномы степени $p$, которые можно из них построить. Для случая $p=2$ преобразование выглядит следующим образом: # # $$ \phi(x_i) = [x_{i,1}^2, ..., x_{i,D}^2, x_{i,1}x_{i,2}, ..., x_{i,D}, x_{i,D-1}, x_{i,1}, ..., x_{i,D}, 1] $$ # # Рассмотрим принцип работы данных признаков на данных, сэмплированных их гауссиан: # In[ ]: from sklearn.preprocessing import PolynomialFeatures """Инициализируем класс, который выполняет преобразование""" transform = PolynomialFeatures(2) """Обучаем преобразование на обучающей выборке, применяем его к тестовой""" example_data_train_poly = transform.fit_transform(example_data_train) example_data_test_poly = transform.transform(example_data_test) """Обращаем внимание на параметр fit_intercept=False""" optimizer = GridSearchCV(LogisticRegression(class_weight='balanced', fit_intercept=False), param_grid, cv=cv, n_jobs=-1) optimizer.fit(example_data_train_poly, example_labels_train) Z = optimizer.predict(transform.transform(np.c_[xx.ravel(), yy.ravel()])).reshape(xx.shape) plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel2) plt.scatter(data_0[:,0], data_0[:,1], color='red') plt.scatter(data_1[:,0], data_1[:,1], color='blue') plt.title('With class weights') plt.show() # Видно, что данный метод преобразования данных уже позволяет строить нелинейные разделяющие поверхности, которые могут более тонко подстраиваться под данные и находить более сложные зависимости. Число признаков в новой модели: # In[ ]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
feat_train = sc.fit_transform(feat_train)
feat_test = sc.transform(feat_test)
'''
'''
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(feat_train , labels_train)
labels_pred = regressor.predict(feat_test)
'''
# NOTE(review): everything above appears to be disabled code inside ''' ... '''
# string blocks, the first of which opens before this excerpt - confirm. The
# live code starts here.
# Degree-2 polynomial regression on feat_train / labels_train (both defined
# before this excerpt).
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
poln_object = PolynomialFeatures(degree = 2)
feat_train_poln = poln_object.fit_transform(feat_train)
feat_test_poln = poln_object.transform(feat_test)
# one hand-written 12-feature sample, reshaped to a single row and expanded
feat_test_poln_abs = poln_object.transform(np.array([24,23,91,1007,5,19,33,25,55,999,3,6]).reshape(1,-1))
lin_reg_2 = LinearRegression()
lin_reg_2.fit(feat_train_poln,labels_train)
import numpy as np
lin_2_reg_pred = lin_reg_2.predict(feat_test_poln)
lin_2_reg_pred2 = lin_reg_2.predict(feat_test_poln_abs)
score = lin_reg_2.score(feat_test_poln , labels_test)
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the fitted estimator on the expanded training
# data. NOTE(review): for a regressor the default cross_val_score metric is
# R^2, not a classification accuracy as the printed label suggests.
accuracies = cross_val_score(estimator = lin_reg_2, X = feat_train_poln, y = labels_train, cv = 10)
print ("mean accuracy is",accuracies.mean())
print (accuracies.std())