def get_loss(self, X_valid, Y_valid):
    """Pearson correlation between this model's predictions and the targets.

    Y_valid is flattened to 1-D before comparison so shapes always line up.
    """
    flat_targets = Y_valid.reshape(-1)
    predictions = self.predict(X_valid)
    logger.debug('y_pred : shape{}'.format(predictions.shape))
    logger.debug('Y_valid : shape{}'.format(flat_targets.shape))
    return pearson_correlation(predictions, flat_targets)
train_Y = train_ori_Y.values

# # Cross-validation: estimate out-of-sample Pearson correlation.
n_splits = 3
cv = ShuffleSplit(n_splits=n_splits)
for train_indices, test_indices in cv.split(train_X):
    lr = LinearRegression(learning_rate=0.01, num_iterations=3000)
    # Alternatives previously tried here: LinearSVR(), MLPRegressor(hidden_layer_sizes=50).
    lr.fit(train_X[train_indices], train_Y[train_indices], watch=True)
    y_pred = lr.predict(train_X[test_indices])
    print(pearson_correlation(y_pred, train_Y[test_indices]))

# # Train the final model on the full training set and write the results.
lr = LinearRegression(learning_rate=0.01, num_iterations=3000)
lr.fit(train_X, train_Y, watch=True)
y_pred = lr.predict(test_X)
# Visual sanity check: prediction distribution vs. training-target distribution.
sns.distplot(train_Y)
sns.distplot(y_pred)
sub = pd.DataFrame(y_pred)
# strftime already returns a str, so no extra str() wrapper is needed.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
# NOTE(review): the original call was truncated after this comma in the source;
# index=False is the usual choice for submission files -- confirm against the grader.
sub.to_csv('../results/' + 'LinearRegression-0.01-3000-' + timestamp + ".csv",
           index=False)
# Feature scheme 0: use the original features unchanged.
train_X_feat = train_ori_X
test_X_feat = test_ori_X

# Feature scheme 1: add proportion features without dropping the originals.
# NOTE(review): this overwrites scheme 0 above -- comment one scheme out to choose.
train_X_feat = get_proportion_feature_1(train_ori_X)
test_X_feat = get_proportion_feature_1(test_ori_X)

train_X_feat.columns  # notebook-style inspection of the feature names

# Pearson correlation of every feature with the score.
for feat_name in train_X_feat:
    print("{} : {}".format(
        feat_name,
        pearson_correlation(train_X_feat[feat_name].values,
                            train_ori_Y.values)))

# Drop the weakly-correlated features.
# DataFrame.drop accepts the whole label list, so one call replaces the loop.
drop_feature_names = [
    'Total_Number_of_Reviews_Reviewer_Has_Given', 'with_pet_score'
]
train_X_feat = train_X_feat.drop(labels=drop_feature_names, axis=1)
test_X_feat = test_X_feat.drop(labels=drop_feature_names, axis=1)

# Re-check each remaining feature's correlation with the score after the drop.
for feat_name in train_X_feat:
    print("{} : {}".format(
        feat_name,
        pearson_correlation(train_X_feat[feat_name].values,
                            train_ori_Y.values)))
def get_test_cost(self, X, Y):
    """Score the model on (X, Y): Pearson correlation of predictions vs. Y."""
    return pearson_correlation(self.predict(X), Y)
# # Build the train/test matrices and normalize them.

# Feature scheme 0: use the original features unchanged.
train_X_feat = train_ori_X
test_X_feat = test_ori_X

# Feature scheme 1: add proportion features without dropping the originals.
# NOTE(review): this overwrites scheme 0 above -- comment one scheme out to choose.
train_X_feat = get_proportion_feature_1(train_ori_X)
test_X_feat = get_proportion_feature_1(test_ori_X)

train_X_feat.columns  # notebook-style inspection of the feature names

# Pearson correlation of every feature with the score.
for feat_name in train_X_feat:
    print("{} : {}".format(
        feat_name,
        pearson_correlation(train_X_feat[feat_name].values,
                            train_ori_Y.values)))

# Standardize: fit the scaler on the training data only, then apply to test.
# (The original fitted an identical second scaler for "scheme 2"; that redundant
# refit produced the same output and has been removed.)
ss = StandardScaler()
train_X = ss.fit_transform(train_X_feat.values)
test_X = ss.transform(test_X_feat.values)

# Scheme 2: double the weight of selected columns after scaling.
# BUG FIX: the original scaled only train_X; test_X must get the same weights,
# otherwise train and test features live on different scales at predict time.
for weighted_col in (1, 2, 4):
    train_X[:, weighted_col] *= 2
    test_X[:, weighted_col] *= 2
def test_pearson_correlation_single_feature(self):
    """Correlation of two 1-D vectors differing in a single position."""
    predicted = np.array([1, 2, 3, 4, 5, 6, 3, 1])
    actual = np.array([1, 2, 3, 4, 5, 6, 4, 1])
    # assertAlmostEqual's default compares to 7 decimal places.
    self.assertAlmostEqual(pearson_correlation(actual, predicted), 0.98122119)
def test_pearson_correlation_multi_feature(self):
    """Correlation with two feature columns (samples along axis 0)."""
    predicted = np.array([[1, 2, 3, 4, 5, 6, 3, 1],
                          [1, 2, 3, 4, 5, 6, 4, 1]]).T
    actual = np.array([[1, 2, 3, 4, 5, 6, 4, 1],
                       [1, 2, 3, 4, 5, 6, 3, 1]]).T
    # assertAlmostEqual's default compares to 7 decimal places.
    self.assertAlmostEqual(pearson_correlation(actual, predicted), 0.98122119)