Пример #1
0
 def get_loss(self, X_valid, Y_valid):
     """
     Score the model on a validation set.

     Flattens ``Y_valid`` to 1-D, predicts on ``X_valid`` via
     ``self.predict``, logs both shapes at debug level, and returns
     ``pearson_correlation(predictions, flattened_targets)``.
     """
     flat_targets = Y_valid.reshape(-1)
     predictions = self.predict(X_valid)
     # Shape logging helps spot a prediction/target length mismatch.
     logger.debug('y_pred : shape{}'.format(predictions.shape))
     logger.debug('Y_valid : shape{}'.format(flat_targets.shape))
     score = pearson_correlation(predictions, flat_targets)
     return score
Пример #2
0
# Targets as a plain array (``.values`` of train_ori_Y).
train_Y = train_ori_Y.values

# # Cross-validation

# Three shuffled train/test splits over the rows of train_X; a fresh model is
# fitted on each split and its score on the held-out rows is printed.
n_splits = 3
cv = ShuffleSplit(n_splits=n_splits)
for train_indices, test_indices in cv.split(train_X):
    # NOTE(review): LinearRegression takes learning_rate/num_iterations here,
    # so it is presumably a project-local implementation — confirm.
    lr = LinearRegression(learning_rate=0.01, num_iterations=3000)
    # Alternative estimators that were tried as a sanity check (disabled):
    #     lr = LinearSVR()
    #     lr = MLPRegressor(hidden_layer_sizes=50)
    #     lr.fit(train_X[train_indices], train_Y[train_indices], watch=True)
    # NOTE(review): the meaning of ``watch=True`` is not visible in this
    # snippet — presumably progress reporting; confirm.
    lr.fit(train_X[train_indices], train_Y[train_indices], watch=True)
    y_pred = lr.predict(train_X[test_indices])
    print(pearson_correlation(y_pred, train_Y[test_indices]))

# # Train the model and write out the results

# Refit on the full training data with the same hyperparameters as above.
# This rebinds ``lr`` and ``y_pred`` left over from the loop.
lr = LinearRegression(learning_rate=0.01, num_iterations=3000)
lr.fit(train_X, train_Y, watch=True)

y_pred = lr.predict(test_X)

# Plot the target distribution and the prediction distribution for comparison.
# NOTE(review): if ``sns`` is seaborn, ``distplot`` is deprecated since 0.11
# in favour of ``histplot``/``displot`` — confirm the installed version.
sns.distplot(train_Y)

sns.distplot(y_pred)

# Wrap the predictions in a DataFrame for export.
sub = pd.DataFrame(y_pred)
sub.to_csv('../results/' + 'LinearRegression-0.01-3000-' +
           str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + ".csv",
Пример #3
0
# Feature scheme 0: no engineered features, use the raw inputs as-is.
# NOTE: these two assignments are immediately overwritten by scheme 1 below;
# comment out scheme 1 to actually use scheme 0.
train_X_feat = train_ori_X
test_X_feat = test_ori_X

# Feature scheme 1: add proportion features while keeping the original ones.
# NOTE(review): that get_proportion_feature_1 keeps the original columns is
# taken from the original comment, not visible in this snippet — confirm.
train_X_feat = get_proportion_feature_1(train_ori_X)
test_X_feat = get_proportion_feature_1(test_ori_X)

# Bare expression: displays the column index when run as a notebook cell;
# has no effect when run as a plain script.
train_X_feat.columns

# Print the correlation coefficient between each feature and the score.
# Iterating train_X_feat yields feature names (presumably DataFrame column
# labels — confirm).
for feat_name in train_X_feat:
    print("{} : {}".format(
        feat_name,
        pearson_correlation(train_X_feat[feat_name].values,
                            train_ori_Y.values)))

# Features to remove from both the training and the test feature sets.
drop_feature_names = [
    'Total_Number_of_Reviews_Reviewer_Has_Given', 'with_pet_score'
]
for drop_feature_name in drop_feature_names:
    # axis=1 drops by column label; the result is rebound, not modified in place.
    train_X_feat = train_X_feat.drop(labels=drop_feature_name, axis=1)
    test_X_feat = test_X_feat.drop(labels=drop_feature_name, axis=1)

# Print the per-feature correlation again, now for the remaining features.
for feat_name in train_X_feat:
    print("{} : {}".format(
        feat_name,
        pearson_correlation(train_X_feat[feat_name].values,
                            train_ori_Y.values)))
Пример #4
0
 def get_test_cost(self, X, Y):
     """Return ``pearson_correlation`` of ``self.predict(X)`` against ``Y``."""
     return pearson_correlation(self.predict(X), Y)
Пример #5
0
# # Build the training and test sets, and standardize them

# Feature scheme 0: no engineered features, use the raw inputs as-is.
# NOTE: immediately overwritten by scheme 1 below.
train_X_feat = train_ori_X
test_X_feat = test_ori_X

# Feature scheme 1: add proportion features while keeping the original ones.
# NOTE(review): that get_proportion_feature_1 keeps the original columns is
# taken from the original comment, not visible in this snippet — confirm.
train_X_feat = get_proportion_feature_1(train_ori_X)
test_X_feat = get_proportion_feature_1(test_ori_X)

# Bare expression: displays the column index when run as a notebook cell.
train_X_feat.columns

# Print the correlation coefficient between each feature and the score.
for feat_name in train_X_feat:
    print("{} : {}".format(feat_name, pearson_correlation(train_X_feat[feat_name].values, train_ori_Y.values)))

# Option 1: no per-feature weights.
# The scaler is fitted on the training features only and then applied to the
# test features, so both are transformed with the same fitted scaler.
# NOTE: the results of option 1 are overwritten by option 2 below.
ss = StandardScaler()
train_X = ss.fit_transform(train_X_feat.values)
test_X = ss.transform(test_X_feat.values)

# Option 2: same scaling, then up-weight selected columns.
ss = StandardScaler()
train_X = ss.fit_transform(train_X_feat.values)
test_X = ss.transform(test_X_feat.values)
# Increase the weight of some features by doubling columns 1, 2 and 4 in place.
# NOTE(review): only train_X is reweighted in the visible code; test_X is not.
# Confirm that the same scaling is applied to test_X elsewhere, otherwise
# train and test features are on different scales.
train_X[:,1] *= 2
train_X[:,2] *= 2
train_X[:,4] *= 2
Пример #6
0
 def test_pearson_correlation_single_feature(self):
     """Check pearson_correlation on two 1-D arrays differing in one element."""
     predicted = np.array([1, 2, 3, 4, 5, 6, 3, 1])
     actual = np.array([1, 2, 3, 4, 5, 6, 4, 1])
     expected = 0.98122119
     # assertAlmostEqual's default tolerance: the difference must round to
     # zero at 7 decimal places.
     self.assertAlmostEqual(pearson_correlation(actual, predicted), expected)
Пример #7
0
 def test_pearson_correlation_multi_feature(self):
     """Check pearson_correlation on 2-D inputs with two mirrored columns."""
     series_a = [1, 2, 3, 4, 5, 6, 3, 1]
     series_b = [1, 2, 3, 4, 5, 6, 4, 1]
     # Transposing gives one series per column; the columns of y_true are
     # those of y_pred in swapped order.
     y_pred = np.array([series_a, series_b]).T
     y_true = np.array([series_b, series_a]).T
     expected = 0.98122119
     # assertAlmostEqual's default tolerance: the difference must round to
     # zero at 7 decimal places.
     self.assertAlmostEqual(pearson_correlation(y_true, y_pred), expected)