-
Notifications
You must be signed in to change notification settings - Fork 0
/
usedMain.py
320 lines (278 loc) · 14.2 KB
/
usedMain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
import datetime
from hmmlearn.hmm import GaussianHMM
from matplotlib.pylab import style
from scipy.stats.stats import pearsonr
from sklearn import model_selection
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
style.use('ggplot')
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# 读取数据
from Tool import Tool
def getEntropy(s):
    """Return the Shannon entropy (base 2) of the value distribution of s.

    Parameters
    ----------
    s : pd.Series or array-like
        Sequence of hashable values, e.g. a cluster-label sequence.

    Returns
    -------
    float
        Entropy in bits; 0.0 when all values are identical.
    """
    # Fix: isinstance against the public pd.Series, not the private
    # pd.core.series.Series path.
    if not isinstance(s, pd.Series):
        s = pd.Series(s)
    # Relative frequency of each distinct value (replaces the groupby-count
    # construction with the idiomatic value_counts).
    prob = s.value_counts(normalize=True).values
    return -(np.log2(prob) * prob).sum()
def phqCluster(data):
    """Cluster the rows of data into 5 groups with k-means.

    Parameters
    ----------
    data : ndarray of shape (n_samples, n_features)
        One time-series profile per row (normalization is the caller's job).

    Returns
    -------
    (centers, labels)
        Cluster centres of shape (5, n_features) and the per-row label array.
    """
    # Fix: dropped the unused `m, n = data.shape` unpacking.
    # random_state is fixed so cluster assignments are reproducible.
    kmeans = KMeans(n_clusters=5, random_state=1).fit(data)
    return kmeans.cluster_centers_, kmeans.labels_
def pearsonLagSingle(P_device, P_j, timelag):
    """Maximum Pearson correlation between two series over lags in [-timelag, timelag].

    Each candidate shift correlates the overlapping segments of the two
    series; the largest coefficient across all shifts (including zero)
    is returned.
    """
    window = -timelag if timelag < 0 else timelag
    best = pearsonr(P_device, P_j)[0]
    for shift in range(1, window + 1):
        # Shift one series forward, then the other.
        forward = pearsonr(P_device[shift:], P_j[:-shift])[0]
        backward = pearsonr(P_device[:-shift], P_j[shift:])[0]
        best = max(best, forward, backward)
    return best
# Find the N devices most correlated with device i (lagged Pearson
# coefficient; lower `timelag` to speed this up).
def correlation(P_total, device_index, N, day_point=None):
    """Return the N devices most correlated with the target device.

    Correlation is the lag-tolerant Pearson coefficient computed by
    pearsonLagSingle over a +/-50-sample window.

    Parameters
    ----------
    P_total : pd.DataFrame
        Load data, one device per column.
    device_index : int
        Column index of the target device; -1 means the row-wise sum
        (whole-user load).
    N : int
        Number of top devices to return (capped at n_devices - 1).
    day_point : optional
        Unused.  Fix: given a default so callers that omit it (e.g.
        oldMain) no longer raise TypeError; kept in the signature for
        backward compatibility.

    Returns
    -------
    pd.Series
        Coefficients indexed by column name, sorted descending, length N.
    """
    if device_index == -1:
        # Whole-user load: sum across all device columns.
        P_device = P_total.apply(lambda x: x.sum(), axis=1)
    else:
        P_device = P_total.iloc[:, device_index]
    timelag = 50  # search window for the lagged correlation
    # -1 sentinel keeps the target device itself at the bottom of the ranking.
    device_pearsonLag = [-1 for _ in range(P_total.shape[1])]
    for j in range(P_total.shape[1]):
        if j == device_index:
            continue
        P_j = P_total.iloc[:, j]
        device_pearsonLag[j] = pearsonLagSingle(P_device, P_j, timelag)
    if N >= len(device_pearsonLag):
        N = len(device_pearsonLag) - 1
    device_pearsonLag = pd.Series(data=device_pearsonLag, index=P_total.columns)
    device_pearsonLag.sort_values(ascending=False, inplace=True)
    return device_pearsonLag[:N]
# Returns two lists: the true values and the predicted values.
def train_forecast(P_total, corr_device, device_index, day_point=480):
    """Train an XGBoost regressor on the load series and forecast one day ahead.

    Features per sample: calendar fields, the load 7 days / 1 day earlier,
    and the same lagged loads of the most correlated devices.

    Parameters
    ----------
    P_total : pd.DataFrame
        Load data, one device per column, with a DatetimeIndex.
    corr_device : pd.Series
        Correlated devices (index = column names) as returned by correlation().
    device_index : int
        Target column; -1 forecasts the summed whole-user load.
    day_point : int, optional
        Samples per day.  Fix: now has the documented default of 480 so
        existing 3-argument callers (e.g. oldMain) work.

    Returns
    -------
    (list, list)
        True values of the last week, and predicted values of the last
        week plus the extra forecast day.
    """
    if device_index == -1:  # whole-user analysis
        P_forecast = P_total.apply(lambda x: x.sum(), axis=1)
    else:
        P_forecast = P_total.iloc[:, device_index]
    # Targets start one week in so that the 7-day-lag features exist.
    y_total = P_forecast[day_point * 7:].reset_index(drop=True)
    X_total = pd.DataFrame(index=range(len(y_total)))
    timeStamp = pd.Series(P_forecast[day_point * 7:].index)
    # Calendar features.  NOTE(review): 'weekday' is filled with the day of
    # month (x.day), not the weekday — preserved as-is to avoid changing
    # model behavior; confirm intent with the original author.
    X_total['month'] = timeStamp.map(lambda x: x.month)
    X_total['weekday'] = timeStamp.map(lambda x: x.day)
    X_total['hour'] = timeStamp.map(lambda x: x.hour)
    # Lagged load of the target device.
    X_total['7dAgo'] = P_forecast[:-day_point * 7].reset_index(drop=True)
    X_total['1dAgo'] = P_forecast[day_point * 6:-day_point * 1].reset_index(drop=True)
    # Lagged load of each correlated device.
    for i in range(len(corr_device)):
        P_corr = P_total.loc[:, corr_device.index[i]]
        X_total['7dAgo_corr' + str(i)] = P_corr[:-day_point * 7].reset_index(drop=True)
        X_total['1dAgo_corr' + str(i)] = P_corr[day_point * 6:-day_point * 1].reset_index(drop=True)
    # Min-max normalization.
    X_norm = (X_total - X_total.min()) / (X_total.max() - X_total.min())
    y_norm = (y_total - y_total.min()) / (y_total.max() - y_total.min())
    y_min = y_total.min()
    y_max = y_total.max()
    # Train/test split.
    X_train_norm, X_test_norm, y_train_norm, y_test_norm = model_selection.train_test_split(
        X_norm, y_norm, test_size=0.3)
    y_test = y_test_norm * (y_max - y_min) + y_min
    y_train = y_train_norm * (y_max - y_min) + y_min
    # Train the model.
    other_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                    'subsample': 0.7, 'colsample_bytree': 0.6, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
    estimator = xgb.XGBRegressor(**other_params).fit(X_train_norm, y_train_norm)
    # Test-set error.
    y_predict_test_norm = estimator.predict(X_test_norm)
    y_predict_test = y_predict_test_norm * (y_max - y_min) + y_min
    MAPE_test = np.mean(abs(y_predict_test - y_test) / y_test) * 100
    RMSE_test = np.sqrt(np.mean((y_predict_test - y_test) ** 2))
    # Fix: the original passed the metrics as extra print() arguments, so the
    # %f placeholders were never filled in.
    print('MAPE_test: %f, RMSE_test: %f' % (MAPE_test, RMSE_test))
    # Training-set error.
    y_predict_train_norm = estimator.predict(X_train_norm)
    y_predict_train = y_predict_train_norm * (y_max - y_min) + y_min
    MAPE_train = np.mean(abs(y_predict_train - y_train) / y_train) * 100
    RMSE_train = np.sqrt(np.mean((y_predict_train - y_train) ** 2))
    print('MAPE_train: %f, RMSE_train: %f' % (MAPE_train, RMSE_train))
    # Build the feature set for the day after the data ends.
    # Fix: use day_point instead of the hard-coded 480.
    X_nextday = X_total.iloc[-day_point:, :].reset_index(drop=True)
    X_nextday['7dAgo'] = P_forecast[-day_point * 7:-day_point * 6].reset_index(drop=True)
    X_nextday['1dAgo'] = P_forecast[-day_point:].reset_index(drop=True)
    for i in range(len(corr_device)):
        P_corr = P_total.loc[:, corr_device.index[i]]
        X_nextday['7dAgo_corr' + str(i)] = P_corr[-day_point * 7:-day_point * 6].reset_index(drop=True)
        X_nextday['1dAgo_corr' + str(i)] = P_corr[-day_point:].reset_index(drop=True)
    # Normalize with the training min/max so scales match.
    X_nextday_norm = (X_nextday - X_total.min()) / (X_total.max() - X_total.min())
    y_predict_nextday_norm = estimator.predict(X_nextday_norm)
    y_predict_nextday = y_predict_nextday_norm * (y_max - y_min) + y_min
    # Plot the last week (plus the extra forecast day).
    y_predict_norm = estimator.predict(X_norm)
    y_predict = y_predict_norm * (y_max - y_min) + y_min
    plt.figure(figsize=(16, 8))
    plt.plot(np.array(y_total)[-7 * day_point:], label="y_true")
    plt.plot(np.concatenate((y_predict[-7 * day_point:], y_predict_nextday), axis=0), label="y_predict")
    plt.legend()
    a = np.array(y_total)[-7 * day_point:]
    b = np.concatenate((y_predict[-7 * day_point:], y_predict_nextday), axis=0)
    return a.tolist(), b.tolist()
def cluster(P_total, device_index, day_point):
    """Cluster the load series at hourly and daily granularity.

    The series is cut into fixed-length windows (one hour / one day),
    each window is max-normalized, and both window sets are k-means
    clustered into 5 groups via phqCluster.

    Parameters
    ----------
    P_total : pd.DataFrame
        Load data, one device per column.
    device_index : int
        Target column; -1 analyzes the summed whole-user load.
    day_point : int
        Samples per day; assumed divisible by 24 to derive the hourly
        window length.

    Returns
    -------
    (hour_centers, hour_labels, day_centers, day_labels)
    """
    if device_index == -1:  # whole-user analysis
        series = np.array(P_total.apply(lambda x: x.sum(), axis=1))
    else:
        series = np.array(P_total.iloc[:, device_index])
    hour_point = day_point // 24
    # Trim the tail so the series reshapes into whole windows.
    hourly = series[:len(series) // hour_point * hour_point].reshape([-1, hour_point])
    daily = series[:len(series) // day_point * day_point].reshape([-1, day_point])
    # Normalize each window before clustering so shape, not magnitude, drives it.
    hourly = normalize(hourly, axis=1, norm='max')
    daily = normalize(daily, axis=1, norm='max')
    centers_hour, labels_hour = phqCluster(hourly)
    centers_day, labels_day = phqCluster(daily)
    return centers_hour, labels_hour, centers_day, labels_day
# return hourList,dayList,kmeans_hour,labels_hour,kmeans_day,labels_day
def profileFeature(P_total, device_index, kmeans_hour, kmeans_day, labels_hour, labels_day, temp8760):
    """Build static and dynamic behavior features for one device.

    Static features: basic statistics, FFT mean/std, and the two sets of
    cluster centres.  Dynamic features: HMM transition matrices and the
    entropy of the hourly/daily cluster-label sequences.

    Returns
    -------
    (staticFeatures, dynamicFeatures, tempload, temp, scattertemp,
    scatterdataunique) — the last four come from plotTempFeature.
    """
    if device_index == -1:  # whole-user analysis
        series = P_total.apply(lambda x: x.sum(), axis=1)
    else:
        series = P_total.iloc[:, device_index]
    # NOTE(review): np.mean/np.std of a raw FFT are complex-valued features
    # — presumably intentional; confirm downstream consumers handle that.
    spectrum = np.fft.fft(series)
    staticFeatures = [series.max(), series.min(), series.median(), series.mean(),
                      series.std(), np.mean(spectrum), np.std(spectrum),
                      kmeans_hour, kmeans_day]
    n_hidden_states = 5
    # Fit one HMM per time scale on the cluster-label sequence; the
    # transition matrix describes how usage modes follow one another.
    hmm_hour = GaussianHMM(n_components=n_hidden_states)
    hmm_hour.fit(labels_hour.reshape(-1, 1))
    transmat_hour = hmm_hour.transmat_  # transition matrix
    entropy_hour = getEntropy(labels_hour)  # behavior entropy
    hmm_day = GaussianHMM(n_components=n_hidden_states)
    hmm_day.fit(labels_day.reshape(-1, 1))
    transmat_day = hmm_day.transmat_  # transition matrix
    entropy_day = getEntropy(labels_day)  # behavior entropy
    dynamicFeatures = [transmat_hour, entropy_hour, transmat_day, entropy_day]
    tempload, temp, scattertemp, scatterdataunique = plotTempFeature(series, temp8760)
    return staticFeatures, dynamicFeatures, tempload, temp, scattertemp, scatterdataunique
def getData(data, date, day_point):
    """Return one full day of samples from data for the given date.

    If the requested date is missing or incomplete, walk back one day at
    a time (at most 10 steps) until a complete day is found.

    Parameters
    ----------
    data : pd.Series
        Load series with a DatetimeIndex.
    date : datetime.datetime
        Day to extract.
    day_point : int
        Expected number of samples per day.

    Returns
    -------
    np.ndarray of length day_point, or the int 0 when no complete day is
    found within 10 days (legacy sentinel kept for caller compatibility).
    """
    def _day_slice(d):
        # Partial-string indexing on the DatetimeIndex.  Fix: a completely
        # absent date raises KeyError in pandas, which previously crashed
        # the walk-back loop; treat it as an empty day instead.
        try:
            return data[d.strftime("%Y-%m-%d")]
        except KeyError:
            return data[0:0]
    res = _day_slice(date)
    count = 0
    while len(res) != day_point:
        date -= datetime.timedelta(days=1)
        res = _day_slice(date)
        count += 1
        if count > 10:
            return 0
    return np.array(res)
def baseline(P_total, device_index, year, month, day, day_point):
    """Compute the consumption baseline of a device for a given date.

    The baseline is the mean of the load on the 1st, 2nd, 3rd and 7th day
    before the requested date.

    Returns
    -------
    (baseline, actual)
        The averaged baseline array and the actual load of the requested day.

    Raises
    ------
    Exception
        When the requested date is less than 7 days after the start of the
        data, so the 7-day lookback would fall outside the series.
    """
    if device_index == -1:  # whole-user analysis
        series = P_total.apply(lambda x: x.sum(), axis=1)
    else:
        series = P_total.iloc[:, device_index]
    target = datetime.datetime(year, month, day)
    if target - datetime.timedelta(days=7) < P_total.index[0]:
        raise Exception("Error: the gap between selected date with the start date should > 7 days")
    # Reference days: 1, 2, 3 and 7 days before the target date.
    offsets = (1, 2, 3, 7)
    samples = [getData(series, target - datetime.timedelta(days=off), day_point) for off in offsets]
    res = sum(samples) / len(samples)  # the device's baseline for this date
    actual = np.array(series[str(target.year) + '-' + str(target.month) + '-' + str(target.day)])
    return res, actual
def plotTempFeature(data, temp8760):
    """Align a load series with an hourly year-long temperature series.

    Parameters
    ----------
    data : pd.Series
        Load series with a DatetimeIndex.  NOTE: the index is reformatted
        in place to "%Y-%m-%d %H" strings — a side effect visible to the
        caller.
    temp8760 : array-like
        Hourly temperatures for one year (365 * 24 = 8760 values),
        starting Jan 1st at 00:00.

    Returns
    -------
    (load_200, temp_200, temp, data_unique)
        First 200 hourly load points, the matching 200 temperatures, the
        full aligned temperature slice, and the de-duplicated hourly load
        series.
    """
    data.index = data.index.strftime("%Y-%m-%d %H")
    # One sample per hour: keep the first row of each hour.
    data_unique = data[~data.index.duplicated()]
    begin = data_unique.index[0]
    dd_begin = datetime.datetime.strptime(begin, "%Y-%m-%d %H")
    # Fix: convert the start time to an *hour* offset into temp8760.
    # The original used (day_of_year - 1) + hour — a day count plus an
    # hour count — which misaligned temperature and load.
    index_begin = (dd_begin.timetuple().tm_yday - 1) * 24 + dd_begin.timetuple().tm_hour
    temp = temp8760[index_begin:index_begin + data_unique.shape[0]].squeeze()
    return data_unique[:200], temp[:200], temp, data_unique
# @click.command()
# @click.option('--name',default='三相总有功功率')
def oldMain(factory, line, device, measurePoint='三相总有功功率'):
    """Run the legacy analysis pipeline for one device.

    Loads the power data via Tool.getP_total, finds the most correlated
    devices, then trains a forecast model and predicts the next day.

    Parameters
    ----------
    factory, line, device : str
        Identifiers passed through to Tool.getP_total.
    measurePoint : str, optional
        Measured quantity to analyze (default: total three-phase active power).

    Returns
    -------
    (list, list)
        True and predicted load values from train_forecast.
    """
    day_point = 480  # samples per day (3-minute resolution)
    P_total, device_index = Tool.getP_total(factory, line, device, measurePoint)
    # Step 1: lagged-Pearson spatio-temporal correlation analysis.
    print("—————————————————一、时空相关性分析(图1)—————————————————————")
    # Fix: correlation() takes a day_point argument; the original call
    # omitted it and raised TypeError.
    corr_device = correlation(P_total, device_index, 3, day_point)
    print('corr_device:', corr_device)
    # Step 2: load modelling and forecasting.
    print("—————————————————二、用户负荷建模与预测(图2)—————————————————————")
    # Fix: train_forecast() also takes day_point, and it already returns
    # plain lists — the original a.tolist()/b.tolist() raised AttributeError.
    a, b = train_forecast(P_total, corr_device, device_index, day_point)
    return a, b
# Step 3 (disabled): multi-time-scale usage-pattern clustering.
# print("—————————————————三、多时间尺度用能模式挖掘(图3图4)—————————————————————")
# cluster(np.array(P_total.iloc[:, device_index]))
# if __name__=='__main__':
#     oldMain('常州天和印染有限公司2','','低压总出')