async def send_req(order, pos, data):
    """Send an ePOS print job to ``pos`` and record the outcome.

    The blocking ``requests.post`` call runs in the event loop's default
    executor so the loop is not stalled while waiting for the printer.

    Args:
        order: Order being printed; attached to a new ``PrintFailed`` row
            when the printer does not report the job as done.
        pos: Printer record. ``pos.ip`` is the target host and ``pos.error``
            receives the resulting error text ("" on success).
        data: Request body as ``str`` (encoded before sending).

    Returns:
        None. Any exception (network failure, malformed XML, missing or
        non-numeric ``status`` attribute, ...) is stored as text in
        ``pos.error`` rather than propagated.
    """
    url = ('http://' + pos.ip +
           '/cgi-bin/epos/service.cgi?devid=local_printer&timeout=30000')
    headers = {
        'Content-Type': 'text/xml; charset=utf-8',
        'If-Modified-Since': 'Thu, 01 Jan 1970 00:00:00 GMT',
        'SOAPAction': '""'
    }
    post = partial(requests.post, url, data=data.encode(), headers=headers)
    loop = asyncio.get_event_loop()
    try:
        res = await loop.run_in_executor(None, post)
        # Compare by value: `is 200` / `is not 'false'` test object identity,
        # which made the `success` check always true.
        if res.status_code == 200:
            response = ET.fromstring(res.content)[0][0]
            status = response.get('status')
            status_bits = int(status)
            # Bit 0x2 of the status word is presumably the "print complete"
            # flag -- confirm against the Epson ePOS-Print XML reference.
            failed = (response.get('success') == 'false'
                      or not status_bits & 2)
            if failed:
                pos.error = pos_error(status)
                fail = PrintFailed()
                fail.order = order
                fail.pos = pos
                db.session.add(fail)
            else:
                pos.error = ""
        db.session.commit()
    except Exception as e:  # OSError is already a subclass of Exception
        pos.error = str(e)
        db.session.commit()
async def send_req(pos, data=None, order=None, checkout=None):
    """Send an ePOS request to ``pos`` and update bookkeeping from the reply.

    The blocking ``requests.post`` call runs in the event loop's default
    executor.

    Args:
        pos: Printer record. ``pos.ip`` is the target host and ``pos.error``
            receives the resulting error text ("" on success).
        data: Request body as ``str``. When empty, a document containing an
            empty ``<epos-print>`` element is sent (presumably a status
            probe -- confirm with callers).
        order: Optional order; on success its ``PrintFailed`` row for this
            printer, if one exists, is deleted.
        checkout: Optional checkout; on success ``checkout.printed`` is set
            to ``True``.

    Returns:
        None. A failure of the HTTP request itself is stored in
        ``pos.error``; errors while parsing the response propagate.
    """
    url = ('http://' + pos.ip +
           '/cgi-bin/epos/service.cgi?devid=local_printer&timeout=30000')
    headers = {
        'Content-Type': 'text/xml; charset=utf-8',
        'If-Modified-Since': 'Thu, 01 Jan 1970 00:00:00 GMT',
        'SOAPAction': '""'
    }
    if not data:
        data = (
            '<s:Envelope xmlns:s="http://schemas.xmlsoap.org/soap/envelope/"> '
            '<s:Body> '
            '<epos-print xmlns="http://www.epson-pos.com/schemas/2011/03/epos-print"></epos-print> '
            '</s:Body> '
            '</s:Envelope>'
        )
    post = partial(requests.post, url, data=data.encode(), headers=headers)
    loop = asyncio.get_event_loop()
    try:
        res = await loop.run_in_executor(None, post)
    except Exception as e:  # OSError is already a subclass of Exception
        pos.error = str(e)
        db.session.commit()
        # There is no response to inspect; previously execution fell through
        # and crashed on the unbound `res`.
        return

    # Compare by value: `is 200` / `is not 'false'` test object identity,
    # which made the `success` check always true.
    if res.status_code == 200:
        response = ET.fromstring(res.content)[0][0]
        status = response.get('status')
        status_bits = int(status)
        # Bit 0x2 of the status word is presumably the "print complete"
        # flag -- confirm against the Epson ePOS-Print XML reference.
        if response.get('success') == 'false' or not status_bits & 2:
            pos.error = pos_error(status)
        else:
            if order:
                print_failed = PrintFailed.query.filter_by(
                    pos=pos, order=order).first()
                # `.first()` returns None when no failure was recorded.
                if print_failed is not None:
                    db.session.delete(print_failed)
            if checkout:
                checkout.printed = True
            pos.error = ""
    # One commit covers every change made above.
    db.session.commit()
def print_bill(pos, checkout, checkout_info, check_price):
    """Print the bill for ``checkout`` on ``pos`` and persist printer status.

    Args:
        pos: Printer record. ``pos.ip`` is the target host and ``pos.error``
            receives the resulting error text ("" on success).
        checkout: Checkout whose ``token``, ``checkout_time``, ``desk_name``
            and ``staff.name`` are rendered; ``checkout.printed`` is set to
            ``False`` when the printer reports a failure.
        checkout_info: Passed through to ``print_bill_format``.
        check_price: Passed through to ``print_bill_format``.

    Returns:
        None. ``config.checkout_pos_working`` is updated and saved through
        ``save_printer_status``. Any exception is stored as text in
        ``pos.error`` rather than propagated.
    """
    data = print_bill_format(checkout.token, checkout.checkout_time,
                             checkout.desk_name, checkout.staff.name,
                             checkout_info, check_price)
    url = ('http://' + pos.ip +
           '/cgi-bin/epos/service.cgi?devid=local_printer&timeout=30000')
    headers = {
        'Content-Type': 'text/xml; charset=utf-8',
        'If-Modified-Since': 'Thu, 01 Jan 1970 00:00:00 GMT',
        'SOAPAction': '""'
    }
    try:
        res = requests.post(url, data=data.encode(), headers=headers)
        # Compare by value: `is 200` / `is not 'false'` test object identity,
        # which made the `success` check always true.
        if res.status_code == 200:
            response = ET.fromstring(res.content)[0][0]
            status = response.get('status')
            status_bits = int(status)
            # Bit 0x2 of the status word is presumably the "print complete"
            # flag -- confirm against the Epson ePOS-Print XML reference.
            if response.get('success') == 'false' or not status_bits & 2:
                config.checkout_pos_working = False
                checkout.printed = False
                pos.error = pos_error(status)
            else:
                config.checkout_pos_working = True
                pos.error = ""
            save_printer_status(
                dict(checkout_pos_working=config.checkout_pos_working,
                     order_pos_working=config.order_pos_working))
        db.session.commit()
    except Exception as e:  # OSError is already a subclass of Exception
        config.checkout_pos_working = False
        save_printer_status(
            dict(checkout_pos_working=config.checkout_pos_working,
                 order_pos_working=config.order_pos_working))
        pos.error = str(e)
        db.session.commit()
def main():
    """Fit one random-forest regressor per serving base station and plot it.

    For every distinct ``(Longitude_1, Latitude_1)`` pair the rows are split
    80/20 ten times with fixed seeds, a ``RandomForestRegressor`` is trained
    to predict the offset of the sample from that pair, and the positioning
    error is taken from ``utils.pos_error``.

    Returns:
        list: one ``[station, errors]`` entry per station, where ``errors``
        holds the ten per-split results of ``utils.pos_error``.
    """
    train_data = utils.gongcan_to_ll()

    # Drop the raw cell identifiers; they are not used as training features.
    id_columns = [prefix + str(i)
                  for i in range(1, 8)
                  for prefix in ('RNCID_', 'CellID_')]
    train_data.drop(id_columns, axis=1, inplace=True)
    # Fill missing signal-strength values with 0.
    train_data = train_data.fillna(0)

    # Targets: position relative to the first listed station, computed
    # column-wise instead of row by row.
    train_data['rel_Longitude'] = (
        train_data['Longitude'] - train_data['Longitude_1']).values
    train_data['rel_Latitude'] = (
        train_data['Latitude'] - train_data['Latitude_1']).values

    train_data.set_index(['Longitude_1', 'Latitude_1'], inplace=True,
                         drop=False)
    train_data.sort_index(inplace=True)
    stations = list(set(train_data.index.tolist()))

    # The targets must not appear among the features: leaving the rel_*
    # columns in X lets the model read the answer directly.
    non_feature_columns = ['IMSI', 'MRTime', 'Longitude', 'Latitude',
                           'Num_connected', 'rel_Longitude', 'rel_Latitude']
    label_columns = ['rel_Longitude', 'rel_Latitude', 'Longitude',
                     'Latitude', 'Longitude_1', 'Latitude_1']
    # Fixed seeds so every run uses the same train/test splits.
    random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

    errors_all = []
    amount = []
    for station in stations:
        ms_data = train_data.loc[station]
        # `.values` replaces `.as_matrix()`, which pandas 1.0 removed.
        X = ms_data.drop(non_feature_columns, axis=1).values
        y = ms_data[label_columns].values

        print("MS {}".format(station))
        errors = []
        for seed in random_states:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=seed)
            regr = RandomForestRegressor(max_depth=20, random_state=0)
            # Train on the two relative-offset columns only; the absolute
            # coordinates stay in y_test for utils.pos_error.
            y_pred = regr.fit(
                X_train, np.delete(y_train, [2, 3, 4, 5], axis=1)
            ).predict(X_test)
            errors.append(utils.pos_error(y_test, y_pred))

        median_error = np.percentile(np.array(errors).mean(axis=0), 50)

        # Scatter the ground-truth positions for this station.
        plt.title("Median error: %.3f" % median_error +
                  " Data amount: {}".format(X.shape[0]))
        ax = plt.gca()
        ax.get_xaxis().get_major_formatter().set_useOffset(False)
        plt.scatter(y[:, 2], y[:, 3])
        plt.xlim([lb_Longitude, rt_Longitude])
        plt.ylim([lb_Latitude, rt_Latitude])
        plt.show()

        print("Data amount: {}".format(X.shape[0]))
        print("Median error: {}".format(median_error))
        errors_all.append([station, errors])
        amount.append([X.shape[0], median_error])
        print("****************************")

    utils.cdf_figure(errors_all)
    utils.mean_figure(errors_all)

    # Print (sample count, median error) per station, sorted by sample count.
    amount = np.array(amount)
    amount = amount[amount[:, 0].argsort()]
    for a in amount:
        print(a)
    return errors_all
def main():
    """Re-train the worst stations with data pooled from their KMeans cluster.

    Steps:
      1. Cluster the distinct ``(Longitude_1, Latitude_1)`` pairs with KMeans
         and plot the clustering.
      2. Train a per-station ``RandomForestRegressor`` (ten fixed 80/20
         splits) and record the median positioning error.
      3. For the 20% of stations with the largest median error, re-train on
         that station's data plus the data of every station in the same
         cluster, evaluate on the station's own test split, and plot the
         before/after comparison through ``utils``.

    Returns:
        None.
    """
    train_data = utils.gongcan_to_ll()

    # Drop the raw cell identifiers; they are not used as training features.
    id_columns = [prefix + str(i)
                  for i in range(1, 8)
                  for prefix in ('RNCID_', 'CellID_')]
    train_data.drop(id_columns, axis=1, inplace=True)
    # Fill missing signal-strength values with 0.
    train_data = train_data.fillna(0)

    # Targets: position relative to the first listed station.
    train_data['rel_Longitude'] = (
        train_data['Longitude'] - train_data['Longitude_1']).values
    train_data['rel_Latitude'] = (
        train_data['Latitude'] - train_data['Latitude_1']).values

    train_data.set_index(['Longitude_1', 'Latitude_1'], inplace=True,
                         drop=False)
    train_data.sort_index(inplace=True)
    stations = list(set(train_data.index.tolist()))

    # Group the stations by coordinate with KMeans.
    clusters = KMeans(n_init=1, random_state=0).fit_predict(stations)

    # Show the clustering result.
    plt.title("Kmeans Result")
    plt.scatter([s[0] for s in stations], [s[1] for s in stations],
                c=clusters)
    ax = plt.gca()
    ax.get_xaxis().get_major_formatter().set_useOffset(False)
    plt.show()

    labelled_stations = list(zip(stations, clusters))

    # The targets must not appear among the features: leaving the rel_*
    # columns in X lets the model read the answer directly.
    non_feature_columns = ['IMSI', 'MRTime', 'Longitude', 'Latitude',
                           'Num_connected', 'rel_Longitude', 'rel_Latitude']
    label_columns = ['rel_Longitude', 'rel_Latitude', 'Longitude',
                     'Latitude', 'Longitude_1', 'Latitude_1']
    # Fixed seeds so every experiment uses the same splits.
    random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

    errors_all = []
    median_rows = []
    for labelled in labelled_stations:
        station, cluster = labelled
        ms_data = train_data.loc[station]
        # `.values` replaces `.as_matrix()`, which pandas 1.0 removed.
        X = ms_data.drop(non_feature_columns, axis=1).values
        y = ms_data[label_columns].values

        print("MS {}".format(labelled))
        errors = []
        for seed in random_states:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=seed)
            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(
                X_train, np.delete(y_train, [2, 3, 4, 5], axis=1)
            ).predict(X_test)
            errors.append(utils.pos_error(y_test, y_pred))

        median_error = np.percentile(np.array(errors).mean(axis=0), 50)
        print("Median error: {}".format(median_error))
        median_rows.append([station, median_error, cluster])
        errors_all.append([labelled, errors])
        print("****************************")

    median_errors = DataFrame(median_rows,
                              columns=['id', 'median_error', 'cluster'])
    median_errors.set_index(['median_error'], inplace=True, drop=False)
    median_errors.sort_index(inplace=True)

    MS_number = median_errors.shape[0]
    # Rows are sorted by ascending error, so the tail holds the worst 20%.
    topk_worst = median_errors.iloc[int(MS_number * 0.8):][
        ['id', 'cluster']].values

    # Errors of the worst stations before correction.
    worst_ids = topk_worst[:, 0].tolist()
    old_errors = [[entry[0], entry[1]] for entry in errors_all
                  if entry[0][0] in worst_ids]

    print("\n")
    print("Start correction")
    print("\n")

    new_errors = []  # errors of the worst stations after correction
    for worst in topk_worst:
        similars = median_errors[
            median_errors['cluster'] == worst[1]].values.tolist()

        worst_data = train_data.loc[worst[0]]
        X_worst = worst_data.drop(non_feature_columns, axis=1).values
        y_worst = worst_data[label_columns].values

        # NOTE(review): `similars` includes the worst station itself, so its
        # rows appear twice in the pooled set; the pooled training split is
        # also drawn independently of the test split, so test rows can end
        # up in training. Kept as in the original experiment -- confirm
        # whether this is intended.
        ms_data = pd.concat(
            [worst_data] + [train_data.loc[s[0]] for s in similars])
        X = ms_data.drop(non_feature_columns, axis=1).values
        y = ms_data[label_columns].values

        print("MS {}".format(worst))
        errors = []
        for seed in random_states:
            # Train on the pooled data, test on the station's own data.
            X_train, _, y_train, _ = train_test_split(
                X, y, test_size=0.2, random_state=seed)
            _, X_test, _, y_test = train_test_split(
                X_worst, y_worst, test_size=0.2, random_state=seed)
            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(
                X_train, np.delete(y_train, [2, 3, 4, 5], axis=1)
            ).predict(X_test)
            errors.append(utils.pos_error(y_test, y_pred))

        median_error = np.percentile(np.array(errors).mean(axis=0), 50)

        # Plot the pooled training positions against the station's own.
        plt.title("Median error: %.3f" % median_error)
        ax = plt.gca()
        ax.get_xaxis().get_major_formatter().set_useOffset(False)
        plt.scatter(y[:, 2], y[:, 3], label='new data')
        plt.scatter(y_worst[:, 2], y_worst[:, 3], label='old data')
        plt.xlim([lb_Longitude, rt_Longitude])
        plt.ylim([lb_Latitude, rt_Latitude])
        plt.legend()
        plt.show()

        print("Median error: {}".format(median_error))
        new_errors.append([worst, errors])
        print("****************************")

    utils.cdf_figure(old_errors, new_errors)
    utils.mean_figure(old_errors, new_errors)
def _evaluate_classifier(name, make_clf, X, y, random_states,
                         show_progress=False):
    """Evaluate one classifier over the fixed splits and print its summary.

    Args:
        name: Label printed above the summary.
        make_clf: Zero-argument callable returning a fresh estimator.
        X: Feature matrix; column 0 is removed before fitting (presumably
            the IMSI column -- confirm against ``utils.ll_to_grid``).
        y: Label matrix; column 0 is the class target and the whole matrix
            is handed to ``utils.pos_error``.
        random_states: Seeds for ``train_test_split``, one split per seed.
        show_progress: When true, print the split index before each split.

    Returns:
        tuple: ``(errors, overall_precision, top10_precision, top10_recall,
        top10_f)`` where ``errors`` is the list of per-split
        ``utils.pos_error`` results and the rest are means over the splits.
    """
    start = datetime.datetime.now()
    errors = []
    overall_pres = []
    top10_pres = []
    top10_recalls = []
    top10_fs = []
    for i, seed in enumerate(random_states):
        if show_progress:
            print(i)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=seed)
        clf = make_clf()
        y_pred = clf.fit(
            np.delete(X_train, 0, axis=1), y_train[:, 0]
        ).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = (
            utils.precision_recall(y_test[:, 0], y_pred))
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))

    overall_mean = np.mean(np.array(overall_pres))
    top10_pre_mean = np.array(top10_pres).mean(axis=0).mean()
    top10_recall_mean = np.array(top10_recalls).mean(axis=0).mean()
    top10_f_mean = np.array(top10_fs).mean(axis=0).mean()

    print(name)
    print("Overall precision: %.3f" % overall_mean)
    print("Top10 precision: %.3f" % top10_pre_mean)
    print("Top10 recall: %.3f" % top10_recall_mean)
    print("Top10 f-measurement: %.3f" % top10_f_mean)
    print("Median error: {}".format(
        np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    return (errors, overall_mean, top10_pre_mean, top10_recall_mean,
            top10_f_mean)


def main():
    """Compare several classifiers on grid-cell prediction and plot results.

    Each classifier is trained on the same ten fixed 80/20 splits to predict
    ``grid_num``; precision/recall come from ``utils.precision_recall`` and
    positioning error from ``utils.pos_error``.

    Returns:
        None.
    """
    ll_data_2g = utils.gongcan_to_ll()
    train_data = utils.ll_to_grid(ll_data_2g)

    # Drop the raw cell identifiers; they are not used as training features.
    id_columns = [prefix + str(i)
                  for i in range(1, 8)
                  for prefix in ('RNCID_', 'CellID_')]
    train_data.drop(id_columns, axis=1, inplace=True)
    # Fill missing signal-strength values with 0.
    train_data = train_data.fillna(0)

    # Features and labels. `.values` replaces `.as_matrix()`, which
    # pandas 1.0 removed.
    X = train_data.drop(
        ['MRTime', 'Longitude', 'Latitude', 'Num_connected', 'grid_num'],
        axis=1).values
    y = train_data[['grid_num', 'Longitude', 'Latitude']].values

    # Fixed seeds so every classifier sees the same splits.
    random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

    # (printed name, estimator factory, print split index while running)
    classifiers = [
        ("Gaussian", GaussianNB, False),
        ("KNeighbors", KNeighborsClassifier, False),
        ("DecisionTree", DecisionTreeClassifier, False),
        ("RandomForest",
         lambda: RandomForestClassifier(max_depth=20, random_state=0),
         False),
        # NOTE(review): `base_estimator` and `algorithm='SAMME.R'` are kept
        # as written; recent scikit-learn releases appear to have renamed or
        # dropped them -- confirm against the installed version.
        ("AdaBoost",
         lambda: AdaBoostClassifier(
             base_estimator=DecisionTreeClassifier(max_depth=20),
             learning_rate=0.01,
             n_estimators=30,
             algorithm='SAMME.R'),
         False),
        ("Bagging", lambda: BaggingClassifier(n_estimators=20), False),
        ("GradientBoosting",
         lambda: GradientBoostingClassifier(n_estimators=60,
                                            learning_rate=0.01),
         True),
    ]

    errors_all = []
    top10_pres_all = []
    top10_recalls_all = []
    top10_fs_all = []
    overall_pres_all = []
    for name, make_clf, show_progress in classifiers:
        errors, overall, top10_pre, top10_recall, top10_f = (
            _evaluate_classifier(name, make_clf, X, y, random_states,
                                 show_progress=show_progress))
        errors_all.append(errors)
        top10_recalls_all.append(top10_recall)
        top10_pres_all.append(top10_pre)
        overall_pres_all.append(overall)
        top10_fs_all.append(top10_f)
        print("****************************")

    utils.cdf_figure(errors_all)
    utils.figure(overall_pres_all, top10_pres_all, top10_recalls_all,
                 top10_fs_all)
def main(): train_data = utils.gongcan_to_ll() # 删除原有的ID,不作为训练特征 for i in range(1, 8): train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True) train_data.drop(['CellID_' + str(i)], axis=1, inplace=True) # 将空余的信号强度,用0补填补 train_data = train_data.fillna(0) rel_lon = [] rel_lat = [] # print(train_data) for index, row in train_data.iterrows(): rel_lon.append(row['Longitude'] - row['Longitude_1']) rel_lat.append(row['Latitude'] - row['Latitude_1']) train_data['rel_Longitude'] = np.array(rel_lon) train_data['rel_Latitude'] = np.array(rel_lat) # features和labels train_data.set_index(['Longitude_1', 'Latitude_1'], inplace=True, drop=False) train_data.sort_index(inplace=True) ids = list(set(train_data.index.tolist())) # print(ids) # 通过设置每一次的随机数种子,保证不同分类器每一次的数据集是一样的,同时每一次的实验的数据集也是一样的,从而提升结果的可信度 random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20] errors_all = [] median_errors = [] for id in ids: MS_datas = train_data.loc[id] X = MS_datas.drop( ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'], axis=1, inplace=False).as_matrix() y = MS_datas[[ 'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude', 'Longitude_1', 'Latitude_1' ]].as_matrix() # 随机森林 print("MS {}".format(id)) errors = [] for i in range(10): # 切分训练集和验证集 X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=random_states[i]) regr = RandomForestRegressor(max_depth=20, random_state=0) y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5], axis=1)).predict(X_test) error = utils.pos_error(y_test, y_pred) errors.append(error) # overall_pre, top10_pre, top10_recall = utils.precision_recall(y_test[:, 0], y_pred) # errors.append(utils.pos_error(y_test, y_pred)) median_error = np.percentile(np.array(errors).mean(axis=0), 50) print("Median error: {}".format(median_error)) median_errors.append([id, median_error]) errors_all.append([id, errors]) print("****************************") median_errors = DataFrame(median_errors, columns=['id', 'median_error']) 
median_errors.set_index(['median_error'], inplace=True, drop=False) median_errors.sort_index(inplace=True) # print(median_errors) MS_number = median_errors.shape[0] topk_best = median_errors.iloc[:int(MS_number * 0.2)]['id'].as_matrix().tolist() topk_worst = median_errors.iloc[int(MS_number * 0.8):]['id'].as_matrix().tolist() old_errors = [] # 用于存储没有修正前的 top k- 的所有 error for error in errors_all: if error[0] in topk_worst: old_errors.append([error[0], error[1]]) # 获取top k+的数据 best_data = DataFrame() for best in topk_best: best_data = pd.concat([best_data, train_data.loc[best]], axis=0) # print(best_data) # best_data = best_data.sample(frac=0.7) # print(best_data) print("\n") print("Start correction") print("\n") new_errors = [] # 用于存储修正后的 top k- 的所有 error for worst in topk_worst: MS_datas = pd.concat([train_data.loc[worst], best_data]) # MS_datas = best_data X = MS_datas.drop( ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'], axis=1, inplace=False).as_matrix() y = MS_datas[[ 'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude', 'Longitude_1', 'Latitude_1' ]].as_matrix() worst_data = train_data.loc[worst] X_worst = worst_data.drop( ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'], axis=1, inplace=False).as_matrix() y_worst = worst_data[[ 'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude', 'Longitude_1', 'Latitude_1' ]].as_matrix() # 随机森林 print("MS {}".format(worst)) errors = [] for i in range(10): # 切分训练集和验证集 X_train, _, y_train, _ = train_test_split( X, y, test_size=0.2, random_state=random_states[i]) _, X_test, _, y_test = train_test_split( X_worst, y_worst, test_size=0.2, random_state=random_states[i]) regr = RandomForestRegressor(max_depth=20, random_state=0) y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5], axis=1)).predict(X_test) error = utils.pos_error(y_test, y_pred) errors.append(error) # 做出每个基站用于加入了新的数据集后的所有训练数据点位置和原始该 MS 基站的数据点位置 plt.title("Median error: %.3f" % np.percentile(np.array(errors).mean(axis=0), 50)) 
ax = plt.gca() ax.get_xaxis().get_major_formatter().set_useOffset(False) plt.scatter(y[:, 2], y[:, 3], label='new data') plt.scatter(y_worst[:, 2], y_worst[:, 3], label='old data') plt.xlim([lb_Longitude, rt_Longitude]) plt.ylim([lb_Latitude, rt_Latitude]) plt.legend() plt.show() new_errors.append([worst, errors]) median_error = np.percentile(np.array(errors).mean(axis=0), 50) print("Median error: {}".format(median_error)) # median_errors.append([worst, median_error]) # errors_all.append([id, errors]) print("****************************") utils.cdf_figure(old_errors, new_errors) utils.mean_figure(old_errors, new_errors)