Example #1
async def send_req(order, pos, data):
    ip = pos.ip
    url = 'http://' + ip + '/cgi-bin/epos/service.cgi?devid=local_printer&timeout=30000'
    headers = {
        'Content-Type': 'text/xml; charset=utf-8',
        'If-Modified-Since': 'Thu, 01 Jan 1970 00:00:00 GMT',
        'SOAPAction': '""'
    }
    f = partial(requests.post, url, data=data.encode(), headers=headers)
    loop = asyncio.get_event_loop()
    try:
        res = await loop.run_in_executor(None, f)
        if res.status_code == 200:
            tree = ET.fromstring(res.content)
            success = tree[0][0].get('success')
            if success != 'false':
                status = tree[0][0].get('status')
                if not int(status) & 2:
                    pos.error = pos_error(status)
                    fail = PrintFailed()
                    fail.order = order
                    fail.pos = pos
                    db.session.add(fail)
                else:
                    pos.error = ""
            db.session.commit()
    except Exception as e:
        pos.error = str(e)
        db.session.commit()
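The Pos/PrintFailed models, the db session, and pos_error() belong to the surrounding Flask/SQLAlchemy project and are not shown in these snippets. As a rough usage sketch (the call pattern is assumed, not taken from the project), send_req can be fanned out to several printers from one event loop:

import asyncio

async def notify_printers(order, pos_list, data):
    # One SOAP request per printer, run concurrently; the blocking
    # requests.post call inside send_req already runs in the default executor.
    await asyncio.gather(*(send_req(order, pos, data) for pos in pos_list))

# e.g. asyncio.get_event_loop().run_until_complete(notify_printers(order, pos_list, xml_data))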
Example #2
async def send_req(pos, data=None, order=None, checkout=None):
    ip = pos.ip
    url = 'http://' + ip + '/cgi-bin/epos/service.cgi?devid=local_printer&timeout=30000'
    headers = {
        'Content-Type': 'text/xml; charset=utf-8',
        'If-Modified-Since': 'Thu, 01 Jan 1970 00:00:00 GMT',
        'SOAPAction': '""'
    }
    if not data:
        data = '<s:Envelope xmlns:s="http://schemas.xmlsoap.org/soap/envelope/">\
<s:Body>\
<epos-print xmlns="http://www.epson-pos.com/schemas/2011/03/epos-print"></epos-print>\
</s:Body>\
</s:Envelope>'

    f = partial(requests.post, url, data=data.encode(), headers=headers)
    loop = asyncio.get_event_loop()
    try:
        res = await loop.run_in_executor(None, f)
    except Exception as e:
        pos.error = str(e)
        db.session.commit()
        return

    if res.status_code == 200:
        tree = ET.fromstring(res.content)
        success = tree[0][0].get('success')
        if success != 'false':
            status = tree[0][0].get('status')
            if not int(status) & 2:
                pos.error = pos_error(status)
            else:
                if order:
                    print_failed = PrintFailed.query.filter_by(
                        pos=pos, order=order).first()
                    db.session.delete(print_failed)
                    db.session.commit()
                if checkout:
                    checkout.printed = True
                    db.session.commit()
                pos.error = ""
        db.session.commit()
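Both variants read the result as tree[0][0], which depends on the Envelope/Body/response element order. A namespace-aware parse of the same document is sketched below; it assumes the <response> element sits in the same epos-print namespace already used for the request envelope.

import xml.etree.ElementTree as ET

EPOS_NS = {
    's': 'http://schemas.xmlsoap.org/soap/envelope/',
    'epos': 'http://www.epson-pos.com/schemas/2011/03/epos-print',
}

def parse_epos_response(content):
    # Returns (success, status) from the <response> element in the SOAP body.
    response = ET.fromstring(content).find('./s:Body/epos:response', EPOS_NS)
    return response.get('success'), int(response.get('status'))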
Example #3
def print_bill(pos, checkout, checkout_info, check_price):
    uuid = checkout.token
    time = checkout.checkout_time
    d_name = checkout.desk_name
    s_name = checkout.staff.name

    data = print_bill_format(uuid, time, d_name, s_name, checkout_info,
                             check_price)

    url = 'http://' + pos.ip + '/cgi-bin/epos/service.cgi?devid=local_printer&timeout=30000'
    headers = {
        'Content-Type': 'text/xml; charset=utf-8',
        'If-Modified-Since': 'Thu, 01 Jan 1970 00:00:00 GMT',
        'SOAPAction': '""'
    }
    try:
        res = requests.post(url, data=data.encode(), headers=headers)
        if res.status_code == 200:
            tree = ET.fromstring(res.content)
            success = tree[0][0].get('success')
            if success != 'false':
                status = tree[0][0].get('status')
                if not int(status) & 2:
                    config.checkout_pos_working = False
                    checkout.printed = False
                    pos.error = pos_error(status)
                else:
                    config.checkout_pos_working = True
                    pos.error = ""
                save_printer_status(
                    dict(checkout_pos_working=config.checkout_pos_working,
                         order_pos_working=config.order_pos_working))
            db.session.commit()
    except Exception as e:
        config.checkout_pos_working = False
        save_printer_status(
            dict(checkout_pos_working=config.checkout_pos_working,
                 order_pos_working=config.order_pos_working))
        pos.error = str(e)
        db.session.commit()
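pos_error() and save_printer_status() are project helpers that these snippets only call. As a hypothetical stand-in consistent with how they are used here (turn a failed status value into a message stored on the pos record), one might write something like the stub below; the real helper presumably decodes the individual ePOS status bits.

def pos_error_stub(status):
    # Hypothetical placeholder: the project's version likely maps specific
    # status bits (offline, cover open, paper end, ...) to concrete messages.
    return "Printer reported error status {}".format(status)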
Example #4
def main():
    train_data = utils.gongcan_to_ll()
    # Drop the original IDs; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)

    # Fill the missing signal strengths with 0
    train_data = train_data.fillna(0)
    rel_lon = []
    rel_lat = []
    for index, row in train_data.iterrows():
        rel_lon.append(row['Longitude'] - row['Longitude_1'])
        rel_lat.append(row['Latitude'] - row['Latitude_1'])

    train_data['rel_Longitude'] = np.array(rel_lon)
    train_data['rel_Latitude'] = np.array(rel_lat)

    # Features and labels
    train_data.set_index(['Longitude_1', 'Latitude_1'],
                         inplace=True,
                         drop=False)
    train_data.sort_index(inplace=True)
    ids = list(set(train_data.index.tolist()))

    errors_all = []
    amount = []
    for id in ids:
        MS_datas = train_data.loc[id]
        X = MS_datas.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1,
            inplace=False).as_matrix()
        y = MS_datas[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].as_matrix()

        # Fix a random seed for each run so every classifier sees the same data splits
        random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

        # Random forest
        print("MS {}".format(id))
        errors = []
        for i in range(10):

            # Split into training and validation sets
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])

            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5],
                                                 axis=1)).predict(X_test)

            error = utils.pos_error(y_test, y_pred)
            errors.append(error)

            # overall_pre, top10_pre, top10_recall = utils.precision_recall(y_test[:, 0], y_pred)
            # errors.append(utils.pos_error(y_test, y_pred))

        # Plot the points of each station's dataset
        plt.title("Median error: %.3f" %
                  np.percentile(np.array(errors).mean(axis=0), 50) +
                  " Data amount: {}".format(X.shape[0]))
        ax = plt.gca()
        ax.get_xaxis().get_major_formatter().set_useOffset(False)
        plt.scatter(y[:, 2], y[:, 3])
        plt.xlim([lb_Longitude, rt_Longitude])
        plt.ylim([lb_Latitude, rt_Latitude])
        plt.show()

        # print("Different data amount: {}".format(len(set(X[:,0]))))
        print("Data amount: {}".format(X.shape[0]))
        print("Median error: {}".format(
            np.percentile(np.array(errors).mean(axis=0), 50)))
        errors_all.append([id, errors])
        amount.append(
            [X.shape[0],
             np.percentile(np.array(errors).mean(axis=0), 50)])
        # amount.append([len(set(X[:, 0])), np.percentile(np.array(errors).mean(axis=0), 50)])

        print("****************************")
    utils.cdf_figure(errors_all)
    utils.mean_figure(errors_all)
    # utils.cdf_figure_overall(errors_all)

    # Print each base station's median error and its total number of samples
    amount = np.array(amount)
    amount = amount[amount[:, 0].argsort()]
    for a in amount:
        print(a)

    return errors_all
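utils.pos_error() comes from the project's utils module and is not shown here. Given the label layout used above ([rel_Longitude, rel_Latitude, Longitude, Latitude, Longitude_1, Latitude_1]) and the fact that the regressor predicts only the two relative columns, the error is presumably the ground distance between the reconstructed and true positions. The helper below is an illustrative sketch of that computation, not the project's implementation:

import numpy as np

def pos_error_sketch(y_test, y_pred):
    # Absolute prediction = base-station coordinates + predicted offsets.
    pred_lon = y_test[:, 4] + y_pred[:, 0]
    pred_lat = y_test[:, 5] + y_pred[:, 1]
    true_lon, true_lat = y_test[:, 2], y_test[:, 3]

    # Haversine distance (meters) between predicted and true positions.
    R = 6371000.0
    lon1, lat1, lon2, lat2 = map(np.radians, (pred_lon, pred_lat, true_lon, true_lat))
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 2 * R * np.arcsin(np.sqrt(a))  # one error per test sample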
Example #5
File: e.py  Project: FoxerLee/SSE-DAM2018
def main():
    train_data = utils.gongcan_to_ll()
    # Drop the original IDs; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)

    # Fill the missing signal strengths with 0
    train_data = train_data.fillna(0)
    rel_lon = []
    rel_lat = []
    # print(train_data)
    for index, row in train_data.iterrows():
        rel_lon.append(row['Longitude'] - row['Longitude_1'])
        rel_lat.append(row['Latitude'] - row['Latitude_1'])

    train_data['rel_Longitude'] = np.array(rel_lon)
    train_data['rel_Latitude'] = np.array(rel_lat)

    # Features and labels
    train_data.set_index(['Longitude_1', 'Latitude_1'],
                         inplace=True,
                         drop=False)
    train_data.sort_index(inplace=True)
    ids = list(set(train_data.index.tolist()))

    # Cluster the base stations by distance with KMeans
    y_pred = KMeans(n_init=1, random_state=0).fit_predict(ids)
    # print(y_pred)

    # Plot the clustering result
    plt.title("Kmeans Result")
    x = [id[0] for id in ids]
    y = [id[1] for id in ids]
    plt.scatter(x, y, c=y_pred)
    ax = plt.gca()
    ax.get_xaxis().get_major_formatter().set_useOffset(False)
    # plt.xlim([lb_Longitude, rt_Longitude])
    # plt.ylim([lb_Latitude, rt_Latitude])
    plt.show()

    ids = [(id, cluster) for (id, cluster) in zip(ids, y_pred)]
    # print(ids)
    errors_all = []
    median_errors = []
    for id in ids:
        MS_datas = train_data.loc[id[0]]
        X = MS_datas.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1,
            inplace=False).as_matrix()
        y = MS_datas[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].as_matrix()

        # Fix a random seed for each run so every classifier sees the same data splits
        random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

        # Random forest
        print("MS {}".format(id))
        errors = []
        for i in range(10):
            # Split into training and validation sets
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])

            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5],
                                                 axis=1)).predict(X_test)
            error = utils.pos_error(y_test, y_pred)
            errors.append(error)

            # overall_pre, top10_pre, top10_recall = utils.precision_recall(y_test[:, 0], y_pred)
            # errors.append(utils.pos_error(y_test, y_pred))
        median_error = np.percentile(np.array(errors).mean(axis=0), 50)
        print("Median error: {}".format(median_error))
        median_errors.append([id[0], median_error, id[1]])
        errors_all.append([id, errors])
        print("****************************")
    median_errors = DataFrame(median_errors,
                              columns=['id', 'median_error', 'cluster'])
    median_errors.set_index(['median_error'], inplace=True, drop=False)
    median_errors.sort_index(inplace=True)

    MS_number = median_errors.shape[0]
    topk_worst = median_errors.iloc[int(MS_number * 0.8):][['id', 'cluster'
                                                            ]].as_matrix()

    old_errors = []  # stores all errors of the top k- (worst) stations before correction
    for error in errors_all:
        if error[0][0] in topk_worst[:, 0].tolist():
            old_errors.append([error[0], error[1]])

    print("\n")
    print("Start correction")
    print("\n")

    new_errors = []  # stores all errors of the top k- (worst) stations after correction
    for worst in topk_worst:
        similars = median_errors[median_errors['cluster'] ==
                                 worst[1]].as_matrix().tolist()

        MS_datas = worst_data = train_data.loc[worst[0]]
        X_worst = worst_data.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1,
            inplace=False).as_matrix()
        y_worst = worst_data[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].as_matrix()
        for similar in similars:
            MS_datas = pd.concat([MS_datas, train_data.loc[similar[0]]])

        # Random sampling
        # MS_datas = MS_datas.sample(frac=0.8)

        X = MS_datas.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1,
            inplace=False).as_matrix()
        y = MS_datas[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].as_matrix()

        # X = []
        # y = []
        #
        # # Filter: drop samples too far from the original dataset
        # for i, j in zip(X_, y_):
        #     error = utils.haversine(j[4], j[5], worst[0][0], worst[0][1])
        #     if error > 500:
        #         continue
        #     X.append(i)
        #     y.append(j)
        # X = np.array(X)
        # y = np.array(y)

        # Fix a random seed for each run so every classifier and every experiment sees the same data splits, which makes the results more credible
        random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

        # Random forest
        print("MS {}".format(worst))
        errors = []
        for i in range(10):
            # Split into training and validation sets
            X_train, _, y_train, _ = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])
            _, X_test, _, y_test = train_test_split(
                X_worst, y_worst, test_size=0.2, random_state=random_states[i])
            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5],
                                                 axis=1)).predict(X_test)
            error = utils.pos_error(y_test, y_pred)
            errors.append(error)

        # Plot all training points after adding the new data, together with the original points of this MS/base station
        plt.title("Median error: %.3f" %
                  np.percentile(np.array(errors).mean(axis=0), 50))
        ax = plt.gca()
        ax.get_xaxis().get_major_formatter().set_useOffset(False)
        plt.scatter(y[:, 2], y[:, 3], label='new data')
        plt.scatter(y_worst[:, 2], y_worst[:, 3], label='old data')

        plt.xlim([lb_Longitude, rt_Longitude])
        plt.ylim([lb_Latitude, rt_Latitude])
        plt.legend()
        plt.show()

        median_error = np.percentile(np.array(errors).mean(axis=0), 50)
        print("Median error: {}".format(median_error))
        new_errors.append([worst, errors])
        print("****************************")

    utils.cdf_figure(old_errors, new_errors)
    utils.mean_figure(old_errors, new_errors)
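utils.cdf_figure() and utils.mean_figure() are plotting helpers from the same project and are not reproduced here. A minimal sketch of the before/after CDF comparison they are used for, assuming each entry of old_errors/new_errors is a [station, errors] pair with errors a list of per-run error arrays:

import numpy as np
import matplotlib.pyplot as plt

def plot_error_cdf(old_errors, new_errors):
    # Flatten every run of every station into one error vector per condition.
    old = np.concatenate([np.concatenate(errs) for _, errs in old_errors])
    new = np.concatenate([np.concatenate(errs) for _, errs in new_errors])
    for values, label in ((old, 'before correction'), (new, 'after correction')):
        xs = np.sort(values)
        plt.plot(xs, np.arange(1, len(xs) + 1) / len(xs), label=label)
    plt.xlabel('positioning error (m)')
    plt.ylabel('CDF')
    plt.legend()
    plt.show()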
Example #6
File: a.py  Project: FoxerLee/SSE-DAM2018
def main():
    ll_data_2g = utils.gongcan_to_ll()
    train_data = utils.ll_to_grid(ll_data_2g)

    # print(train_data)
    # Drop the original IDs; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)
    # Fill the missing signal strengths with 0
    train_data = train_data.fillna(0)

    # Features and labels
    X = train_data.drop(
        ['MRTime', 'Longitude', 'Latitude', 'Num_connected', 'grid_num'],
        axis=1,
        inplace=False).as_matrix()
    y = train_data[['grid_num', 'Longitude', 'Latitude']].as_matrix()
    # Fix a random seed for each run so every classifier sees the same data splits
    random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

    errors_all = []
    top10_pres_all = []
    top10_recalls_all = []
    top10_fs_all = []
    overall_pres_all = []

    # Gaussian Naive Bayes classifier
    start = datetime.datetime.now()
    errors = []
    overall_pres = []
    top10_pres = []
    top10_recalls = []
    top10_fs = []
    for i in range(10):
        # Split into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_states[i])

        gnb = GaussianNB()
        y_pred = gnb.fit(np.delete(X_train, 0, axis=1),
                         y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
            y_test[:, 0], y_pred)
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))

    print("Gaussian")
    print("Overall precision: %.3f" % np.mean(np.array(overall_pres)))
    print("Top10 precision: %.3f" % np.array(top10_pres).mean(axis=0).mean())
    print("Top10 recall: %.3f" % np.array(top10_recalls).mean(axis=0).mean())
    print("Top10 f-measurement: %.3f" % np.array(top10_fs).mean(axis=0).mean())
    print("Median error: {}".format(
        np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    errors_all.append(errors)
    top10_recalls_all.append(np.array(top10_recalls).mean(axis=0).mean())
    top10_pres_all.append(np.array(top10_pres).mean(axis=0).mean())
    overall_pres_all.append(np.mean(np.array(overall_pres)))
    top10_fs_all.append(np.array(top10_fs).mean(axis=0).mean())
    print("****************************")

    # K-nearest-neighbors classifier
    start = datetime.datetime.now()
    errors = []
    overall_pres = []
    top10_pres = []
    top10_recalls = []
    top10_fs = []
    for i in range(10):
        # Split into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_states[i])

        neigh = KNeighborsClassifier()
        y_pred = neigh.fit(np.delete(X_train, 0, axis=1),
                           y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
            y_test[:, 0], y_pred)
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))

    print("KNeighbors")
    print("Overall precision: %.3f" % np.mean(np.array(overall_pres)))
    print("Top10 precision: %.3f" % np.array(top10_pres).mean(axis=0).mean())
    print("Top10 recall: %.3f" % np.array(top10_recalls).mean(axis=0).mean())
    print("Top10 f-measurement: %.3f" % np.array(top10_fs).mean(axis=0).mean())
    print("Median error: {}".format(
        np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    errors_all.append(errors)
    top10_recalls_all.append(np.array(top10_recalls).mean(axis=0).mean())
    top10_pres_all.append(np.array(top10_pres).mean(axis=0).mean())
    overall_pres_all.append(np.mean(np.array(overall_pres)))
    top10_fs_all.append(np.array(top10_fs).mean(axis=0).mean())
    print("****************************")

    # Decision tree classifier
    start = datetime.datetime.now()
    errors = []
    overall_pres = []
    top10_pres = []
    top10_recalls = []
    top10_fs = []
    for i in range(10):
        # Split into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_states[i])

        clf = DecisionTreeClassifier()
        y_pred = clf.fit(np.delete(X_train, 0, axis=1),
                         y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
            y_test[:, 0], y_pred)
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))

    print("DecisionTree")
    print("Overall precision: %.3f" % np.mean(np.array(overall_pres)))
    print("Top10 precision: %.3f" % np.array(top10_pres).mean(axis=0).mean())
    print("Top10 recall: %.3f" % np.array(top10_recalls).mean(axis=0).mean())
    print("Top10 f-measurement: %.3f" % np.array(top10_fs).mean(axis=0).mean())
    print("Median error: {}".format(
        np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    errors_all.append(errors)
    top10_recalls_all.append(np.array(top10_recalls).mean(axis=0).mean())
    top10_pres_all.append(np.array(top10_pres).mean(axis=0).mean())
    overall_pres_all.append(np.mean(np.array(overall_pres)))
    top10_fs_all.append(np.array(top10_fs).mean(axis=0).mean())
    print("****************************")

    # Random forest
    start = datetime.datetime.now()
    errors = []
    overall_pres = []
    top10_pres = []
    top10_recalls = []
    top10_fs = []
    for i in range(10):
        # Split into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_states[i])

        clf = RandomForestClassifier(max_depth=20, random_state=0)
        y_pred = clf.fit(np.delete(X_train, 0, axis=1),
                         y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
            y_test[:, 0], y_pred)
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))

    print("RandomForest")
    print("Overall precision: %.3f" % np.mean(np.array(overall_pres)))
    print("Top10 precision: %.3f" % np.array(top10_pres).mean(axis=0).mean())
    print("Top10 recall: %.3f" % np.array(top10_recalls).mean(axis=0).mean())
    print("Top10 f-measurement: %.3f" % np.array(top10_fs).mean(axis=0).mean())
    print("Median error: {}".format(
        np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    errors_all.append(errors)
    top10_recalls_all.append(np.array(top10_recalls).mean(axis=0).mean())
    top10_pres_all.append(np.array(top10_pres).mean(axis=0).mean())
    overall_pres_all.append(np.mean(np.array(overall_pres)))
    top10_fs_all.append(np.array(top10_fs).mean(axis=0).mean())
    print("****************************")

    # AdaBoost
    start = datetime.datetime.now()
    errors = []
    overall_pres = []
    top10_pres = []
    top10_recalls = []
    top10_fs = []
    for i in range(10):
        # Split into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_states[i])

        clf = AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(max_depth=20),
            learning_rate=0.01,
            n_estimators=30,
            algorithm='SAMME.R')
        y_pred = clf.fit(np.delete(X_train, 0, axis=1),
                         y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
            y_test[:, 0], y_pred)
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))

    print("AdaBoost")
    print("Overall precision: %.3f" % np.mean(np.array(overall_pres)))
    print("Top10 precision: %.3f" % np.array(top10_pres).mean(axis=0).mean())
    print("Top10 recall: %.3f" % np.array(top10_recalls).mean(axis=0).mean())
    print("Top10 f-measurement: %.3f" % np.array(top10_fs).mean(axis=0).mean())
    print("Median error: {}".format(
        np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    errors_all.append(errors)
    top10_recalls_all.append(np.array(top10_recalls).mean(axis=0).mean())
    top10_pres_all.append(np.array(top10_pres).mean(axis=0).mean())
    overall_pres_all.append(np.mean(np.array(overall_pres)))
    top10_fs_all.append(np.array(top10_fs).mean(axis=0).mean())
    print("****************************")

    # Bagging
    start = datetime.datetime.now()
    errors = []
    overall_pres = []
    top10_pres = []
    top10_recalls = []
    top10_fs = []
    for i in range(10):
        # Split into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_states[i])

        clf = BaggingClassifier(n_estimators=20)
        y_pred = clf.fit(np.delete(X_train, 0, axis=1),
                         y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
            y_test[:, 0], y_pred)
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))

    print("Bagging")
    print("Overall precision: %.3f" % np.mean(np.array(overall_pres)))
    print("Top10 precision: %.3f" % np.array(top10_pres).mean(axis=0).mean())
    print("Top10 recall: %.3f" % np.array(top10_recalls).mean(axis=0).mean())
    print("Top10 f-measurement: %.3f" % np.array(top10_fs).mean(axis=0).mean())
    print("Median error: {}".format(
        np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    errors_all.append(errors)
    top10_recalls_all.append(np.array(top10_recalls).mean(axis=0).mean())
    top10_pres_all.append(np.array(top10_pres).mean(axis=0).mean())
    overall_pres_all.append(np.mean(np.array(overall_pres)))
    top10_fs_all.append(np.array(top10_fs).mean(axis=0).mean())
    print("****************************")

    # GradientBoosting
    start = datetime.datetime.now()
    errors = []
    overall_pres = []
    top10_pres = []
    top10_recalls = []
    top10_fs = []
    for i in range(10):
        print(i)
        # Split into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_states[i])

        clf = GradientBoostingClassifier(n_estimators=60, learning_rate=0.01)
        y_pred = clf.fit(np.delete(X_train, 0, axis=1),
                         y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
            y_test[:, 0], y_pred)
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))

    print("GradientBoosting")
    print("Overall precision: %.3f" % np.mean(np.array(overall_pres)))
    print("Top10 precision: %.3f" % np.array(top10_pres).mean(axis=0).mean())
    print("Top10 recall: %.3f" % np.array(top10_recalls).mean(axis=0).mean())
    print("Top10 f-measurement: %.3f" % np.array(top10_fs).mean(axis=0).mean())
    print("Median error: {}".format(
        np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    errors_all.append(errors)
    top10_recalls_all.append(np.array(top10_recalls).mean(axis=0).mean())
    top10_pres_all.append(np.array(top10_pres).mean(axis=0).mean())
    overall_pres_all.append(np.mean(np.array(overall_pres)))
    top10_fs_all.append(np.array(top10_fs).mean(axis=0).mean())
    print("****************************")

    utils.cdf_figure(errors_all)
    utils.figure(overall_pres_all, top10_pres_all, top10_recalls_all,
                 top10_fs_all)
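The six classifier blocks above repeat the same 10-split protocol and only swap the estimator. A compact equivalent is sketched below; it assumes the same utils module and data layout as the file above and is not part of the original source:

import datetime
import numpy as np
from sklearn.model_selection import train_test_split

def evaluate(name, make_clf, X, y, random_states):
    # Same protocol as above: 10 random 80/20 splits, column 0 of X dropped,
    # grid_num (column 0 of y) used as the class label.
    start = datetime.datetime.now()
    errors, overall_pres, top10_pres, top10_recalls, top10_fs = [], [], [], [], []
    for seed in random_states:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=seed)
        clf = make_clf()
        y_pred = clf.fit(np.delete(X_train, 0, axis=1),
                         y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
            y_test[:, 0], y_pred)
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))
    print(name)
    print("Median error: {}".format(np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    return errors, overall_pres, top10_pres, top10_recalls, top10_fs

# e.g. evaluate("RandomForest",
#               lambda: RandomForestClassifier(max_depth=20, random_state=0),
#               X, y, random_states)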
Example #7
def main():
    train_data = utils.gongcan_to_ll()
    # Drop the original IDs; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)

    # Fill the missing signal strengths with 0
    train_data = train_data.fillna(0)
    rel_lon = []
    rel_lat = []
    # print(train_data)
    for index, row in train_data.iterrows():
        rel_lon.append(row['Longitude'] - row['Longitude_1'])
        rel_lat.append(row['Latitude'] - row['Latitude_1'])

    train_data['rel_Longitude'] = np.array(rel_lon)
    train_data['rel_Latitude'] = np.array(rel_lat)

    # Features and labels
    train_data.set_index(['Longitude_1', 'Latitude_1'],
                         inplace=True,
                         drop=False)
    train_data.sort_index(inplace=True)
    ids = list(set(train_data.index.tolist()))
    # print(ids)

    # Fix a random seed for each run so every classifier and every experiment sees the same data splits, which makes the results more credible
    random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

    errors_all = []
    median_errors = []
    for id in ids:
        MS_datas = train_data.loc[id]
        X = MS_datas.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1,
            inplace=False).as_matrix()
        y = MS_datas[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].as_matrix()

        # Random forest
        print("MS {}".format(id))
        errors = []
        for i in range(10):

            # Split into training and validation sets
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])

            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5],
                                                 axis=1)).predict(X_test)
            error = utils.pos_error(y_test, y_pred)
            errors.append(error)

            # overall_pre, top10_pre, top10_recall = utils.precision_recall(y_test[:, 0], y_pred)
            # errors.append(utils.pos_error(y_test, y_pred))
        median_error = np.percentile(np.array(errors).mean(axis=0), 50)
        print("Median error: {}".format(median_error))
        median_errors.append([id, median_error])
        errors_all.append([id, errors])
        print("****************************")
    median_errors = DataFrame(median_errors, columns=['id', 'median_error'])
    median_errors.set_index(['median_error'], inplace=True, drop=False)
    median_errors.sort_index(inplace=True)
    # print(median_errors)

    MS_number = median_errors.shape[0]
    topk_best = median_errors.iloc[:int(MS_number *
                                        0.2)]['id'].as_matrix().tolist()
    topk_worst = median_errors.iloc[int(MS_number *
                                        0.8):]['id'].as_matrix().tolist()

    old_errors = []  # stores all errors of the top k- (worst) stations before correction
    for error in errors_all:
        if error[0] in topk_worst:
            old_errors.append([error[0], error[1]])

    # Collect the data of the top k+ (best) stations
    best_data = DataFrame()
    for best in topk_best:
        best_data = pd.concat([best_data, train_data.loc[best]], axis=0)

    # print(best_data)
    # best_data = best_data.sample(frac=0.7)
    # print(best_data)
    print("\n")
    print("Start correction")
    print("\n")
    new_errors = []  # stores all errors of the top k- (worst) stations after correction
    for worst in topk_worst:
        MS_datas = pd.concat([train_data.loc[worst], best_data])
        # MS_datas = best_data
        X = MS_datas.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1,
            inplace=False).as_matrix()
        y = MS_datas[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].as_matrix()

        worst_data = train_data.loc[worst]
        X_worst = worst_data.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1,
            inplace=False).as_matrix()
        y_worst = worst_data[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].as_matrix()

        # Random forest
        print("MS {}".format(worst))
        errors = []
        for i in range(10):
            # Split into training and validation sets
            X_train, _, y_train, _ = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])
            _, X_test, _, y_test = train_test_split(
                X_worst, y_worst, test_size=0.2, random_state=random_states[i])
            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5],
                                                 axis=1)).predict(X_test)
            error = utils.pos_error(y_test, y_pred)
            errors.append(error)

        # Plot all training points after adding the new data, together with the original points of this MS/base station
        plt.title("Median error: %.3f" %
                  np.percentile(np.array(errors).mean(axis=0), 50))
        ax = plt.gca()
        ax.get_xaxis().get_major_formatter().set_useOffset(False)
        plt.scatter(y[:, 2], y[:, 3], label='new data')
        plt.scatter(y_worst[:, 2], y_worst[:, 3], label='old data')

        plt.xlim([lb_Longitude, rt_Longitude])
        plt.ylim([lb_Latitude, rt_Latitude])
        plt.legend()
        plt.show()

        new_errors.append([worst, errors])
        median_error = np.percentile(np.array(errors).mean(axis=0), 50)
        print("Median error: {}".format(median_error))
        # median_errors.append([worst, median_error])
        # errors_all.append([id, errors])
        print("****************************")

    utils.cdf_figure(old_errors, new_errors)
    utils.mean_figure(old_errors, new_errors)
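The top k+/top k- selection in this example sorts by temporarily making median_error the index. An equivalent and slightly more direct form using sort_values (shown only as an alternative sketch; the selected rows are the same) would be:

median_errors = median_errors.sort_values('median_error')
MS_number = len(median_errors)
topk_best = median_errors['id'].iloc[:int(MS_number * 0.2)].tolist()
topk_worst = median_errors['id'].iloc[int(MS_number * 0.8):].tolist()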