Example #1
def predictOneTrain_LR_Split(shopid, all_data, trainAsTest=False):
    """
    一个商店分7天模型
    :param shopid:
    :param all_data:
    :param trainAsTest: 是否把训练集后14天当作测试集
    :return:如果trainAsTset是True,则返回[predicts,reals],否则返回[predicts,None]
    """
    part_data = all_data[all_data.shopid == shopid]
    last_14_real_y = None
    # hold out the last 14 days for evaluation (read the reals before truncating)
    if trainAsTest:  # when using the last 14 training days as the test set, train on the earlier part
        last_14_real_y = part_data[len(part_data) - 14:]["count"].values
        part_data = part_data[0:len(part_data) - 14]

    skipNum = 0
    sameday = extractBackSameday(part_data, 2, skipNum, nan_method_sameday_mean)
    count = extractCount(part_data, skipNum)
    model = []
    part_counts = []
    # fit one LinearRegression model per weekday
    for i in range(7):
        lr = LinearRegression()
        model.append(lr)
        weekday = i + 1
        part_sameday = getOneWeekdayFomExtractedData(sameday, weekday)
        part_count = getOneWeekdayFomExtractedData(count, weekday)
        part_counts.append(part_count)
        lr.fit(part_sameday, part_count)

    # predict the next 14 days, one weekday model at a time
    format = "%Y-%m-%d"
    if trainAsTest:
        startTime = datetime.datetime.strptime("2016-10-18", format)
    else:
        startTime = datetime.datetime.strptime("2016-11-1", format)
    timedelta = datetime.timedelta(1)
    preficts = []
    for i in range(14):
        currentTime = startTime + timedelta * i
        strftime = currentTime.strftime(format)
        index = getWeekday(strftime) - 1
        part_count = part_counts[index]
        # use the same weekday's counts from the previous 2 weeks as features
        x = [[part_count[len(part_count) - 1][0], part_count[len(part_count) - 2][0]]]
        predict = model[index].predict(x)
        preficts.append(predict[0][0])
        part_counts[index] = np.append(part_count, predict).reshape((part_count.shape[0] + 1, 1))
    preficts = (removeNegetive(toInt(np.array(preficts)))).astype(int)
    if trainAsTest:
        last_14_real_y = (removeNegetive(toInt(np.array(last_14_real_y)))).astype(int)
        # print preficts,last_14_real_y
        print str(shopid)+',score:', scoreoneshop(preficts, last_14_real_y)
    return [preficts, last_14_real_y]
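
A minimal, self-contained sketch of the per-weekday split above, using plain slicing in place of the repo helpers extractBackSameday / getOneWeekdayFomExtractedData (the toy series, lag count, and shapes are assumptions, not the repo's data):

import numpy as np
from sklearn.linear_model import LinearRegression

# Toy daily series: 10 weeks of counts (made up).
counts = np.arange(70, dtype=float) + np.random.rand(70)

models = []
for weekday in range(7):
    day_counts = counts[weekday::7]  # every Monday, every Tuesday, ...
    # Features: the same weekday's count 1 week back and 2 weeks back.
    X = np.column_stack((day_counts[1:-1], day_counts[:-2]))
    y = day_counts[2:]
    models.append(LinearRegression().fit(X, y))

# One-step forecast for the next occurrence of weekday 0.
mondays = counts[0::7]
print models[0].predict([[mondays[-1], mondays[-2]]])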
Example #2
def computeScore(filePath,
                 scoreFilePath,
                 threshold=0.06,
                 needRefuseDataPath=None,
                 refuseDataPath=None,
                 refuseDataSavePath=None):
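    """
    Score each shop's 14-day predictions stored in filePath (row layout:
    shopid, 14 predictions, 14 reals), split shops into good/bad by
    threshold, and optionally save per-shop scores and a merged refuse-data file.
    """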

    train_predict = np.loadtxt(filePath, dtype=int, delimiter=",")
    shopids = train_predict.take(0, axis=1).tolist()
    predicts = np.ndarray(0)
    reals = np.ndarray(0)
    good = []
    bad = []
    scores = []
    for k in range(len(shopids)):
        id = shopids[k]
        predict = train_predict[k][1:15]
        real = train_predict[k][15:29]
        predicts = np.append(predicts, predict)
        reals = np.append(reals, real)
        score_one = scoreoneshop(predict, real)
        print id, ":", score_one
        if (score_one < threshold):
            good.append(id)
        else:
            bad.append(id)
        scores.append(score_one)
    print "last score:", score(predicts, reals)
    print "good", good, len(good)
    print "bad", bad, len(bad)

    if scoreFilePath is not None:
        result = np.reshape(scores, (len(shopids), 1))
        result = np.insert(result, 0, shopids, axis=1)
        np.savetxt(scoreFilePath, (result), delimiter=",", fmt="%.6f")

    if needRefuseDataPath is not None:
        needRefuseData = np.loadtxt(needRefuseDataPath,
                                    dtype=int,
                                    delimiter=",")
        refuseData = np.loadtxt(refuseDataPath, dtype=int, delimiter=",")
        refuse_data = np.zeros((len(shopids), 14))
        for i in range(len(shopids)):
            shopid = i + 1  # assumes rows are ordered by shopid 1..N with none missing
            if shopid in good:
                value = needRefuseData[i][1:15]
            elif shopid in bad:
                value = refuseData[i][1:15]
            refuse_data[i] = value
        refuse_data = np.insert(refuse_data, 0, shopids, axis=1).astype(int)
        np.savetxt(refuseDataSavePath, refuse_data, delimiter=",", fmt='%d')
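
For reference, a hypothetical round-trip showing the 29-column row layout computeScore expects (toy values only):

import numpy as np

# shopid 1, then 14 predicted counts, then 14 real counts.
row = np.concatenate(([1], np.arange(10, 24), np.arange(12, 26)))
np.savetxt("toy_train_predict.csv", [row], delimiter=",", fmt="%d")

train_predict = np.loadtxt("toy_train_predict.csv", dtype=int, delimiter=",", ndmin=2)
assert train_predict[0][1:15].tolist() == list(range(10, 24))   # predictions
assert train_predict[0][15:29].tolist() == list(range(12, 26))  # reals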
Example #3
def predictOneTrain_SRN(shopid, all_data, trainAsTest=False):
    """
    用SRN预测某一个商店
    :param shopid: 预测商店id
    :param trainAsTest: 是否使用训练集后14天作为测试集
    :return:
    """
    all_countList = getCoutList(all_data, shopId=shopid)
    seq_length = 14
    # hold out the last 14 days for evaluation
    if trainAsTest:  # when using the last 14 training days as the test set, train on the earlier part
        part_countList = all_countList[0:len(all_countList) - 14]
    else:
        part_countList = all_countList
    train_x, train_y = preprocessCoutList(seq_length, part_countList)

    # test_coutList = all_countList[len(all_countList) - 2*seq_length:len(all_countList)]
    # [test_x, test_y] = preprocessCoutList(seq_length, test_coutList)

    model = Sequential()
    model.add(LSTM(32, input_shape=(train_x.shape[1], train_x.shape[2]), activation="tanh"))  # alternative activation: sigmoid
    model.add(Dense(1, activation='linear'))
    #, W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01)

    # set the optimizer (keeping every parameter except the learning rate at its default is recommended)
    rms=RMSprop(lr=0.03)
    # sgd=SGD(lr=0.1, momentum=0.9, nesterov=True)
    model.compile(loss=my_loss, optimizer=rms)
    print model.summary()
    model.fit(train_x, train_y, nb_epoch=rnn_nb_epoch, batch_size=1, verbose=2)

    last = train_y[len(train_y) - 1]
    last_x = train_x[train_x.shape[0] - 1]
    if trainAsTest:
        last_14_real_y = all_countList[len(all_countList) - 14:]
    # rolling predictions
    prediction_y = []
    for i in range(14):
        new_x = last_x[1:].copy()
        new_x = np.concatenate((new_x, [[last]]))
        new_x = np.reshape(new_x, (1, 14, 1))
        print new_x
        last = model.predict(new_x)[0][0]
        print last
        prediction_y.append(last)
        last_x = new_x[0]
    prediction_y = (removeNegetive(toInt(np.array(prediction_y)))).astype(int)
    if trainAsTest:
        print str(shopid)+',score:', scoreoneshop(prediction_y, np.array(last_14_real_y))
    return [prediction_y, shopid]
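
The rolling one-step forecast above, isolated with a stub in place of the trained LSTM, so the window bookkeeping (drop the oldest day, append the latest prediction, keep shape (1, 14, 1)) can be checked on its own; the stub and its outputs are made up:

import numpy as np

seq_length = 14
last_x = np.arange(seq_length, dtype=float).reshape((seq_length, 1))
last = 99.0  # most recent known count

def stub_predict(batch):  # stand-in for model.predict
    return [[batch[0, -1, 0] + 1.0]]

prediction_y = []
for i in range(14):
    new_x = np.concatenate((last_x[1:], [[last]]))  # slide the window one day
    batch = np.reshape(new_x, (1, seq_length, 1))
    last = stub_predict(batch)[0][0]
    prediction_y.append(last)
    last_x = batch[0]
print prediction_y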
Example #4
def predictAllShop_LC_HPS(all_data,
                          trainAsTest=False,
                          saveFilePath=None,
                          featurePath=None,
                          cate_level=0,
                          cate_name=None,
                          featureSavePath=None,
                          needSaveFeature=False,
                          ignore_shopids=[],
                          needCV=False,
                          model_path=None,
                          Augmented=False,
                          ignore_get_train=True,
                          ignore_predict=True,
                          addNoiseInResult=False,
                          time=1):
    """
    通过gridsearch找超参数
    :param trainAsTest: 是否使用训练集后14天作为测试集
    :param saveFilePath
    :param featurePath:
    :param cate_level:
    :param cate_name:
    :param featureSavePath:
    :param needSaveFeature:
    :param ignore_shopids:
    :param create_model_function:
    :param needCV
    :param Augmented:是否增广样本
    :param  ignore_get_train:是否忽略获取样本
    :param ignore_predict:是否忽略预测
    :return:
    """

    augument_time = 1
    verbose = 2
    last_N_days = 60
    # number of shops skipped so far
    # ignores = 0
    shop_need_to_predict = 2000
    if cate_level == 0:
        shopids = np.arange(1, 1 + shop_need_to_predict, 1)
    else:
        shopids = Parameter.extractShopValueByCate(cate_level, cate_name)
    shop_info = pd.read_csv(Parameter.shopinfopath,
                            names=[
                                "shopid", "cityname", "locationid", "perpay",
                                "score", "comment", "level", "cate1", "cate2",
                                "cate3"
                            ])

    weather = False
    weekOrWeekend = False
    day_back_num = 21
    sameday_backNum = 0
    week_backnum = 3
    other_features = [statistic_functon_mean, statistic_functon_median]
    other_features = []  # statistics features disabled: the line above is overridden
    shop_features = ["perpay", "comment", "score", "level"]
    shop_features = []  # shop-info features disabled: the line above is overridden
    # one-hot encoder for the workday/weekend flag
    hot_encoder = onehot([[1], [0]])
    # one-hot encoder for cate1
    cate1_list = np.unique(shop_info['cate1'])
    cate1_label_encoder = labelEncoder(cate1_list)
    cate1_list2 = cate1_label_encoder.transform(cate1_list).reshape((-1, 1))
    cate1_hot_encoder = onehot(cate1_list2)

    if featurePath is None:
        all_x = None
        all_y = None
        for shopid in shopids:
            if ignore_get_train:
                if shopid in ignore_shopids:
                    print "ignore get train", shopid
                    continue
            print "get ", shopid, " train"
            part_data = all_data[all_data.shopid == shopid]
            last_14_real_y = None
            # hold out the last 14 days for evaluation
            if trainAsTest:  # train on the earlier part
                last_14_real_y = part_data[len(part_data) -
                                           14:]["count"].values
                part_data = part_data[0:len(part_data) - 14]
            # print last_14_real_y
            '''Decide how many leading days to skip (keep only the last N days)'''
            skipNum = part_data.shape[0] - last_N_days
            if skipNum < 0:
                skipNum = 0
            train_x = None
            '''Extract features'''
            if sameday_backNum != 0:  # same-weekday lags
                sameday = extractBackSameday(part_data, sameday_backNum,
                                             skipNum, nan_method_sameday_mean)
                train_x = getOneWeekdayFomExtractedData(sameday)
            if day_back_num != 0:  # daily lags
                if train_x is not None:
                    train_x = np.concatenate(
                        (train_x,
                         getOneWeekdayFomExtractedData(
                             extractBackDay(part_data, day_back_num, skipNum,
                                            nan_method_sameday_mean))),
                        axis=1)
                else:
                    train_x = getOneWeekdayFomExtractedData(
                        extractBackDay(part_data, day_back_num, skipNum,
                                       nan_method_sameday_mean))
            if weekOrWeekend:  # workday/weekend flag
                ws = getOneWeekdayFomExtractedData(
                    extractWorkOrWeekend(part_data, skipNum))
                train_x = np.concatenate((train_x, hot_encoder.transform(ws)),
                                         axis=1)

            count = extractCount(part_data, skipNum)
            train_y = getOneWeekdayFomExtractedData(count)
            for feature in other_features:
                value = getOneWeekdayFomExtractedData(
                    extractBackWeekValue(part_data, week_backnum, skipNum,
                                         nan_method_sameday_mean, feature))
                train_x = np.append(train_x, value, axis=1)
            '''Append shop info features'''
            # print train_x,train_x.shape
            index = shopid - 1
            oneshopinfo = shop_info.ix[index]
            shop_city = oneshopinfo['cityname']
            shop_perpay = oneshopinfo['perpay'] if not pd.isnull(
                oneshopinfo['perpay']) else 0
            shop_score = oneshopinfo['score'] if not pd.isnull(
                oneshopinfo['score']) else 0
            shop_comment = oneshopinfo['comment'] if not pd.isnull(
                oneshopinfo['comment']) else 0
            shop_level = oneshopinfo['level'] if not pd.isnull(
                oneshopinfo['level']) else 0
            shop_cate1 = oneshopinfo['cate1']
            import warnings
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=DeprecationWarning)
                shop_cate1_encoder = cate1_hot_encoder.transform(
                    cate1_label_encoder.transform([shop_cate1]))
            if "perpay" in shop_features:
                train_x = np.insert(train_x,
                                    train_x.shape[1],
                                    shop_perpay,
                                    axis=1)
            if "score" in shop_features:
                train_x = np.insert(train_x,
                                    train_x.shape[1],
                                    shop_score,
                                    axis=1)
            if "comment" in shop_features:
                train_x = np.insert(train_x,
                                    train_x.shape[1],
                                    shop_comment,
                                    axis=1)
            if "level" in shop_features:
                train_x = np.insert(train_x,
                                    train_x.shape[1],
                                    shop_level,
                                    axis=1)
            if "cate1" in shop_features:
                for i in range(shop_cate1_encoder.shape[1]):
                    train_x = np.insert(train_x,
                                        train_x.shape[1],
                                        shop_cate1_encoder[0][i],
                                        axis=1)
            '''shop info appended'''
            '''Weather features'''
            if weather:
                weathers = getOneWeekdayFomExtractedData(
                    extractWeatherInfo(part_data, skipNum, shop_city))
                train_x = np.append(train_x, weathers, axis=1)
            '''weather features done'''

            if all_x is None:
                all_x = train_x
                all_y = train_y
            else:
                all_x = np.insert(all_x, all_x.shape[0], train_x, axis=0)
                all_y = np.insert(all_y, all_y.shape[0], train_y, axis=0)

                # '''add the weekday as a feature'''
                # extract_weekday = getOneWeekdayFomExtractedData(extractWeekday(part_data, skipNum))
                # train_x = np.append(train_x, extract_weekday, axis=1)
                # ''''''

                # train_x = train_x.reshape((train_x.shape[0],
                #                            train_x.shape[1], 1))
                # print model.get_weights()
                # part_counts = []
                # for i in range(7):
                #     weekday = i + 1
                #     part_count = getOneWeekdayFomExtractedData(count, weekday)
                #     part_counts.append(part_count)

        train_x = all_x
        train_y = all_y
        """增广训练集"""
        if Augmented:
            print "augment data"
            new_train_x = np.ndarray(
                (train_x.shape[0] * (augument_time + 1), train_x.shape[1]))
            new_train_y = np.ndarray(
                (train_y.shape[0] * (augument_time + 1), train_y.shape[1]))

            def augument_relu(v):  # Gaussian augmentation; doesn't seem to help much and can change samples drastically
                return v * (1 + 0.01 * np.random.normal())

            def augument_relu2(v):
                return v * 1.05

            end = train_x.shape[0]
            for index in range(end):
                new_train_x[index] = train_x[index]
                new_train_y[index] = train_y[index]
            insert_index = end  # next free row after the copied originals
            for index in range(end):
                print "%d / %d" % (index, end)
                for t in range(augument_time):
                    new_train_x[insert_index] = train_x[index]
                    ov = train_y[index][0]
                    new_train_y[insert_index] = [augument_relu2(ov)]
                    insert_index += 1
            print "augment finish"
            train_x = new_train_x
            train_y = new_train_y

        if needSaveFeature:
            featureAndLabel = np.concatenate((train_x, train_y), axis=1)
            flDF = pd.DataFrame(featureAndLabel)
            if featureSavePath is None:
                if trainAsTest:
                    featureSavePath = Parameter.projectPath + "lzj/train_feature/%dCatelevel_%sCatename_%dfeatures_%dSameday_%dDay_%dLast" % (
                        cate_level, cate_name, flDF.shape[1] - 1,
                        sameday_backNum, day_back_num, last_N_days)
                else:
                    featureSavePath = Parameter.projectPath + "lzj/feature/%dCatelevel_%sCatename_%dfeatures_%dSameday_%dDay_%dLast" % (
                        cate_level, cate_name, flDF.shape[1] - 1,
                        sameday_backNum, day_back_num, last_N_days)
            if Augmented:
                featureSavePath += ("_Augment%d" % augument_time)

            featureSavePath += ".csv"
            print "save feature in :", featureSavePath
            flDF.to_csv(featureSavePath)
    else:  # a featurePath file was provided
        if trainAsTest:
            path = Parameter.projectPath + "lzj/train_feature/" + featurePath
        else:
            path = Parameter.projectPath + "lzj/feature/" + featurePath
        flDF = pd.read_csv(path, index_col=0)
        train_x = flDF.values[:, :-1]
        train_y = flDF.values[:, -1:]
        # print train_x
        # print train_y
    '''Normalize features and labels to [0, 1]'''
    x_scaler = MinMaxScaler().fit(train_x)
    y_scaler = MinMaxScaler().fit(train_y)
    train_x = x_scaler.transform(train_x)
    train_y = y_scaler.transform(train_y)
    '''normalization done'''
    """reshape to (samples, features, 1) for the CNN-style input"""
    train_x = np.reshape(train_x, (train_x.shape[0], train_x.shape[1], 1))

    if model_path is None:
        if needCV:
            '''gridsearchCV'''
            # nb_epoch=rnn_epoch, batch_size=batch_size, verbose=verbose
            # input_dim, h1_unit = 16, optimizer = "adagrad", init = "normal"):
            input_dim = [(train_x.shape[1], train_x.shape[2])]
            h1_acqtivation = ["relu"]
            h1_unit = [8, 12, 16, 20]
            model = KerasRegressor(build_fn=create_model_LocallyConnected,
                                   verbose=verbose)
            batch_size = [3, 5, 7, 10]
            epochs = [10, 15, 20, 25, 30]
            param_grid = dict(batch_size=batch_size,
                              nb_epoch=epochs,
                              h1_unit=h1_unit,
                              input_shape=input_dim)
            grid = GridSearchCV(estimator=model,
                                param_grid=param_grid,
                                n_jobs=-1,
                                scoring="neg_mean_squared_error")
            grid.refit = False
            grid_result = grid.fit(train_x, train_y)

            print("Best: %f using %s" %
                  (grid_result.best_score_, grid_result.best_params_))
            for params, mean_score, scores in grid_result.grid_scores_:
                print("%f (%f) with: %r" %
                      (scores.mean(), scores.std(), params))

        if not needCV:
            input_dim = (train_x.shape[1], train_x.shape[2])
            h1_unit = 16 + (time) * 4
            h1_activation = "sigmoid"
            batch_size = 3
            epochs = 40

        else:
            input_dim = (train_x.shape[1], train_x.shape[2])
            epochs = grid_result.best_params_['nb_epoch']
            batch_size = grid_result.best_params_['batch_size']
            h1_unit = grid_result.best_params_["h1_unit"]
            h1_activation = "sigmoid"

        early_stopping = EarlyStopping(monitor='val_loss', patience=2)
        best_model = create_model_LocallyConnected(input_shape=input_dim,
                                                   h1_unit=h1_unit,
                                                   h1_activation=h1_activation)
        hist = best_model.fit(train_x,
                              train_y,
                              verbose=verbose,
                              batch_size=batch_size,
                              nb_epoch=epochs,
                              validation_split=0.1,
                              callbacks=[early_stopping])
        print hist.history

        # save the model
        if trainAsTest:
            model_save_path = Parameter.projectPath+"lzj/train_model/" + \
                              "%dlast_%ds_%dd_%df_%d_%s_%d_%d_%d_%s.json" \
                              % (last_N_days,sameday_backNum, day_back_num, train_x.shape[1], cate_level, cate_name
                                 , epochs, batch_size, h1_unit, h1_activation)
            saveModel(model_save_path, best_model)
        else:
            model_save_path = Parameter.projectPath+"lzj/model/" + \
                              "%dlast_%ds_%dd_%df_%d_%s_%d_%d_%d_%s.json" \
                              % (last_N_days,sameday_backNum, day_back_num, train_x.shape[1], cate_level, cate_name
                                 ,  epochs, batch_size, h1_unit, h1_activation)
            saveModel(model_save_path, best_model)
    else:  # model_path is not None: load a saved model
        print "get model from " + model_path
        best_model = getModel(model_path)

    format = "%Y-%m-%d"
    if trainAsTest:
        startTime = datetime.datetime.strptime("2016-10-18", format)
    else:
        startTime = datetime.datetime.strptime("2016-11-1", format)
    timedelta = datetime.timedelta(1)
    '''Predict each shop'''
    model = best_model
    preficts_all = None
    real_all = None

    for j in shopids:
        if ignore_predict:
            if j in ignore_shopids:
                print "ignore predict", j
                # ignores += 1
                continue
        print "predict:", j
        preficts = []
        part_data = all_data[all_data.shopid == j]
        last_14_real_y = None

        if trainAsTest:  # train on the earlier part, keep the last 14 days as reals
            last_14_real_y = part_data[len(part_data) - 14:]["count"].values
            part_data = part_data[0:len(part_data) - 14]
        '''Roll the prediction forward 14 days'''
        for i in range(14):
            currentTime = startTime + timedelta * i
            strftime = currentTime.strftime(format)
            # index = getWeekday(strftime) - 1
            # part_count = part_counts[index]
            # use the same weekday's values from the previous {sameday_backNum} weeks as features
            part_data = part_data.append(
                {
                    "count": 0,
                    "shopid": j,
                    "time": strftime,
                    "weekday": getWeekday(strftime)
                },
                ignore_index=True)
            x = None
            if sameday_backNum != 0:
                x = getOneWeekdayFomExtractedData(
                    extractBackSameday(part_data, sameday_backNum,
                                       part_data.shape[0] - 1,
                                       nan_method_sameday_mean))
            if day_back_num != 0:
                if x is None:
                    x = getOneWeekdayFomExtractedData(
                        extractBackDay(part_data, day_back_num,
                                       part_data.shape[0] - 1,
                                       nan_method_sameday_mean))
                else:
                    x = np.concatenate(
                        (x,
                         getOneWeekdayFomExtractedData(
                             extractBackDay(part_data, day_back_num,
                                            part_data.shape[0] - 1,
                                            nan_method_sameday_mean))),
                        axis=1)
            if weekOrWeekend:
                x = np.concatenate(
                    (x,
                     hot_encoder.transform(
                         getOneWeekdayFomExtractedData(
                             extractWorkOrWeekend(part_data,
                                                  part_data.shape[0] - 1)))),
                    axis=1)

            for feature in other_features:
                x_value = getOneWeekdayFomExtractedData(
                    extractBackWeekValue(part_data, week_backnum,
                                         part_data.shape[0] - 1,
                                         nan_method_sameday_mean, feature))
                x = np.append(x, x_value, axis=1)
            # '''add the weekday as a feature'''
            # x = np.append(x, getOneWeekdayFomExtractedData(extractWeekday(part_data, part_data.shape[0]-1)), axis=1)
            # ''''''
            '''Append shop info features'''
            index = j - 1
            oneshopinfo = shop_info.ix[index]
            shop_city = oneshopinfo["cityname"]
            shop_perpay = oneshopinfo['perpay'] if not pd.isnull(
                oneshopinfo['perpay']) else 0
            shop_score = oneshopinfo['score'] if not pd.isnull(
                oneshopinfo['score']) else 0
            shop_comment = oneshopinfo['comment'] if not pd.isnull(
                oneshopinfo['comment']) else 0
            shop_level = oneshopinfo['level'] if not pd.isnull(
                oneshopinfo['level']) else 0
            if "perpay" in shop_features:
                x = np.insert(x, x.shape[1], shop_perpay, axis=1)
            if "score" in shop_features:
                x = np.insert(x, x.shape[1], shop_score, axis=1)
            if "comment" in shop_features:
                x = np.insert(x, x.shape[1], shop_comment, axis=1)
            if "level" in shop_features:
                x = np.insert(x, x.shape[1], shop_level, axis=1)
            shop_cate1 = oneshopinfo['cate1']
            if "cate1" in shop_features:
                shop_cate1_encoder = cate1_hot_encoder.transform(
                    cate1_label_encoder.transform([shop_cate1]).reshape(
                        (-1, 1)))
                for i in range(shop_cate1_encoder.shape[1]):
                    x = np.insert(x,
                                  x.shape[1],
                                  shop_cate1_encoder[0][i],
                                  axis=1)
            '''shop info appended'''
            '''Weather features'''
            if weather:
                weathers = getOneWeekdayFomExtractedData(
                    extractWeatherInfo(part_data, part_data.shape[0] - 1,
                                       shop_city))
                x = np.append(x, weathers, axis=1)
            '''weather features done'''
            # for j in range(sameday_backNum):
            #     x.append(train_y[len(train_y) - (j+1)*7][0])
            # x = np.array(x).reshape((1, sameday_backNum))
            x = x_scaler.transform(x)
            """CNN"""
            x = np.reshape(x, (x.shape[0], x.shape[1], 1))
            predict = model.predict(x)
            '''Map the prediction back to the original count scale'''
            if predict.ndim == 2:
                predict = y_scaler.inverse_transform(predict)[0][0]
            elif predict.ndim == 1:
                predict = y_scaler.inverse_transform(predict)[0]
            '''inverse transform done'''
            # print predict
            if (predict <= 0):
                predict == 0
            if addNoiseInResult:
                predict = predict * (
                    1 + 0.05 * abs(np.random.normal(scale=(i + 1) * 0.05)))
            preficts.append(predict)
            part_data.set_value(part_data.shape[0] - 1, "count", predict)

        preficts = (removeNegetive(toInt(np.array(preficts)))).astype(int)
        if preficts_all is None:
            preficts_all = preficts
        else:
            preficts_all = np.insert(preficts_all,
                                     preficts_all.shape[0],
                                     preficts,
                                     axis=0)

        if trainAsTest:
            last_14_real_y = (removeNegetive(toInt(
                np.array(last_14_real_y)))).astype(int)
            if real_all is None:
                real_all = last_14_real_y
            else:
                real_all = np.insert(real_all,
                                     real_all.shape[0],
                                     last_14_real_y,
                                     axis=0)
                # print preficts,last_14_real_y
            print str(j) + ',score:', scoreoneshop(preficts, last_14_real_y)

    # preficts = np.array(preficts)
    shopids = shopids.tolist()
    if ignore_predict:
        for remove_id in ignore_shopids:
            try:
                shopids.remove(remove_id)
            except ValueError:
                pass

    preficts_all = preficts_all.reshape((len(shopids), 14))
    if trainAsTest:
        real_all = real_all.reshape((len(shopids), 14))
        preficts_all = np.concatenate((preficts_all, real_all), axis=1)

    preficts_all = np.insert(preficts_all, 0, shopids, axis=1)
    if saveFilePath is not None:
        if model_path is None:
            path = saveFilePath + "%dLast_%ds_%dd_%df_%d_%s_%d_%d_%d_%s_%dshops" \
                                  % (last_N_days,sameday_backNum, day_back_num, train_x.shape[1], cate_level, cate_name
                                     ,  epochs, batch_size, h1_unit, h1_activation,len(shopids))
        else:
            import re
            r = re.compile(
                r"""/(\d+)last_(\d+)s_(\d+)d_(\d+)f_(\d+)_(\S+)_(\d+)_(\d+)_(\d+)_(\w+).json"""
            )
            m = r.search(model_path)
            path = saveFilePath + "%dLast_%ds_%dd_%df_%d_%s_%d_%d_%d_%s_%dshops" \
                                  % (int(m.group(1)),int(m.group(2)), int(m.group(3)), int(m.group(4)), int(m.group(5)), m.group(6)
                                     ,  int(m.group(7)), int(m.group(8)), int(m.group(9)), m.group(10),len(shopids))
        if Augmented:
            path += "_augmented"
        if addNoiseInResult:
            path += "_addNoiseInResult"
        if trainAsTest:
            path = path + "_train"
        path = path + "_%dtime.csv" % time

        print "save in :", path
        np.savetxt(path, preficts_all, fmt="%d", delimiter=",")
    return preficts_all
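
The scaler round-trip inside predictAllShop_LC_HPS (fit MinMaxScaler on the training data, transform each new feature row, inverse-transform the network output back to counts) is the step most easily broken; a minimal sketch with a dummy model standing in for the LocallyConnected network (all values made up):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

train_x = np.random.rand(20, 21) * 50.0  # 21 made-up day-back features
train_y = np.random.rand(20, 1) * 50.0

x_scaler = MinMaxScaler().fit(train_x)
y_scaler = MinMaxScaler().fit(train_y)

def dummy_model_predict(x):        # stand-in for best_model.predict
    return np.array([[x.mean()]])  # ndim == 2, like the Keras output

x = x_scaler.transform(train_x[-1:])                 # one new feature row
predict = dummy_model_predict(x)
predict = y_scaler.inverse_transform(predict)[0][0]  # back to the count scale
print predict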
Example #5
def predictAllShop_MultiCNN_HPS(all_data,
                                trainAsTest=False,
                                saveFilePath=None,
                                featurePath=None,
                                cate_level=0,
                                cate_name=None,
                                featureSavePath=None,
                                needSaveFeature=False,
                                ignore_shopids=[],
                                needCV=False,
                                model_path=None,
                                Augmented=False,
                                ignore_get_train=True,
                                ignore_predict=True,
                                addNoiseInResult=False,
                                time=1):
    """
    通过gridsearch找超参数
    :param trainAsTest: 是否使用训练集后14天作为测试集
    :param saveFilePath
    :param featurePath:
    :param cate_level:
    :param cate_name:
    :param featureSavePath:
    :param needSaveFeature:
    :param ignore_shopids:
    :param create_model_function:
    :param needCV
    :param Augmented:是否增广样本
    :param  ignore_get_train:是否忽略获取样本
    :param ignore_predict:是否忽略预测
    :return:
    """

    augument_time = 1
    verbose = 2
    last_N_days = 70
    # number of shops skipped so far
    # ignores = 0
    shop_need_to_predict = 2000
    if cate_level == 0:
        shopids = np.arange(1, 1 + shop_need_to_predict, 1)
    else:
        shopids = Parameter.extractShopValueByCate(cate_level, cate_name)
    shop_info = pd.read_csv(Parameter.shopinfopath,
                            names=[
                                "shopid", "cityname", "locationid", "perpay",
                                "score", "comment", "level", "cate1", "cate2",
                                "cate3"
                            ])

    weather = False
    weekOrWeekend = False
    day_back_num = 21
    sameday_backNum = 8
    week_backnum = 3
    other_features = [statistic_functon_mean, statistic_functon_median]
    other_features = []  # statistics features disabled: the line above is overridden
    shop_features = ["perpay", "comment", "score", "level"]
    shop_features = []  # shop-info features disabled: the line above is overridden
    # one-hot encoder for the workday/weekend flag
    hot_encoder = onehot([[1], [0]])
    # one-hot encoder for cate1
    cate1_list = np.unique(shop_info['cate1'])
    cate1_label_encoder = labelEncoder(cate1_list)
    cate1_list2 = cate1_label_encoder.transform(cate1_list).reshape((-1, 1))
    cate1_hot_encoder = onehot(cate1_list2)

    if featurePath is None:
        train_x, train_y = getTrainXY(
            all_data, cate1_hot_encoder, cate1_label_encoder, day_back_num,
            hot_encoder, ignore_get_train, ignore_shopids, last_N_days,
            other_features, 0, shop_features, shop_info, shopids, trainAsTest,
            weather, weekOrWeekend, week_backnum)
        train_x2 = getTrainXY(all_data, cate1_hot_encoder, cate1_label_encoder,
                              0, hot_encoder, ignore_get_train, ignore_shopids,
                              last_N_days, other_features, sameday_backNum,
                              shop_features, shop_info, shopids, trainAsTest,
                              weather, weekOrWeekend, week_backnum)[0]
        """增广训练集"""
        if Augmented:
            train_xs, train_y = augmentTrainX(augument_time,
                                              [train_x, train_x2], train_y)

        train_x = train_xs[0]
        train_x2 = train_xs[1]

        if needSaveFeature:
            featureAndLabel = np.concatenate((train_x, train_y), axis=1)
            flDF = pd.DataFrame(featureAndLabel)
            if featureSavePath is None:
                if trainAsTest:
                    featureSavePath = Parameter.projectPath + "lzj/train_feature/%dCatelevel_%sCatename_%dfeatures_%dSameday_%dDay_%dLast" % (
                        cate_level, cate_name, flDF.shape[1] - 1,
                        sameday_backNum, day_back_num, last_N_days)
                else:
                    featureSavePath = Parameter.projectPath + "lzj/feature/%dCatelevel_%sCatename_%dfeatures_%dSameday_%dDay_%dLast" % (
                        cate_level, cate_name, flDF.shape[1] - 1,
                        sameday_backNum, day_back_num, last_N_days)
            if Augmented:
                featureSavePath += ("_Augment%d" % augument_time)

            featureSavePath += ".csv"
            print "save feature in :", featureSavePath
            flDF.to_csv(featureSavePath)
    else:  # a featurePath file was provided
        if trainAsTest:
            path = Parameter.projectPath + "lzj/train_feature/" + featurePath
        else:
            path = Parameter.projectPath + "lzj/feature/" + featurePath
        flDF = pd.read_csv(path, index_col=0)
        train_x = flDF.values[:, :-1]
        train_y = flDF.values[:, -1:]
        # print train_x
        # print train_y
    '''Normalize features and labels to [0, 1]'''
    x_scaler = MinMaxScaler().fit(train_x)
    x2_scaler = MinMaxScaler().fit(train_x2)
    y_scaler = MinMaxScaler().fit(train_y)
    train_x = x_scaler.transform(train_x)
    train_x2 = x2_scaler.transform(train_x2)
    train_y = y_scaler.transform(train_y)
    '''normalization done'''
    """reshape to (samples, features, 1) for the CNN inputs"""
    train_x = np.reshape(train_x, (train_x.shape[0], train_x.shape[1], 1))
    train_x2 = np.reshape(train_x2, (train_x2.shape[0], train_x2.shape[1], 1))

    if model_path is None:
        if needCV:
            '''gridsearchCV'''
            # nb_epoch=rnn_epoch, batch_size=batch_size, verbose=verbose
            # input_dim, h1_unit = 16, optimizer = "adagrad", init = "normal"):
            input_dim = [(train_x.shape[1], train_x.shape[2])]
            input_dim2 = [(train_x2.shape[1], train_x2.shape[2])]
            h1_acqtivation = ["relu"]
            h1_unit = [8, 12, 16, 20]
            model = KerasRegressor(build_fn=create_model_MultiCNN,
                                   verbose=verbose)
            batch_size = [3, 5, 7, 10]
            epochs = [10, 15, 20, 25, 30]
            param_grid = dict(batch_size=batch_size,
                              nb_epoch=epochs,
                              h1_unit=h1_unit,
                              input_shape1=input_dim,
                              input_shape2=input_dim2)
            grid = GridSearchCV(estimator=model,
                                param_grid=param_grid,
                                n_jobs=-1,
                                scoring="neg_mean_squared_error")
            grid.refit = False
            grid_result = grid.fit(train_x, train_y)  # note: only the first input is passed here, so the grid search does not exercise the multi-input model

            print("Best: %f using %s" %
                  (grid_result.best_score_, grid_result.best_params_))
            for params, mean_score, scores in grid_result.grid_scores_:
                print("%f (%f) with: %r" %
                      (scores.mean(), scores.std(), params))

        if not needCV:
            input_dim = (train_x.shape[1], train_x.shape[2])
            input_dim2 = (train_x2.shape[1], train_x2.shape[2])
            # h1_unit = 16 + (time) * 4
            h1_unit = 24
            h1_activation = "relu"
            batch_size = 3
            epochs = 40

        else:
            input_dim = (train_x.shape[1], train_x.shape[2])
            input_dim2 = (train_x2.shape[1], train_x2.shape[2])
            epochs = grid_result.best_params_['nb_epoch']
            batch_size = grid_result.best_params_['batch_size']
            h1_unit = grid_result.best_params_["h1_unit"]
            h1_activation = "sigmoid"

        print train_x.shape
        print train_x2.shape
        print train_y.shape

        early_stopping = EarlyStopping(monitor='val_loss', patience=2)
        best_model = create_model_MultiCNN(input_shape1=input_dim,
                                           input_shape2=input_dim2,
                                           h1_unit=h1_unit,
                                           h1_activation=h1_activation)
        hist = best_model.fit([train_x, train_x2],
                              train_y,
                              verbose=verbose,
                              batch_size=batch_size,
                              nb_epoch=epochs,
                              validation_split=0.1,
                              callbacks=[early_stopping])
        print hist.history

        # save the model
        if trainAsTest:
            model_save_path = Parameter.projectPath+"lzj/train_model/" + \
                              "%dlast_%ds_%dd_%df_%d_%s_%d_%d_%d_%s.json" \
                              % (last_N_days,sameday_backNum, day_back_num, train_x.shape[1], cate_level, cate_name
                                 , epochs, batch_size, h1_unit, h1_activation)
            saveModel(model_save_path, best_model)
        else:
            model_save_path = Parameter.projectPath+"lzj/model/" + \
                              "%dlast_%ds_%dd_%df_%d_%s_%d_%d_%d_%s.json" \
                              % (last_N_days,sameday_backNum, day_back_num, train_x.shape[1], cate_level, cate_name
                                 ,  epochs, batch_size, h1_unit, h1_activation)
            saveModel(model_save_path, best_model)
    else:  # model_path is not None: load a saved model
        print "get model from " + model_path
        best_model = getModel(model_path)

    format = "%Y-%m-%d"
    if trainAsTest:
        startTime = datetime.datetime.strptime("2016-10-18", format)
    else:
        startTime = datetime.datetime.strptime("2016-11-1", format)
    timedelta = datetime.timedelta(1)
    '''Predict each shop'''
    model = best_model
    preficts_all = None
    real_all = None

    for j in shopids:
        if ignore_predict:
            if j in ignore_shopids:
                print "ignore predict", j
                # ignores += 1
                continue
        print "predict:", j
        preficts = []
        part_data = all_data[all_data.shopid == j]
        last_14_real_y = None

        if trainAsTest:  # train on the earlier part, keep the last 14 days as reals
            last_14_real_y = part_data[len(part_data) - 14:]["count"].values
            part_data = part_data[0:len(part_data) - 14]
        '''Roll the prediction forward 14 days'''
        for i in range(14):
            currentTime = startTime + timedelta * i
            strftime = currentTime.strftime(format)
            # index = getWeekday(strftime) - 1
            # part_count = part_counts[index]
            # use the same weekday's values from the previous {sameday_backNum} weeks as features
            part_data = part_data.append(
                {
                    "count": 0,
                    "shopid": j,
                    "time": strftime,
                    "weekday": getWeekday(strftime)
                },
                ignore_index=True)
            x = None
            x2 = None
            x = getOneShopTrainXY(cate1_hot_encoder, cate1_label_encoder,
                                  day_back_num, hot_encoder, other_features,
                                  part_data, 0, shop_features, shop_info, j,
                                  part_data.shape[0] - 1, x, weather,
                                  weekOrWeekend, week_backnum)[0]
            x2 = getOneShopTrainXY(cate1_hot_encoder, cate1_label_encoder, 0,
                                   hot_encoder, other_features, part_data,
                                   sameday_backNum, shop_features, shop_info,
                                   j, part_data.shape[0] - 1, x2, weather,
                                   weekOrWeekend, week_backnum)[0]

            x = x_scaler.transform(x)
            x2 = x2_scaler.transform(x2)
            """CNN"""
            x = np.reshape(x, (x.shape[0], x.shape[1], 1))
            x2 = np.reshape(x2, (x2.shape[0], x2.shape[1], 1))
            predict = model.predict([x, x2])
            '''Map the prediction back to the original count scale'''
            if predict.ndim == 2:
                predict = y_scaler.inverse_transform(predict)[0][0]
            elif predict.ndim == 1:
                predict = y_scaler.inverse_transform(predict)[0]
            '''inverse transform done'''
            # print predict
            if (predict <= 0):
                predict == 0
            if addNoiseInResult:
                predict = predict * (
                    1 + 0.05 * abs(np.random.normal(scale=(i + 1) * 0.05)))
            preficts.append(predict)
            part_data.set_value(part_data.shape[0] - 1, "count", predict)

        preficts = (removeNegetive(toInt(np.array(preficts)))).astype(int)
        if preficts_all is None:
            preficts_all = preficts
        else:
            preficts_all = np.insert(preficts_all,
                                     preficts_all.shape[0],
                                     preficts,
                                     axis=0)

        if trainAsTest:
            last_14_real_y = (removeNegetive(toInt(
                np.array(last_14_real_y)))).astype(int)
            if real_all is None:
                real_all = last_14_real_y
            else:
                real_all = np.insert(real_all,
                                     real_all.shape[0],
                                     last_14_real_y,
                                     axis=0)
                # print preficts,last_14_real_y
            print str(j) + ',score:', scoreoneshop(preficts, last_14_real_y)

    # preficts = np.array(preficts)
    shopids = shopids.tolist()
    if ignore_predict:
        for remove_id in ignore_shopids:
            try:
                shopids.remove(remove_id)
            except ValueError:
                pass

    preficts_all = preficts_all.reshape((len(shopids), 14))
    if trainAsTest:
        real_all = real_all.reshape((len(shopids), 14))
        preficts_all = np.concatenate((preficts_all, real_all), axis=1)

    preficts_all = np.insert(preficts_all, 0, shopids, axis=1)
    if saveFilePath is not None:
        if model_path is None:
            path = saveFilePath + "%dLast_%ds_%dd_%df_%d_%s_%d_%d_%d_%s_%dshops" \
                                  % (last_N_days,sameday_backNum, day_back_num, train_x.shape[1], cate_level, cate_name
                                     ,  epochs, batch_size, h1_unit, h1_activation,len(shopids))
        else:
            import re
            r = re.compile(
                r"""/(\d+)last_(\d+)s_(\d+)d_(\d+)f_(\d+)_(\S+)_(\d+)_(\d+)_(\d+)_(\w+).json"""
            )
            m = r.search(model_path)
            path = saveFilePath + "%dLast_%ds_%dd_%df_%d_%s_%d_%d_%d_%s_%dshops" \
                                  % (int(m.group(1)),int(m.group(2)), int(m.group(3)), int(m.group(4)), int(m.group(5)), m.group(6)
                                     ,  int(m.group(7)), int(m.group(8)), int(m.group(9)), m.group(10),len(shopids))
        if Augmented:
            path += "_augmented"
        if addNoiseInResult:
            path += "_addNoiseInResult"
        path = path + "_%dtime" % time
        if trainAsTest:
            path = path + "_train"
        path += ".csv"

        print "save in :", path
        np.savetxt(path, preficts_all, fmt="%d", delimiter=",")
    return preficts_all
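
augmentTrainX is a repo helper whose internals are not shown here; judging from the inline augmentation in predictAllShop_LC_HPS (augument_relu2 scales the label by a fixed 1.05), a plausible minimal equivalent looks like this (assumed behaviour, not the repo's implementation):

import numpy as np

def augment_sketch(train_x, train_y, augment_time=1, factor=1.05):
    # Append augment_time copies of every sample with only the label scaled.
    xs, ys = [train_x], [train_y]
    for _ in range(augment_time):
        xs.append(train_x.copy())
        ys.append(train_y * factor)
    return np.concatenate(xs, axis=0), np.concatenate(ys, axis=0)

x = np.ones((3, 2))
y = np.arange(3, dtype=float).reshape((3, 1))
ax, ay = augment_sketch(x, y)
assert ax.shape == (6, 2) and ay.shape == (6, 1)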
Example #6
def predictAllShop_ANN3_together(all_data, trainAsTest=False, saveFilePath = None, featurePath = None):
    """
    使用所有商家所有数据训练,预测所有商店
    :param trainAsTest: 是否使用训练集后14天作为测试集
    :param model: 某个模型
    :param featurePath
    :return:
    """
    shop_need_to_predict = 2000
    h1_activation = "relu"
    rnn_epoch = 20
    verbose = 2
    h_unit = 16
    batch_size = 5
    shop_info = pd.read_csv(Parameter.shopinfopath, names=["shopid","cityname","locationid","perpay","score","comment","level","cate1","cate2","cate3"])

    sameday_backNum = 7
    day_back_num = 14
    week_backnum = 3
    other_features = [statistic_functon_mean,statistic_functon_median]
    other_features = []
    '''One-hot encode cate1'''
    cate = shop_info['cate1'].tolist()
    cate_dup = set(cate)
    cates = []
    for i in range(len(cate_dup)):
        cates.append([i])
    hot_encoder = OneHotEncoder().fit(cates)
    dicts = dict(zip(cate_dup, range(len(cate_dup))))
    cate_num = []
    for c in cate:
        cate_num.append([dicts[c]])
    '''cate1 one-hot done'''
    if featurePath is None:

        all_x = None
        all_y = None
        for shopid in range(1, 1 + shop_need_to_predict, 1):
            print "get " , shopid, " train"
            part_data = all_data[all_data.shopid == shopid]
            last_14_real_y = None
            # hold out the last 14 days for evaluation
            if trainAsTest:  # train on the earlier part
                last_14_real_y = part_data[len(part_data) - 14:]["count"].values
                part_data = part_data[0:len(part_data) - 14]
            # print last_14_real_y
            skipNum = part_data.shape[0] - 168
            if skipNum < 0:
                skipNum = 0

            sameday = extractBackSameday(part_data, sameday_backNum, skipNum, nan_method_sameday_mean)
            day = extractBackDay(part_data,day_back_num,skipNum,nan_method_sameday_mean)
            count = extractCount(part_data, skipNum)
            train_x = getOneWeekdayFomExtractedData(sameday)
            train_x = np.concatenate((train_x,getOneWeekdayFomExtractedData(day)),axis=1)
            train_y = getOneWeekdayFomExtractedData(count)
            for feature in other_features:
                value = getOneWeekdayFomExtractedData(extractBackWeekValue(part_data, week_backnum, skipNum, nan_method_sameday_mean, feature))
                train_x = np.append(train_x, value, axis=1)

            '''Append shop info features'''
            # print train_x,train_x.shape
            index = shopid - 1
            oneshopinfo = shop_info.ix[index]
            shop_perpay = oneshopinfo['perpay'] if not pd.isnull(oneshopinfo['perpay']) else 0
            shop_score = oneshopinfo['score'] if not pd.isnull(oneshopinfo['score']) else 0
            shop_comment = oneshopinfo['comment'] if not pd.isnull(oneshopinfo['comment']) else 0
            shop_level = oneshopinfo['level'] if not pd.isnull(oneshopinfo['level']) else 0
            shop_cate1 = oneshopinfo['cate1']
            import warnings
            with warnings.catch_warnings():
                warnings.simplefilter("ignore",category=DeprecationWarning)
                shop_cate1_encoder = hot_encoder.transform([dicts[shop_cate1]]).toarray()
            train_x = np.insert(train_x,train_x.shape[1],shop_perpay,axis=1)
            train_x = np.insert(train_x,train_x.shape[1],shop_score,axis=1)
            train_x = np.insert(train_x,train_x.shape[1],shop_comment,axis=1)
            train_x = np.insert(train_x,train_x.shape[1],shop_level,axis=1)
            for i in range(shop_cate1_encoder.shape[1]):
                train_x = np.insert(train_x,train_x.shape[1],shop_cate1_encoder[0][i],axis=1)
            '''shop info appended'''

            if all_x is None:
                all_x = train_x
                all_y = train_y
            else:
                all_x = np.insert(all_x,all_x.shape[0],train_x,axis=0)
                all_y = np.insert(all_y,all_y.shape[0],train_y,axis=0)

                # '''add the weekday as a feature'''
                # extract_weekday = getOneWeekdayFomExtractedData(extractWeekday(part_data, skipNum))
                # train_x = np.append(train_x, extract_weekday, axis=1)
                # ''''''

                # train_x = train_x.reshape((train_x.shape[0],
                #                            train_x.shape[1], 1))
                # print model.get_weights()
                # part_counts = []
                # for i in range(7):
                #     weekday = i + 1
                #     part_count = getOneWeekdayFomExtractedData(count, weekday)
                #     part_counts.append(part_count)


        train_x = all_x
        train_y = all_y
        featureAndLabel = np.concatenate((train_x,train_y),axis=1)
        flDF = pd.DataFrame(featureAndLabel,
                            columns=["sameday%d" % i for i in range(1, 8)] +
                                    ["day%d" % i for i in range(1, 15)] +
                                    ["perpay", "score", "comment", "level"] +
                                    ["cate1_%d" % i for i in range(1, 7)] +
                                    ["label"])
        if trainAsTest:
            flDF.to_csv("train_feature/ann1_168_%d.csv" % train_x.shape[1])
        else:
            flDF.to_csv("feature/ann1.csv")
    else:  # a featurePath file was provided
        flDF = pd.read_csv(featurePath,index_col=0)
        train_x = flDF.values[:,:-1]
        train_y = flDF.values[:,-1:]
        # print train_x
        # print train_y

    '''Normalize features and labels to [0, 1]'''
    x_scaler = MinMaxScaler().fit(train_x)
    y_scaler = MinMaxScaler().fit(train_y)
    train_x = x_scaler.transform(train_x)
    train_y = y_scaler.transform(train_y)
    '''normalization done'''


    '''Build the network'''
    model = Sequential()
    model.add(Dense(h_unit, input_dim=train_x.shape[1], activation=h1_activation))  # alternative activation: sigmoid
    model.add(Dense(1, activation='linear'))
    sgd = SGD(0.01)
    model.compile(loss="mse", optimizer=sgd)
    # print model.summary()
    # print getrefcount(model)
    # print model.summary()
    model.fit(train_x, train_y, nb_epoch=rnn_epoch, batch_size=batch_size, verbose=verbose)

    format = "%Y-%m-%d"
    if trainAsTest:
        startTime = datetime.datetime.strptime("2016-10-18", format)
    else:
        startTime = datetime.datetime.strptime("2016-11-1", format)
    timedelta = datetime.timedelta(1)


    '''Predict all shops'''
    preficts_all = None
    real_all = None
    for j in range(1, 1 + shop_need_to_predict, 1):
        print "predict:", j
        preficts = []
        part_data = all_data[all_data.shopid == j]
        last_14_real_y = None

        if trainAsTest:  # train on the earlier part, keep the last 14 days as reals
            last_14_real_y = part_data[len(part_data) - 14:]["count"].values
            part_data = part_data[0:len(part_data) - 14]

        '''Roll the prediction forward 14 days'''
        for i in range(14):
            currentTime = startTime + timedelta * i
            strftime = currentTime.strftime(format)
            # index = getWeekday(strftime) - 1
            # part_count = part_counts[index]
            # use the same weekday's values from the previous {sameday_backNum} weeks as features
            part_data = part_data.append({"count":0,"shopid":j,"time":strftime,"weekday":getWeekday(strftime)},ignore_index=True)
            x = getOneWeekdayFomExtractedData(extractBackSameday(part_data,sameday_backNum,part_data.shape[0] - 1, nan_method_sameday_mean))
            x = np.concatenate((x,getOneWeekdayFomExtractedData(extractBackDay(part_data,day_back_num,part_data.shape[0]-1,nan_method_sameday_mean))),axis=1)
            for feature in other_features:
                x_value = getOneWeekdayFomExtractedData(extractBackWeekValue(part_data, week_backnum, part_data.shape[0]-1, nan_method_sameday_mean, feature))
                x = np.append(x, x_value, axis=1)
            # '''add the weekday as a feature'''
            # x = np.append(x, getOneWeekdayFomExtractedData(extractWeekday(part_data, part_data.shape[0]-1)), axis=1)
            # ''''''
            '''Append shop info features'''
            index = j - 1
            oneshopinfo = shop_info.ix[index]
            shop_perpay = oneshopinfo['perpay'] if not pd.isnull(oneshopinfo['perpay']) else 0
            shop_score = oneshopinfo['score'] if not pd.isnull(oneshopinfo['score']) else 0
            shop_comment = oneshopinfo['comment'] if not pd.isnull(oneshopinfo['comment']) else 0
            shop_level = oneshopinfo['level'] if not pd.isnull(oneshopinfo['level']) else 0
            shop_cate1 = oneshopinfo['cate1']
            import warnings
            with warnings.catch_warnings():
                warnings.simplefilter("ignore",category=DeprecationWarning)
                shop_cate1_encoder = hot_encoder.transform([dicts[shop_cate1]]).toarray()
            x = np.insert(x,x.shape[1],shop_perpay,axis=1)
            x = np.insert(x,x.shape[1],shop_score,axis=1)
            x = np.insert(x,x.shape[1],shop_comment,axis=1)
            x = np.insert(x,x.shape[1],shop_level,axis=1)
            for i in range(shop_cate1_encoder.shape[1]):
                x = np.insert(x,x.shape[1],shop_cate1_encoder[0][i],axis=1)
            '''shop info appended'''

            x = x_scaler.transform(x)
            # for j in range(sameday_backNum):
            #     x.append(train_y[len(train_y) - (j+1)*7][0])
            # x = np.array(x).reshape((1, sameday_backNum))

            # print x
            # x = x.reshape(1, sameday_backNum, 1)
            predict = model.predict(x)
            if predict.ndim == 2:
                predict = y_scaler.inverse_transform(predict)[0][0]
            elif predict.ndim == 1:
                predict = y_scaler.inverse_transform(predict)[0]

            if(predict <= 0):
                predict == 1
            preficts.append(predict)
            part_data.set_value(part_data.shape[0]-1, "count", predict)

        preficts = (removeNegetive(toInt(np.array(preficts)))).astype(int)
        if preficts_all is None:
            preficts_all = preficts
        else:
            preficts_all = np.insert(preficts_all,preficts_all.shape[0],preficts,axis=0)

        if trainAsTest:
            last_14_real_y = (removeNegetive(toInt(np.array(last_14_real_y)))).astype(int)
            if real_all is None:
                real_all = last_14_real_y
            else:
                real_all = np.insert(real_all,real_all.shape[0],last_14_real_y,axis=0)
                # print preficts,last_14_real_y
            print str(j)+',score:', scoreoneshop(preficts, last_14_real_y)

    # preficts = np.array(preficts)
    preficts_all = preficts_all.reshape((shop_need_to_predict,14))
    if trainAsTest:
        real_all = real_all.reshape((shop_need_to_predict,14))
        preficts_all = np.concatenate((preficts_all,real_all), axis=1)
    preficts_all = np.insert(preficts_all, 0, range(1, shop_need_to_predict+1, 1), axis=1)
    if saveFilePath is not None:
        file = saveFilePath + ("_%d_%d_%s.csv" % (rnn_epoch, h_unit, h1_activation))
        print "save in " + file
        np.savetxt(file,preficts_all,fmt="%d",delimiter=",")
    return preficts_all
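
The cate1 one-hot pipeline above (fit the encoder on single-column label ids, look the shop's category up in dicts, transform one row per shop) in isolation; the category names are invented, and the 2-D [[...]] transform input is used instead of the 1-D form whose DeprecationWarning the original suppresses:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

cate = ["food", "retail", "food", "service"]
cate_dup = set(cate)
dicts = dict(zip(cate_dup, range(len(cate_dup))))

hot_encoder = OneHotEncoder().fit([[i] for i in range(len(cate_dup))])
one_shop = hot_encoder.transform([[dicts["food"]]]).toarray()
assert one_shop.shape == (1, len(cate_dup))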
Example #7
def predictAllShop_ANN2_HPS(all_data,
                            trainAsTest=False,
                            saveFilePath=None,
                            featurePath=None,
                            cate_level=0,
                            cate_name=None,
                            featureSavePath=None,
                            needSaveFeature=False,
                            ignore_shopids=[],
                            needCV=False,
                            model_path=None):
    """
    通过gridsearch找超参数
    :param trainAsTest: 是否使用训练集后14天作为测试集
    :param model: 某个模型
    :param saveFilePath
    :param featurePath:
    :param cate_level:
    :param cate_name:
    :param featureSavePath:
    :param needSaveFeature:
    :param ignore_shopids:
    :param create_model_function:
    :param needCV
    :return:
    """

    verbose = 2
    last_N_days = 70
    # number of shops skipped so far
    ignores = 0
    shopids = None
    shop_need_to_predict = 2000
    if cate_level == 0:
        shopids = np.arange(1, 1 + shop_need_to_predict, 1)
    else:
        shopids = Parameter.extractShopValueByCate(cate_level, cate_name)
    shop_info = pd.read_csv(Parameter.shopinfopath,
                            names=[
                                "shopid", "cityname", "locationid", "perpay",
                                "score", "comment", "level", "cate1", "cate2",
                                "cate3"
                            ])

    weather = True
    weekOrWeekend = True
    day_back_num = 21
    sameday_backNum = 7
    week_backnum = 3
    other_features = [statistic_functon_mean, statistic_functon_median]
    other_features = []
    shop_features = ["perpay", "comment", "score", "level"]
    shop_features = []
    # one-hot encoder for the workday/weekend flag
    hot_encoder = onehot([[1], [0]])
    # one-hot encoder for cate1
    cate1_list = np.unique(shop_info['cate1'])
    cate1_label_encoder = labelEncoder(cate1_list)
    cate1_list2 = cate1_label_encoder.transform(cate1_list).reshape((-1, 1))
    cate1_hot_encoder = onehot(cate1_list2)

    if featurePath is None:
        all_x = None
        all_y = None
        for shopid in shopids:
            if shopid in ignore_shopids:
                print "ignore get train", shopid
                ignores += 1
                continue
            print "get ", shopid, " train"
            part_data = all_data[all_data.shopid == shopid]
            last_14_real_y = None
            # hold out the last 14 days when they serve as the test set
            if trainAsTest:  # train on the earlier part only
                last_14_real_y = part_data[len(part_data) -
                                           14:]["count"].values
                part_data = part_data[0:len(part_data) - 14]
            # print last_14_real_y
            '''decide how many leading days of data to skip'''
            skipNum = part_data.shape[0] - last_N_days
            if skipNum < 0:
                skipNum = 0
            train_x = None
            '''extract features'''
            if sameday_backNum != 0:  # same weekday in previous weeks
                sameday = extractBackSameday(part_data, sameday_backNum,
                                             skipNum, nan_method_sameday_mean)
                train_x = getOneWeekdayFomExtractedData(sameday)
            if day_back_num != 0:  # previous days
                if train_x is not None:
                    train_x = np.concatenate(
                        (train_x,
                         getOneWeekdayFomExtractedData(
                             extractBackDay(part_data, day_back_num, skipNum,
                                            nan_method_sameday_mean))),
                        axis=1)
                else:
                    train_x = getOneWeekdayFomExtractedData(
                        extractBackDay(part_data, day_back_num, skipNum,
                                       nan_method_sameday_mean))
            if weekOrWeekend:  # workday/weekend flag
                ws = getOneWeekdayFomExtractedData(
                    extractWorkOrWeekend(part_data, skipNum))
                train_x = np.concatenate((train_x, hot_encoder.transform(ws)),
                                         axis=1)

            count = extractCount(part_data, skipNum)
            train_y = getOneWeekdayFomExtractedData(count)
            for feature in other_features:
                value = getOneWeekdayFomExtractedData(
                    extractBackWeekValue(part_data, week_backnum, skipNum,
                                         nan_method_sameday_mean, feature))
                train_x = np.append(train_x, value, axis=1)
            '''add shop info features'''
            # print train_x,train_x.shape
            index = shopid - 1
            oneshopinfo = shop_info.ix[index]
            shop_city = oneshopinfo['cityname']
            shop_perpay = oneshopinfo['perpay'] if not pd.isnull(
                oneshopinfo['perpay']) else 0
            shop_score = oneshopinfo['score'] if not pd.isnull(
                oneshopinfo['score']) else 0
            shop_comment = oneshopinfo['comment'] if not pd.isnull(
                oneshopinfo['comment']) else 0
            shop_level = oneshopinfo['level'] if not pd.isnull(
                oneshopinfo['level']) else 0
            shop_cate1 = oneshopinfo['cate1']
            import warnings
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=DeprecationWarning)
                shop_cate1_encoder = cate1_hot_encoder.transform(
                    cate1_label_encoder.transform([shop_cate1]))
            if "perpay" in shop_features:
                train_x = np.insert(train_x,
                                    train_x.shape[1],
                                    shop_perpay,
                                    axis=1)
            if "score" in shop_features:
                train_x = np.insert(train_x,
                                    train_x.shape[1],
                                    shop_score,
                                    axis=1)
            if "comment" in shop_features:
                train_x = np.insert(train_x,
                                    train_x.shape[1],
                                    shop_comment,
                                    axis=1)
            if "level" in shop_features:
                train_x = np.insert(train_x,
                                    train_x.shape[1],
                                    shop_level,
                                    axis=1)
            if "cate1" in shop_features:
                for i in range(shop_cate1_encoder.shape[1]):
                    train_x = np.insert(train_x,
                                        train_x.shape[1],
                                        shop_cate1_encoder[0][i],
                                        axis=1)
            '''done adding shop info'''
            '''weather features'''
            if weather:
                weathers = getOneWeekdayFomExtractedData(
                    extractWeatherInfo(part_data, skipNum, shop_city))
                train_x = np.append(train_x, weathers, axis=1)
            '''weather features done'''

            if all_x is None:
                all_x = train_x
                all_y = train_y
            else:
                all_x = np.insert(all_x, all_x.shape[0], train_x, axis=0)
                all_y = np.insert(all_y, all_y.shape[0], train_y, axis=0)

                # '''add weekday feature'''
                # extract_weekday = getOneWeekdayFomExtractedData(extractWeekday(part_data, skipNum))
                # train_x = np.append(train_x, extract_weekday, axis=1)
                # ''''''

                # train_x = train_x.reshape((train_x.shape[0],
                #                            train_x.shape[1], 1))
                # print model.get_weights()
                # part_counts = []
                # for i in range(7):
                #     weekday = i + 1
                #     part_count = getOneWeekdayFomExtractedData(count, weekday)
                #     part_counts.append(part_count)

        train_x = all_x
        train_y = all_y

        if needSaveFeature:
            featureAndLabel = np.concatenate((train_x, train_y), axis=1)
            flDF = pd.DataFrame(featureAndLabel)
            if featureSavePath is None:
                if trainAsTest:
                    featureSavePath = Parameter.projectPath + "lzj/train_feature/%dCatelevel_%sCatename_%dfeatures_%dSameday_%dDay_%dLast.csv" % (
                        cate_level, cate_name, flDF.shape[1] - 1,
                        sameday_backNum, day_back_num, last_N_days)
                else:
                    featureSavePath = Parameter.projectPath + "lzj/feature/%dCatelevel_%sCatename_%dfeatures_%dSameday_%dDay_%dLast.csv" % (
                        cate_level, cate_name, flDF.shape[1] - 1,
                        sameday_backNum, day_back_num, last_N_days)
            flDF.to_csv(featureSavePath)
    else:  # a featurePath file was provided
        if trainAsTest:
            path = Parameter.projectPath + "lzj/train_feature/" + featurePath
        else:
            path = Parameter.projectPath + "lzj/feature/" + featurePath
        flDF = pd.read_csv(path, index_col=0)
        train_x = flDF.values[:, :-1]
        train_y = flDF.values[:, -1:]
        # print train_x
        # print train_y
    '''standardize features and labels'''
    x_scaler = MinMaxScaler().fit(train_x)
    y_scaler = MinMaxScaler().fit(train_y)
    train_x = x_scaler.transform(train_x)
    train_y = y_scaler.transform(train_y)
    '''standardization done'''

    if model_path is None:
        if needCV:
            '''gridsearchCV'''
            # nb_epoch=rnn_epoch, batch_size=batch_size, verbose=verbose
            # input_dim, h1_unit = 16, optimizer = "adagrad", init = "normal"):
            input_dim = [train_x.shape[1]]
            h1_activation = ["relu"]
            h1_unit = [8, 12, 16, 20]
            h2_unit = [2, 4, 6, 8, 10]
            model = KerasRegressor(build_fn=create_model2, verbose=verbose)
            batch_size = [3, 5, 7, 10]
            epochs = [10, 15, 20, 25, 30, 40]
            param_grid = dict(batch_size=batch_size,
                              nb_epoch=epochs,
                              h1_unit=h1_unit,
                              h2_unit=h2_unit,
                              input_dim=input_dim)
            grid = GridSearchCV(estimator=model,
                                param_grid=param_grid,
                                n_jobs=-1,
                                scoring="neg_mean_squared_error")
            grid.refit = False
            grid_result = grid.fit(train_x, train_y)

            print("Best: %f using %s" %
                  (grid_result.best_score_, grid_result.best_params_))
            for params, mean_score, scores in grid_result.grid_scores_:
                print("%f (%f) with: %r" %
                      (scores.mean(), scores.std(), params))

        if not needCV:
            input_dim = train_x.shape[1]
            h1_activation = "relu"
            h1_unit = 12
            h2_unit = 8
            batch_size = 3
            epochs = 80

        else:
            input_dim = train_x.shape[1]
            epochs = grid_result.best_params_['nb_epoch']
            batch_size = grid_result.best_params_['batch_size']
            h1_unit = grid_result.best_params_["h1_unit"]
            h2_unit = grid_result.best_params_["h2_unit"]
            h1_activation = "relu"

        early_stopping = EarlyStopping(monitor='val_loss', patience=2)
        best_model = create_model2(input_dim=input_dim,
                                   h1_unit=h1_unit,
                                   h2_unit=h2_unit)
        hist = best_model.fit(train_x,
                              train_y,
                              verbose=verbose,
                              batch_size=batch_size,
                              nb_epoch=epochs,
                              validation_split=0.1,
                              callbacks=[early_stopping])
        print hist.history

        # save the model
        if trainAsTest:
            model_save_path = Parameter.projectPath+"lzj/train_model/" + \
                              "%dlast_%ds_%dd_%df_%d_%s_%d_%d_%d_%s_%d_%s.json" \
                              % (last_N_days,sameday_backNum, day_back_num, train_x.shape[1], cate_level, cate_name
                                 , epochs, batch_size, h1_unit, h1_activation,h2_unit,h1_activation)
        else:
            model_save_path = Parameter.projectPath+"lzj/model/" + \
                              "%dlast_%ds_%dd_%df_%d_%s_%d_%d_%d_%s_%d_%s.json" \
                              % (last_N_days,sameday_backNum, day_back_num, train_x.shape[1], cate_level, cate_name
                                 ,  epochs, batch_size, h1_unit, h1_activation,h2_unit,h1_activation)
        print "model save in :", model_save_path
        saveModel(model_save_path, best_model)
    else:  #model_path is not none
        best_model = getModel(model_path)

    format = "%Y-%m-%d"
    if trainAsTest:
        startTime = datetime.datetime.strptime("2016-10-18", format)
    else:
        startTime = datetime.datetime.strptime("2016-11-1", format)
    timedelta = datetime.timedelta(1)
    '''predict every shop'''
    model = best_model
    preficts_all = None
    real_all = None
    for j in shopids:
        if j in ignore_shopids:
            print "ignore predict", j
            continue
        print "predict:", j
        preficts = []
        part_data = all_data[all_data.shopid == j]
        last_14_real_y = None

        if trainAsTest:  # the last 14 days serve as the test set; train on the earlier part
            last_14_real_y = part_data[len(part_data) - 14:]["count"].values
            part_data = part_data[0:len(part_data) - 14]
        '''predict 14 days'''
        for i in range(14):
            currentTime = startTime + timedelta * i
            strftime = currentTime.strftime(format)
            # index = getWeekday(strftime) - 1
            # part_count = part_counts[index]
            # use the same weekday's value from each of the previous {sameday_backNum} weeks as features
            part_data = part_data.append(
                {
                    "count": 0,
                    "shopid": j,
                    "time": strftime,
                    "weekday": getWeekday(strftime)
                },
                ignore_index=True)
            x = None
            if sameday_backNum != 0:
                x = getOneWeekdayFomExtractedData(
                    extractBackSameday(part_data, sameday_backNum,
                                       part_data.shape[0] - 1,
                                       nan_method_sameday_mean))
            if day_back_num != 0:
                if x is None:
                    x = getOneWeekdayFomExtractedData(
                        extractBackDay(part_data, day_back_num,
                                       part_data.shape[0] - 1,
                                       nan_method_sameday_mean))
                else:
                    x = np.concatenate(
                        (x,
                         getOneWeekdayFomExtractedData(
                             extractBackDay(part_data, day_back_num,
                                            part_data.shape[0] - 1,
                                            nan_method_sameday_mean))),
                        axis=1)
            if weekOrWeekend:
                x = np.concatenate(
                    (x,
                     hot_encoder.transform(
                         getOneWeekdayFomExtractedData(
                             extractWorkOrWeekend(part_data,
                                                  part_data.shape[0] - 1)))),
                    axis=1)

            for feature in other_features:
                x_value = getOneWeekdayFomExtractedData(
                    extractBackWeekValue(part_data, week_backnum,
                                         part_data.shape[0] - 1,
                                         nan_method_sameday_mean, feature))
                x = np.append(x, x_value, axis=1)
            # '''add weekday feature'''
            # x = np.append(x, getOneWeekdayFomExtractedData(extractWeekday(part_data, part_data.shape[0]-1)), axis=1)
            # ''''''
            '''add shop info features'''
            index = j - 1
            oneshopinfo = shop_info.ix[index]
            shop_city = oneshopinfo["cityname"]
            shop_perpay = oneshopinfo['perpay'] if not pd.isnull(
                oneshopinfo['perpay']) else 0
            shop_score = oneshopinfo['score'] if not pd.isnull(
                oneshopinfo['score']) else 0
            shop_comment = oneshopinfo['comment'] if not pd.isnull(
                oneshopinfo['comment']) else 0
            shop_level = oneshopinfo['level'] if not pd.isnull(
                oneshopinfo['level']) else 0
            if "perpay" in shop_features:
                x = np.insert(x, x.shape[1], shop_perpay, axis=1)
            if "score" in shop_features:
                x = np.insert(x, x.shape[1], shop_score, axis=1)
            if "comment" in shop_features:
                x = np.insert(x, x.shape[1], shop_comment, axis=1)
            if "level" in shop_features:
                x = np.insert(x, x.shape[1], shop_level, axis=1)
            shop_cate1 = oneshopinfo['cate1']
            if "cate1" in shop_features:
                shop_cate1_encoder = cate1_hot_encoder.transform(
                    cate1_label_encoder.transform([shop_cate1]).reshape(
                        (-1, 1)))
                for i in range(shop_cate1_encoder.shape[1]):
                    x = np.insert(x,
                                  x.shape[1],
                                  shop_cate1_encoder[0][i],
                                  axis=1)
            '''done adding shop info'''
            '''weather features'''
            if weather:
                weathers = getOneWeekdayFomExtractedData(
                    extractWeatherInfo(part_data, part_data.shape[0] - 1,
                                       shop_city))
                x = np.append(x, weathers, axis=1)
            x = x_scaler.transform(x)
            '''weather features done'''
            # for j in range(sameday_backNum):
            #     x.append(train_y[len(train_y) - (j+1)*7][0])
            # x = np.array(x).reshape((1, sameday_backNum))

            # print x
            # x = x.reshape(1, sameday_backNum, 1)
            predict = model.predict(x)
            if predict.ndim == 2:
                predict = y_scaler.inverse_transform(predict)[0][0]
            elif predict.ndim == 1:
                predict = y_scaler.inverse_transform(predict)[0]
            # print predict
            if predict <= 0:
                predict = 1
            preficts.append(predict)
            part_data.set_value(part_data.shape[0] - 1, "count", predict)

        preficts = (removeNegetive(toInt(np.array(preficts)))).astype(int)
        if preficts_all is None:
            preficts_all = preficts
        else:
            preficts_all = np.insert(preficts_all,
                                     preficts_all.shape[0],
                                     preficts,
                                     axis=0)

        if trainAsTest:
            last_14_real_y = (removeNegetive(toInt(
                np.array(last_14_real_y)))).astype(int)
            if real_all is None:
                real_all = last_14_real_y
            else:
                real_all = np.insert(real_all,
                                     real_all.shape[0],
                                     last_14_real_y,
                                     axis=0)
                # print preficts,last_14_real_y
            print str(j) + ',score:', scoreoneshop(preficts, last_14_real_y)

    # preficts = np.array(preficts)
    preficts_all = preficts_all.reshape((len(shopids) - ignores, 14))
    if trainAsTest:
        real_all = real_all.reshape((len(shopids) - ignores, 14))
        preficts_all = np.concatenate((preficts_all, real_all), axis=1)
    shopids = shopids.tolist()
    for remove in ignore_shopids:
        try:
            shopids.remove(remove)
        except ValueError:  # shop not in this category's id list
            pass
    preficts_all = np.insert(preficts_all, 0, shopids, axis=1)
    if saveFilePath is not None:
        path = saveFilePath + "%dLast_%ds_%dd_%df_%d_%s_%d_%d_%d_%s_%d_%s_%dshops" \
                              % (last_N_days,sameday_backNum, day_back_num, train_x.shape[1], cate_level, cate_name
                                 ,  epochs, batch_size, h1_unit, h1_activation,h2_unit,h1_activation,len(shopids)-ignores)
        if trainAsTest:
            path = path + "_train.csv"
        else:
            path = path + ".csv"

        print "save in :", path
        np.savetxt(path, preficts_all, fmt="%d", delimiter=",")
    return preficts_all
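
# The search above wraps a Keras build function in KerasRegressor so that
# GridSearchCV can sweep batch size, epochs and layer widths together.
# A stripped-down sketch of the same pattern; build_model is a hypothetical
# stand-in for create_model2, and the arrays are assumed to be the scaled
# train_x/train_y built above.
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV

def build_model(input_dim, h1_unit=12, h2_unit=8):
    model = Sequential()
    model.add(Dense(h1_unit, input_dim=input_dim, activation="relu"))
    model.add(Dense(h2_unit, activation="relu"))
    model.add(Dense(1, activation="linear"))
    model.compile(loss="mse", optimizer="adam")
    return model

def grid_search_sketch(train_x, train_y):
    estimator = KerasRegressor(build_fn=build_model, verbose=0)
    param_grid = dict(input_dim=[train_x.shape[1]],
                      h1_unit=[8, 12, 16],
                      h2_unit=[4, 8],
                      batch_size=[3, 5],
                      nb_epoch=[10, 20])  # nb_epoch: Keras 1.x spelling, as in this file
    grid = GridSearchCV(estimator, param_grid, scoring="neg_mean_squared_error")
    return grid.fit(train_x, train_y).best_params_
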
Example #8
def computeScoreByOrigin(filePath,
                         scoreFilePath,
                         threshold=0.06,
                         needRefuseDataPath=None,
                         refuseDataPath=None,
                         refuseDataSavePath=None):
    import Parameter
    origin = pd.read_csv(Parameter.payAfterGrouping_path)
    reals = np.ndarray(0)
    train_predict = np.loadtxt(filePath, dtype=int, delimiter=",")
    shopids = train_predict.take(0, axis=1).tolist()
    for shopid in shopids:
        part_data = origin[origin.shopid == shopid]
        # the shop's last 14 days are the ground truth
        last_14_real_y = part_data[len(part_data) - 14:]["count"].values
        reals = np.append(reals, last_14_real_y)
    reals = reals.reshape((len(shopids), 14))
    predicts = np.ndarray(0)
    good = []
    bad = []
    scores = []
    for k in range(len(shopids)):
        id = shopids[k]
        predict = train_predict[k][1:15]
        predicts = np.append(predicts, predict)
        score_one = scoreoneshop(predict, reals[k])
        print id, ":", score_one
        if (score_one < threshold):
            good.append(id)
        else:
            bad.append(id)
        scores.append(score_one)
    print "last score:", score(predicts,
                               reals.reshape(14 * train_predict.shape[0]))
    print "good", good, len(good)
    print "bad", bad, len(bad)

    if scoreFilePath is not None:
        result = np.reshape(scores, (len(shopids), 1))
        result = np.insert(result, 0, shopids, axis=1)
        np.savetxt(scoreFilePath, (result), delimiter=",", fmt="%.6f")

    if needRefuseDataPath is not None:
        needRefuseData = np.loadtxt(needRefuseDataPath,
                                    dtype=int,
                                    delimiter=",")
        refuseData = np.loadtxt(refuseDataPath, dtype=int, delimiter=",")
        refuse_data = np.zeros((2000, 14))
        for i in range(2000):
            shopid = i + 1
            try:
                index = shopids.index(shopid)
            except ValueError:  # shop absent from the scored file
                index = -1
            if shopid in good:
                value = needRefuseData[index][1:15]
            else:  # bad or unscored shops keep the fallback prediction
                value = refuseData[i][1:15]
            refuse_data[i] = value
        refuse_data = np.insert(refuse_data, 0, range(1, 2001, 1),
                                axis=1).astype(int)
        np.savetxt(refuseDataSavePath, refuse_data, delimiter=",", fmt='%d')
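
# A hypothetical invocation of computeScoreByOrigin; every path below is a
# placeholder, not a file from this project. The input CSV layout is shopid
# followed by 14 predictions (plus 14 reals when written with trainAsTest=True).
computeScoreByOrigin("result/ann_train.csv",
                     scoreFilePath="result/ann_scores.csv",
                     threshold=0.06,
                     needRefuseDataPath="result/ann.csv",
                     refuseDataPath="result/base.csv",
                     refuseDataSavePath="result/refused.csv")
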
Example #9
def merge(paths, base_result_path, final_result_save_path, threshold=-1):
    bad_idss = None  # per-path lists of shops whose validation score exceeds threshold
    if threshold != -1:
        bad_idss = []
        origin = pd.read_csv(Parameter.payAfterGrouping_path)
        for path in paths:
            if "train" not in path:
                path = path.replace(".csv", "_train.csv")
            reals = np.ndarray(0)
            train_predict = np.loadtxt(path, dtype=int, delimiter=",")
            shopids = train_predict.take(0, axis=1).tolist()
            for shopid in shopids:
                part_data = origin[origin.shopid == shopid]
                # the shop's last 14 days are the ground truth
                last_14_real_y = part_data[len(part_data) -
                                           14:]["count"].values
                reals = np.append(reals, last_14_real_y)
            reals = reals.reshape((len(shopids), 14))
            predicts = np.ndarray(0)
            bad = []
            for k in range(len(shopids)):
                id = shopids[k]
                predict = train_predict[k][1:15]
                predicts = np.append(predicts, predict)
                score_one = scoreoneshop(predict, reals[k])
                if (score_one > threshold):
                    bad.append(id)
            bad_idss.append(bad)

    train = False
    if "train" in paths[0]:
        train = True
    datas = []
    shopids = []
    indexes = []
    shops = 0
    insert_index = 0
    for path in paths:
        loadtxt = np.loadtxt(path, delimiter=",", dtype=int)
        datas.append(loadtxt)
        shopids.append(loadtxt.take(0, axis=1))
        indexes.append(0)
        shops += loadtxt.shape[0]
    if base_result_path is not None:
        base_data = np.loadtxt(base_result_path, delimiter=",", dtype=int)

    remove = 0
    if bad_idss is not None:
        for badids in bad_idss:
            remove += len(badids)
    print "bad number:", remove
    if base_result_path is not None:
        final_result = np.ndarray((2000, 15))
    else:
        final_result = np.ndarray((shops - remove, 15))
    for shopid in range(1, 2001, 1):
        insert = False
        for j in range(len(datas)):
            if not insert:
                if bad_idss is not None and shopid in bad_idss[j]:
                    indexes[j] += 1
                    continue
                if shopid in shopids[j]:
                    if not train:
                        final_result[insert_index] = datas[j][indexes[j]]
                    else:
                        final_result[insert_index] = datas[j][indexes[j]][0:15]
                    indexes[j] += 1
                    insert = True
                    insert_index += 1

        if not insert:
            if base_result_path is not None:
                final_result[insert_index] = base_data[shopid - 1]
                insert_index += 1
    np.savetxt(final_result_save_path, final_result, fmt="%d", delimiter=",")
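
# A hypothetical call to merge (all paths are placeholders): predictions from
# the category-specific files override the base result, except for shops whose
# validation score exceeds the threshold, which fall back to the base file.
merge(["result/cate_food_train.csv", "result/cate_super_train.csv"],
      base_result_path="result/base.csv",
      final_result_save_path="result/final.csv",
      threshold=0.06)
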
Example #10
def predictOneShop_ANN_LSTM(shopid, all_data, trainAsTest=False):
    """
    Predict one shop with an ANN: two sub-networks model the short-term and mid-term trends, with their hidden layers merged (one hidden layer each); the short-term branch is an LSTM. Slow, and the results are not always good.
    :param shopid: id of the shop to predict
    :param trainAsTest: whether to use the last 14 days of the training set as the test set
    :return:
    """
    part_data = all_data[all_data.shopid == shopid]
    last_14_real_y = None
    # hold out the last 14 days when they serve as the test set
    if trainAsTest:  # train on the earlier part only
        last_14_real_y = part_data[len(part_data) - 14:]["count"].values
        part_data = part_data[0:len(part_data) - 14]
    # print last_14_real_y
    verbose = 2
    rnn_nb_epoch = 10
    skipNum = 28
    day_backNum = 7
    sameday_backNum = 3
    week_backnum = 3
    learnrate = 0.01
    sameday = extractBackSameday(part_data, sameday_backNum, skipNum, nan_method_sameday_mean)
    day = extractBackDay(part_data,day_backNum,skipNum,nan_method_sameday_mean)
    count = extractCount(part_data, skipNum)
    train_x = getOneWeekdayFomExtractedData(sameday)
    train_x2 = getOneWeekdayFomExtractedData(day)
    train_y = getOneWeekdayFomExtractedData(count)
    other_features = [statistic_functon_mean,statistic_functon_median]
    # other_features = []
    for feature in other_features:
        value = getOneWeekdayFomExtractedData(extractBackWeekValue(part_data, week_backnum, skipNum, nan_method_sameday_mean, feature))
        train_x = np.append(train_x, value, axis=1)

    # '''add weekday feature'''
    # extract_weekday = getOneWeekdayFomExtractedData(extractWeekday(part_data, skipNum))
    # train_x = np.append(train_x, extract_weekday, axis=1)
    # ''''''

    '''standardize features and labels'''
    x_scaler = MinMaxScaler().fit(train_x)
    x2_scaler = MinMaxScaler().fit(train_x2)
    y_scaler = MinMaxScaler().fit(train_y)
    train_x = x_scaler.transform(train_x)
    train_x2 = x2_scaler.transform(train_x2)
    train_x2 = train_x2.reshape((train_x2.shape[0],
                                 train_x2.shape[1], 1))
    train_y = y_scaler.transform(train_y)
    '''standardization done'''
    # train_x = train_x.reshape((train_x.shape[0],
    #                            train_x.shape[1], 1))
    model1 = Sequential()
    model2 = Sequential()
    final_model = Sequential()
    # print getrefcount(model1)
    model1.add(Dense(32, input_dim=train_x.shape[1], activation="sigmoid")) #sigmoid
    # model1.add(Dense(1, activation='linear'))


    '''short-term trend'''
    model2.add(LSTM(32, input_shape=(train_x2.shape[1],train_x2.shape[2]), activation="sigmoid"))


    final_model.add(Merge([model1, model2],mode="concat",concat_axis=1))
    final_model.add(Dense(1, activation='linear'))

    #, W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01)
    # print getrefcount(model1)
    # configure the optimizer (apart from the learning rate, keep the other arguments at their defaults)
    rms = RMSprop(lr=0.05)
    # sgd=SGD(lr=0.1, momentum=0.9, nesterov=True)
    final_model.compile(loss="mse", optimizer=rms)
    print final_model.summary()
    # print model1.summary()
    # print getrefcount(model1)
    # print model1.summary()
    final_model.fit([train_x, train_x2], train_y, nb_epoch=rnn_nb_epoch, batch_size=1, verbose=verbose)
    # print model1.get_weights()
    # part_counts = []
    # for i in range(7):
    #     weekday = i + 1
    #     part_count = getOneWeekdayFomExtractedData(count, weekday)
    #     part_counts.append(part_count)

    # print getrefcount(model1)
    format = "%Y-%m-%d"
    if trainAsTest:
        startTime = datetime.datetime.strptime("2016-10-18", format)
    else:
        startTime = datetime.datetime.strptime("2016-11-1", format)
    timedelta = datetime.timedelta(1)
    preficts = []
    for i in range(14):
        currentTime = startTime + timedelta * i
        strftime = currentTime.strftime(format)
        # index = getWeekday(strftime) - 1
        # part_count = part_counts[index]
        # use the same weekday's value from each of the previous {sameday_backNum} weeks as features
        part_data = part_data.append({"count":0, "shopid":shopid, "time":strftime, "weekday":getWeekday(strftime)}, ignore_index=True)
        x = getOneWeekdayFomExtractedData(extractBackSameday(part_data,sameday_backNum,part_data.shape[0] - 1, nan_method_sameday_mean))
        x2 = getOneWeekdayFomExtractedData(extractBackDay(part_data,day_backNum,part_data.shape[0]-1,nan_method_sameday_mean))
        for feature in other_features:
            x_value = getOneWeekdayFomExtractedData(extractBackWeekValue(part_data, week_backnum, part_data.shape[0]-1, nan_method_sameday_mean, feature))
            x = np.append(x, x_value, axis=1)
        # '''add weekday feature'''
        # x = np.append(x, getOneWeekdayFomExtractedData(extractWeekday(part_data, part_data.shape[0]-1)), axis=1)
        # ''''''

        x = x_scaler.transform(x)
        x2 = x2_scaler.transform(x2)
        x2 = x2.reshape((x2.shape[0],x2.shape[1],1))
        # for j in range(sameday_backNum):
        #     x.append(train_y[len(train_y) - (j+1)*7][0])
        # x = np.array(x).reshape((1, sameday_backNum))

        # print x
        # x = x.reshape(1, sameday_backNum, 1)
        predict = final_model.predict([x,x2])
        predict = y_scaler.inverse_transform(predict)[0][0]
        if predict <= 0:
            predict = 1
        preficts.append(predict)
        part_data.set_value(part_data.shape[0]-1, "count", predict)
        # preficts.append(predict)
        # part_counts[index] = np.append(part_count, predict).reshape((part_count.shape[0] + 1, 1))
    preficts = (removeNegetive(toInt(np.array(preficts)))).astype(int)
    # preficts = np.array(preficts)
    if trainAsTest:
        last_14_real_y = (removeNegetive(toInt(np.array(last_14_real_y)))).astype(int)
        # print preficts,last_14_real_y
        print str(shopid)+',score:', scoreoneshop(preficts, last_14_real_y)
    return [preficts, last_14_real_y]
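
# The two-branch model above uses the Keras 1.x Merge layer, which was removed
# in Keras 2. A sketch of the same architecture in the functional API (a
# possible migration, not the original code): a Dense branch for the weekly
# features and an LSTM branch for the recent daily sequence, concatenated into
# one linear output.
from keras.models import Model
from keras.layers import Input, Dense, LSTM, concatenate

def build_two_branch(n_week_features, n_recent_days):
    mid_in = Input(shape=(n_week_features,))        # same-weekday / weekly stats
    mid = Dense(32, activation="sigmoid")(mid_in)
    recent_in = Input(shape=(n_recent_days, 1))     # recent daily counts
    recent = LSTM(32, activation="sigmoid")(recent_in)
    out = Dense(1, activation="linear")(concatenate([mid, recent]))
    model = Model(inputs=[mid_in, recent_in], outputs=out)
    model.compile(loss="mse", optimizer="rmsprop")
    return model
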
Example #11
def predictOneShop_ANN2(shopid, all_data, trainAsTest=False, best_model=None):
    """
    Predict one shop with an ANN: a single network with one hidden layer
    :param shopid: id of the shop to predict
    :param trainAsTest: whether to use the last 14 days of the training set as the test set
    :return:
    """
    # if trainAsTest is False:
    #     raise Exception("trainAsTest should be True, not support False")
    #     return


    skipNum = 28
    sameday_backNum = 3
    week_backnum = 3

    part_data = all_data[all_data.shopid == shopid]
    last_14_real_y = None
    # hold out the last 14 days when they serve as the test set
    if trainAsTest:  # train on the earlier part only
        last_14_real_y = part_data[len(part_data) - 14:]["count"].values
        part_data = part_data[0:len(part_data) - 14]
    # print last_14_real_y
    verbose = 0
    rnn_nb_epoch = 10
    sameday = extractBackSameday(part_data, sameday_backNum, skipNum, nan_method_sameday_mean)
    count = extractCount(part_data, skipNum)
    train_x = getOneWeekdayFomExtractedData(sameday)
    train_y = getOneWeekdayFomExtractedData(count)
    other_features = [statistic_functon_mean,statistic_functon_median]
    for feature in other_features:
        value = getOneWeekdayFomExtractedData(extractBackWeekValue(part_data, week_backnum, skipNum, nan_method_sameday_mean, feature))
        train_x = np.append(train_x, value, axis=1)

    # '''add weekday feature'''
    # extract_weekday = getOneWeekdayFomExtractedData(extractWeekday(part_data, skipNum))
    # train_x = np.append(train_x, extract_weekday, axis=1)
    # ''''''

    '''standardize features and labels'''
    x_scaler = MinMaxScaler().fit(train_x)
    y_scaler = MinMaxScaler().fit(train_y)
    train_x = x_scaler.transform(train_x)
    train_y = y_scaler.transform(train_y)
    '''standardization done'''
    # train_x = train_x.reshape((train_x.shape[0],  train_x.shape[1], 1))
    #
    if best_model is None:
        model = Sequential()
        # print getrefcount(model)
        model.add(Dense(32, input_dim=train_x.shape[1], activation="tanh")) #sigmoid
        # print getrefcount(model)
        model.add(Dense(1, activation='linear'))
        #, W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01)
        # print getrefcount(model)
        # configure the optimizer (apart from the learning rate, keep the other arguments at their defaults)
        # sgd = SGD(lr=0.005)
        model.compile(loss="mse", optimizer="sgd")
        # print model.summary()
        # print getrefcount(model)
        # print model.summary()
        model.fit(train_x, train_y, nb_epoch=rnn_nb_epoch, batch_size=1, verbose=verbose)
    else:
        model = best_model
        # model.fit(train_x, train_y, nb_epoch=rnn_nb_epoch, batch_size=1, verbose=verbose)
    # print model.get_weights()
    # part_counts = []
    # for i in range(7):
    #     weekday = i + 1
    #     part_count = getOneWeekdayFomExtractedData(count, weekday)
    #     part_counts.append(part_count)
    # print getrefcount(model)
    format = "%Y-%m-%d"
    if trainAsTest:
        startTime = datetime.datetime.strptime("2016-10-18", format)
    else:
        startTime = datetime.datetime.strptime("2016-11-1", format)

    timedelta = datetime.timedelta(1)
    preficts = []
    for i in range(14):
        currentTime = startTime + timedelta * i
        strftime = currentTime.strftime(format)
        # index = getWeekday(strftime) - 1
        # part_count = part_counts[index]
        # use the same weekday's value from each of the previous {sameday_backNum} weeks as features
        part_data = part_data.append({"count":0, "shopid":shopid, "time":strftime, "weekday":getWeekday(strftime)},ignore_index=True)
        x = getOneWeekdayFomExtractedData(extractBackSameday(part_data,sameday_backNum,part_data.shape[0] - 1, nan_method_sameday_mean))
        for feature in other_features:
            x_value = getOneWeekdayFomExtractedData(extractBackWeekValue(part_data, week_backnum, part_data.shape[0]-1, nan_method_sameday_mean, feature))
            x = np.append(x, x_value, axis=1)
        # '''add weekday feature'''
        # x = np.append(x, getOneWeekdayFomExtractedData(extractWeekday(part_data, part_data.shape[0]-1)), axis=1)
        # ''''''

        x = x_scaler.transform(x)
        # for j in range(sameday_backNum):
        #     x.append(train_y[len(train_y) - (j+1)*7][0])
        # x = np.array(x).reshape((1, sameday_backNum))

        # print x
        # x = x.reshape(1, sameday_backNum, 1)
        predict = model.predict(x)
        predict = y_scaler.inverse_transform(predict)[0][0]
        if predict <= 0:
            predict = 1
        preficts.append(predict)
        part_data.set_value(part_data.shape[0]-1, "count", predict)
        # preficts.append(predict)
        # part_counts[index] = np.append(part_count, predict).reshape((part_count.shape[0] + 1, 1))
    preficts = (removeNegetive(toInt(np.array(preficts)))).astype(int)
    # preficts = np.array(preficts)
    if trainAsTest:
        last_14_real_y = (removeNegetive(toInt(np.array(last_14_real_y)))).astype(int)
        # print preficts,last_14_real_y
        print str(shopid)+',score:', scoreoneshop(preficts, last_14_real_y)
    return [preficts, last_14_real_y, model]
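
# Because predictOneShop_ANN2 also returns its trained model, it can be scored
# once on the validation split and then reused for the real forecast.
# A hypothetical run for shop 5, assuming all_data is already loaded:
predicts, reals, model = predictOneShop_ANN2(5, all_data, trainAsTest=True)
final_predicts, _, _ = predictOneShop_ANN2(5, all_data, trainAsTest=False,
                                           best_model=model)
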
Example #12
def predictAllShop_ANN_part_together(all_data,
                                     trainAsTest=False,
                                     saveFilePath=None,
                                     featurePath=None,
                                     cate_level=0,
                                     cate_name=None,
                                     featureSavePath=None,
                                     needSaveFeature=False,
                                     time=1):
    """
    Train on the data of all shops together, then predict every shop
    :param trainAsTest: whether to use the last 14 days of the training set as the test set
    :param saveFilePath:
    :param featurePath:
    :param cate_level:
    :param cate_name:
    :param featureSavePath:
    :param needSaveFeature:
    :param time: index of this run
    :return:
    """

    ignores = 0

    shopids = None
    shop_need_to_predict = 2000
    if cate_level == 0:
        shopids = range(1, 1 + shop_need_to_predict, 1)
    else:
        shopids = Parameter.extractShopValueByCate(cate_level, cate_name)

    shop_info = pd.read_csv(Parameter.shopinfopath,
                            names=[
                                "shopid", "cityname", "locationid", "perpay",
                                "score", "comment", "level", "cate1", "cate2",
                                "cate3"
                            ])
    weekOrWeekend = True
    day_back_num = 21
    sameday_backNum = 7
    week_backnum = 3
    other_features = [statistic_functon_mean, statistic_functon_median]
    other_features = []
    '''one-hot encode cate1'''
    cate = shop_info['cate1'].tolist()
    cate_dup = set(cate)
    cates = []
    for i in range(len(cate_dup)):
        cates.append([i])
    hot_encoder = OneHotEncoder().fit(cates)
    dicts = dict(zip(cate_dup, range(len(cate_dup))))
    cate_num = []
    for c in cate:
        cate_num.append([dicts[c]])
    '''cate1 one-hot done'''

    if featurePath is None:

        all_x = None
        all_y = None
        for shopid in shopids:
            if shopid in Parameter.ignore_shopids:
                print "ignore get train", shopid
                ignores += 1
                continue
            print "get ", shopid, " train"
            part_data = all_data[all_data.shopid == shopid]
            last_14_real_y = None
            # hold out the last 14 days when they serve as the test set
            if trainAsTest:  # train on the earlier part only
                last_14_real_y = part_data[len(part_data) -
                                           14:]["count"].values
                part_data = part_data[0:len(part_data) - 14]
            # print last_14_real_y
            skipNum = part_data.shape[0] - 128
            if skipNum < 0:
                skipNum = 0
            train_x = None
            if sameday_backNum != 0:
                sameday = extractBackSameday(part_data, sameday_backNum,
                                             skipNum, nan_method_sameday_mean)
                train_x = getOneWeekdayFomExtractedData(sameday)
            if day_back_num != 0:
                if train_x is not None:
                    train_x = np.concatenate(
                        (train_x,
                         getOneWeekdayFomExtractedData(
                             extractBackDay(part_data, day_back_num, skipNum,
                                            nan_method_sameday_mean))),
                        axis=1)
                else:
                    train_x = getOneWeekdayFomExtractedData(
                        extractBackDay(part_data, day_back_num, skipNum,
                                       nan_method_sameday_mean))
            if weekOrWeekend:
                ws = getOneWeekdayFomExtractedData(
                    extractWorkOrWeekend(part_data, skipNum))
                hot_encoder = onehot(ws)  # note: refit per shop; this shadows the cate1 encoder built above
                train_x = np.concatenate(
                    (train_x, hot_encoder.transform(ws).toarray()), axis=1)
            count = extractCount(part_data, skipNum)
            train_y = getOneWeekdayFomExtractedData(count)
            for feature in other_features:
                value = getOneWeekdayFomExtractedData(
                    extractBackWeekValue(part_data, week_backnum, skipNum,
                                         nan_method_sameday_mean, feature))
                train_x = np.append(train_x, value, axis=1)

            # '''add shop info features'''
            # # print train_x,train_x.shape
            # index = shopid - 1
            # oneshopinfo = shop_info.ix[index]
            # shop_perpay = oneshopinfo['perpay'] if not pd.isnull(oneshopinfo['perpay']) else 0
            # shop_score = oneshopinfo['score'] if not pd.isnull(oneshopinfo['score']) else 0
            # shop_comment = oneshopinfo['comment'] if not pd.isnull(oneshopinfo['comment']) else 0
            # shop_level = oneshopinfo['level'] if not pd.isnull(oneshopinfo['level']) else 0
            # shop_cate1 = oneshopinfo['cate1']
            # import warnings
            # with warnings.catch_warnings():
            #     warnings.simplefilter("ignore",category=DeprecationWarning)
            #     shop_cate1_encoder = hot_encoder.transform([dicts[shop_cate1]]).toarray()
            # train_x = np.insert(train_x,train_x.shape[1],shop_perpay,axis=1)
            # train_x = np.insert(train_x,train_x.shape[1],shop_score,axis=1)
            # train_x = np.insert(train_x,train_x.shape[1],shop_comment,axis=1)
            # train_x = np.insert(train_x,train_x.shape[1],shop_level,axis=1)
            # for i in range(shop_cate1_encoder.shape[1]):
            #     train_x = np.insert(train_x,train_x.shape[1],shop_cate1_encoder[0][i],axis=1)
            # '''done adding shop info'''

            if all_x is None:
                all_x = train_x
                all_y = train_y
            else:
                all_x = np.insert(all_x, all_x.shape[0], train_x, axis=0)
                all_y = np.insert(all_y, all_y.shape[0], train_y, axis=0)

                # '''add weekday feature'''
                # extract_weekday = getOneWeekdayFomExtractedData(extractWeekday(part_data, skipNum))
                # train_x = np.append(train_x, extract_weekday, axis=1)
                # ''''''

                # train_x = train_x.reshape((train_x.shape[0],
                #                            train_x.shape[1], 1))
                # print model.get_weights()
                # part_counts = []
                # for i in range(7):
                #     weekday = i + 1
                #     part_count = getOneWeekdayFomExtractedData(count, weekday)
                #     part_counts.append(part_count)

        train_x = all_x
        train_y = all_y

        if needSaveFeature:
            featureAndLabel = np.concatenate((train_x, train_y), axis=1)
            # note: the old hard-coded column list no longer matches the
            # feature layout (7 sameday + 21 day + 2 weekend-flag columns),
            # so let pandas number the columns
            flDF = pd.DataFrame(featureAndLabel)
            if featureSavePath is None:
                if trainAsTest:
                    featureSavePath = "train_feature/%df_%d_%s.csv" % (
                        flDF.shape[1] - 1, cate_level, cate_name)
                else:
                    featureSavePath = "feature/%df_%d_%s.csv" % (
                        flDF.shape[1] - 1, cate_level, cate_name)
            flDF.to_csv(featureSavePath)
    else:  # a featurePath file was provided
        flDF = pd.read_csv(featurePath, index_col=0)
        train_x = flDF.values[:, :-1]
        train_y = flDF.values[:, -1:]
        # print train_x
        # print train_y
    '''standardize features and labels'''
    x_scaler = MinMaxScaler().fit(train_x)
    y_scaler = MinMaxScaler().fit(train_y)
    train_x = x_scaler.transform(train_x)
    train_y = y_scaler.transform(train_y)
    '''standardization done'''
    '''build the neural network'''
    h1_activation = "relu"
    rnn_epoch = 60
    verbose = 0
    h_unit = 16
    batch_size = 5
    np.random.seed(128)
    model = Sequential()
    model.add(
        Dense(h_unit,
              init="normal",
              input_dim=train_x.shape[1],
              activation=h1_activation))  #sigmoid
    model.add(
        Dense(1,
              init="normal",
              activation='linear',
              activity_regularizer=activity_l2(0.01)))
    # leftover optimizer experiments; only adam is used in compile() below
    sgd = SGD(0.005)
    # rmsprop = RMSprop(0.01)
    # adagrad = Adagrad(0.05)
    adadelta = Adadelta(0.01)
    adam = Adam(0.0001)
    adamax = Adamax(0.01)
    nadam = Nadam(0.01)
    model.compile(loss="mse", optimizer=adam)
    '''network built'''

    model.fit(train_x,
              train_y,
              nb_epoch=rnn_epoch,
              batch_size=batch_size,
              verbose=verbose)

    format = "%Y-%m-%d"
    if trainAsTest:
        startTime = datetime.datetime.strptime("2016-10-18", format)
    else:
        startTime = datetime.datetime.strptime("2016-11-1", format)
    timedelta = datetime.timedelta(1)
    '''predict every shop'''
    preficts_all = None
    real_all = None
    for j in shopids:
        if j in Parameter.ignore_shopids:
            print "ignore predict", j
            continue
        print "predict:", j
        preficts = []
        part_data = all_data[all_data.shopid == j]
        last_14_real_y = None

        if trainAsTest:  # the last 14 days serve as the test set; train on the earlier part
            last_14_real_y = part_data[len(part_data) - 14:]["count"].values
            part_data = part_data[0:len(part_data) - 14]
        '''predict 14 days'''
        for i in range(14):
            currentTime = startTime + timedelta * i
            strftime = currentTime.strftime(format)
            # index = getWeekday(strftime) - 1
            # part_count = part_counts[index]
            # use the same weekday's value from each of the previous {sameday_backNum} weeks as features
            part_data = part_data.append(
                {
                    "count": 0,
                    "shopid": j,
                    "time": strftime,
                    "weekday": getWeekday(strftime)
                },
                ignore_index=True)
            x = None
            if sameday_backNum != 0:
                x = getOneWeekdayFomExtractedData(
                    extractBackSameday(part_data, sameday_backNum,
                                       part_data.shape[0] - 1,
                                       nan_method_sameday_mean))
            if day_back_num != 0:
                if x is None:
                    x = getOneWeekdayFomExtractedData(
                        extractBackDay(part_data, day_back_num,
                                       part_data.shape[0] - 1,
                                       nan_method_sameday_mean))
                else:
                    x = np.concatenate(
                        (x,
                         getOneWeekdayFomExtractedData(
                             extractBackDay(part_data, day_back_num,
                                            part_data.shape[0] - 1,
                                            nan_method_sameday_mean))),
                        axis=1)
            if weekOrWeekend:
                x = np.concatenate(
                    (x,
                     hot_encoder.transform(
                         getOneWeekdayFomExtractedData(
                             extractWorkOrWeekend(
                                 part_data,
                                 part_data.shape[0] - 1))).toarray()),
                    axis=1)
            for feature in other_features:
                x_value = getOneWeekdayFomExtractedData(
                    extractBackWeekValue(part_data, week_backnum,
                                         part_data.shape[0] - 1,
                                         nan_method_sameday_mean, feature))
                x = np.append(x, x_value, axis=1)
            # '''add weekday feature'''
            # x = np.append(x, getOneWeekdayFomExtractedData(extractWeekday(part_data, part_data.shape[0]-1)), axis=1)
            # ''''''

            # '''add shop info features'''
            # index = j - 1
            # oneshopinfo = shop_info.ix[index]
            # shop_perpay = oneshopinfo['perpay'] if not pd.isnull(oneshopinfo['perpay']) else 0
            # shop_score = oneshopinfo['score'] if not pd.isnull(oneshopinfo['score']) else 0
            # shop_comment = oneshopinfo['comment'] if not pd.isnull(oneshopinfo['comment']) else 0
            # shop_level = oneshopinfo['level'] if not pd.isnull(oneshopinfo['level']) else 0
            # shop_cate1 = oneshopinfo['cate1']
            # import warnings
            # with warnings.catch_warnings():
            #     warnings.simplefilter("ignore",category=DeprecationWarning)
            #     shop_cate1_encoder = hot_encoder.transform([dicts[shop_cate1]]).toarray()
            # x = np.insert(x,x.shape[1],shop_perpay,axis=1)
            # x = np.insert(x,x.shape[1],shop_score,axis=1)
            # x = np.insert(x,x.shape[1],shop_comment,axis=1)
            # x = np.insert(x,x.shape[1],shop_level,axis=1)
            # for i in range(shop_cate1_encoder.shape[1]):
            #     x = np.insert(x,x.shape[1],shop_cate1_encoder[0][i],axis=1)
            # '''done adding shop info'''

            x = x_scaler.transform(x)
            # for j in range(sameday_backNum):
            #     x.append(train_y[len(train_y) - (j+1)*7][0])
            # x = np.array(x).reshape((1, sameday_backNum))

            # print x
            # x = x.reshape(1, sameday_backNum, 1)
            predict = model.predict(x)
            if predict.ndim == 2:
                predict = y_scaler.inverse_transform(predict)[0][0]
            elif predict.ndim == 1:
                predict = y_scaler.inverse_transform(predict)[0]

            if predict <= 0:
                predict = 1
            preficts.append(predict)
            part_data.set_value(part_data.shape[0] - 1, "count", predict)

        preficts = (removeNegetive(toInt(np.array(preficts)))).astype(int)
        if preficts_all is None:
            preficts_all = preficts
        else:
            preficts_all = np.insert(preficts_all,
                                     preficts_all.shape[0],
                                     preficts,
                                     axis=0)

        if trainAsTest:
            last_14_real_y = (removeNegetive(toInt(
                np.array(last_14_real_y)))).astype(int)
            if real_all is None:
                real_all = last_14_real_y
            else:
                real_all = np.insert(real_all,
                                     real_all.shape[0],
                                     last_14_real_y,
                                     axis=0)
                # print preficts,last_14_real_y
            print str(j) + ',score:', scoreoneshop(preficts, last_14_real_y)

    # preficts = np.array(preficts)
    preficts_all = preficts_all.reshape((len(shopids) - ignores, 14))
    if trainAsTest:
        real_all = real_all.reshape((len(shopids) - ignores, 14))
        preficts_all = np.concatenate((preficts_all, real_all), axis=1)
    shopids = list(shopids)  # works for both the range and the ndarray branch
    for remove in Parameter.ignore_shopids:
        try:
            shopids.remove(remove)
        except ValueError:  # shop not in this category's id list
            pass
    preficts_all = np.insert(preficts_all, 0, shopids, axis=1)
    if saveFilePath is not None:
        path = saveFilePath + "_%ds_%dd_%df_%d_%s_%d_%d_%d_%s_%dtime.csv" \
                              % (sameday_backNum, day_back_num, train_x.shape[1],cate_level,cate_name
                                 ,rnn_epoch,batch_size,h_unit,h1_activation,time)
        print "save in :", path
        np.savetxt(path, preficts_all, fmt="%d", delimiter=",")
    return preficts_all
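
# Every model in this file fits a pair of MinMaxScalers on the training data,
# feeds the network scaled values, and inverse-transforms each prediction.
# A generic round-trip sketch (demo arrays, not project data):
from sklearn.preprocessing import MinMaxScaler

demo_x = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
demo_y = np.array([[5.0], [7.0], [9.0]])
x_scaler = MinMaxScaler().fit(demo_x)
y_scaler = MinMaxScaler().fit(demo_y)
scaled_x = x_scaler.transform(demo_x)   # features mapped into [0, 1]
scaled_y = y_scaler.transform(demo_y)   # labels mapped into [0, 1]
# a model trained on scaled data predicts in the scaled domain, so its output
# must be mapped back before scoring:
assert np.allclose(y_scaler.inverse_transform(scaled_y), demo_y)
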
Example #13
def predictOneShop_LSTM(shopid, all_data, trainAsTest=False):
    """
    Predict one shop with an LSTM
    :param shopid: id of the shop to predict
    :param trainAsTest: whether to use the last 14 days of the training set as the test set
    :return:
    """
    part_data = all_data[all_data.shopid == shopid]
    last_14_real_y = None
    # hold out the last 14 days when they serve as the test set
    if trainAsTest:  # train on the earlier part only
        last_14_real_y = part_data[len(part_data) - 14:]["count"].values
        part_data = part_data[0:len(part_data) - 14]
    # print last_14_real_y
    verbose = 2
    rnn_nb_epoch = 5
    skipNum = 0
    backNum = 14
    learnrate = 0.01
    sameday = extractBackDay(part_data, backNum, skipNum, nan_method_sameday_mean)
    count = extractCount(part_data, skipNum)
    train_x = getOneWeekdayFomExtractedData(sameday)
    train_y = getOneWeekdayFomExtractedData(count)
    '''standardize features and labels'''
    x_scaler = MinMaxScaler().fit(train_x)
    y_scaler = MinMaxScaler().fit(train_y)
    train_x = x_scaler.transform(train_x)
    train_y = y_scaler.transform(train_y)
    '''standardization done'''
    train_x = train_x.reshape((train_x.shape[0],
                               train_x.shape[1], 1))
    model = Sequential()
    # print getrefcount(model)
    model.add(LSTM(32, input_shape=(train_x.shape[1], train_x.shape[2]), activation="tanh")) #sigmoid
    # print getrefcount(model)
    model.add(Dense(1, activation='linear'))
    #, W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01)
    # print getrefcount(model)
    # configure the optimizer (apart from the learning rate, keep the other arguments at their defaults)
    rms = RMSprop(lr=learnrate)
    # sgd=SGD(lr=0.1, momentum=0.9, nesterov=True)
    model.compile(loss="mse", optimizer=rms)
    print model.summary()
    # print getrefcount(model)
    # print model.summary()
    model.fit(train_x, train_y, nb_epoch=rnn_nb_epoch, batch_size=1, verbose=verbose)
    # print model.get_weights()
    # part_counts = []
    # for i in range(7):
    #     weekday = i + 1
    #     part_count = getOneWeekdayFomExtractedData(count, weekday)
    #     part_counts.append(part_count)

    # print getrefcount(model)
    format = "%Y-%m-%d"
    if trainAsTest:
        startTime = datetime.datetime.strptime("2016-10-18", format)
    else:
        startTime = datetime.datetime.strptime("2016-11-1", format)
    timedelta = datetime.timedelta(1)
    preficts = []
    for i in range(14):
        currentTime = startTime + timedelta * i
        strftime = currentTime.strftime(format)
        # index = getWeekday(strftime) - 1
        # part_count = part_counts[index]
        # use the last {backNum} values (observed or already predicted) as features
        x = []
        for j in range(backNum):
            x.append(train_y[len(train_y) - (j+1)][0])
        x = np.array(x)
        # print x
        x = x.reshape(1, backNum, 1)
        predict = model.predict(x)
        preficts.append(y_scaler.inverse_transform(predict)[0][0])
        train_y = np.append(train_y,predict).reshape((train_y.shape[0] + 1,1))
        # preficts.append(predict)
        # part_counts[index] = np.append(part_count, predict).reshape((part_count.shape[0] + 1, 1))
    preficts = (removeNegetive(toInt(np.array(preficts)))).astype(int)
    # preficts = np.array(preficts)
    if trainAsTest:
        last_14_real_y = (removeNegetive(toInt(np.array(last_14_real_y)))).astype(int)
        # print preficts,last_14_real_y
        print str(shopid)+',score:', scoreoneshop(preficts, last_14_real_y)
    return [preficts, last_14_real_y]
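
# Keras recurrent layers expect 3-D input of shape (samples, timesteps,
# features); the reshape calls above turn each row of lagged values into
# backNum timesteps of a single feature. A minimal self-contained sketch:
demo_windows = np.random.rand(100, 14)   # 100 samples, 14 lagged daily counts
lstm_input = demo_windows.reshape((demo_windows.shape[0], demo_windows.shape[1], 1))
print lstm_input.shape                   # (100, 14, 1)
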
Example #14
def predict_all_getbest(all_data, save_filename, trainAsTest=False, region=None, predict_function = None, epoch_n = 1):
    """
    Predict the last 14 days for every shop
    :param all_data:
    :param save_filename: if trainAsTest is True, stores predict+real values (first 14 columns predictions, last 14 reals); otherwise only the 14 prediction columns
    :param trainAsTest: whether to treat the last 14 days of the training set as the test set
    :param region: shopid range as a list [startid, endid]
    :param predict_function: function that predicts a single shop
    :param epoch_n: how many training runs per shop, keeping the best model for the final prediction
    :return:
    """
    if predict_function is None:
        raise Exception("predict_function is None")

    if region is None:
        startid = 1
        endid = 2000
    else:
        startid = region[0]
        endid = region[1]
    size = endid - startid + 1

    if not trainAsTest:
        result = np.zeros((size, 14))
    else:
        result = np.zeros((size, 28))
    # always allocate the train-split results, which are filled for every shop
    train_result = np.zeros((size, 28))
    real = np.ndarray(0)
    for i in range(startid, endid + 1, 1):
        shopid = i
        print "shopid:", shopid
        best_score = 1
        best_model = None
        for j in range(epoch_n):
            train_predict, train_real_14, model = predict_function(shopid, all_data, True, None)
            scoreoneshop1 = scoreoneshop(train_predict, train_real_14)
            if scoreoneshop1 < best_score:
                best_score = scoreoneshop1
                best_model = model
        print "best:", best_score
        predict, real_14, model = predict_function(shopid, all_data, trainAsTest, best_model)
        # predict = real_14 = np.arange(14)
        if trainAsTest:
            real = np.append(real, real_14)
            predict = np.append(predict,real_14)
        train_predict_real = np.append(train_predict, train_real_14)
        result[i-startid] = predict
        train_result[i-startid] = train_predict_real
        gc.collect()


    result = pd.DataFrame(result.astype(np.int))
    train_result = pd.DataFrame(train_result.astype(int))
    result.insert(0, "id", value=range(startid, endid + 1, 1))
    train_result.insert(0, "id", value=range(startid, endid + 1, 1))
    # print result
    result = result.values
    train_result = train_result.values
    if(save_filename is not None):
        np.savetxt(save_filename, result, delimiter=",", fmt='%d')
        np.savetxt(save_filename+".train", train_result, delimiter=",", fmt='%d')
    else:
        print result
    return result
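
# predict_all_getbest pairs with Example #11: predictOneShop_ANN2 takes
# (shopid, all_data, trainAsTest, best_model) and returns
# [predicts, reals, model], exactly what the best-of-N loop expects.
# A hypothetical run over shops 1-100, training each shop 5 times:
predict_all_getbest(all_data,
                    "result/ann_best.csv",
                    trainAsTest=False,
                    region=[1, 100],
                    predict_function=predictOneShop_ANN2,
                    epoch_n=5)
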
Example #15
def predictOneShop_model(shopid, all_data, trainAsTest=False, model=None):
    """
    Predict one shop with a supplied model (e.g. RidgeCV)
    :param shopid: id of the shop to predict
    :param trainAsTest: whether to use the last 14 days of the training set as the test set
    :param model: the model to fit and predict with
    :return:
    """

    if model is None:
        raise Exception("model is None")

    part_data = all_data[all_data.shopid == shopid]
    last_14_real_y = None
    # Hold out part of the data as the training set
    if trainAsTest:  # use the last 14 days of the training set as the test set; train on the earlier part
        last_14_real_y = part_data[len(part_data) - 14:]["count"].values
        part_data = part_data[0:len(part_data) - 14]
    skipNum = 28
    sameday_backNum = 3
    week_backnum = 3
    sameday = extractBackSameday(part_data, sameday_backNum, skipNum,
                                 nan_method_sameday_mean)
    count = extractCount(part_data, skipNum)
    train_x = getOneWeekdayFomExtractedData(sameday)
    train_y = getOneWeekdayFomExtractedData(count)
    other_features = [statistic_functon_mean, statistic_functon_median]
    for feature in other_features:
        value = getOneWeekdayFomExtractedData(
            extractBackWeekValue(part_data, week_backnum, skipNum,
                                 nan_method_sameday_mean, feature))
        train_x = np.append(train_x, value, axis=1)

    # '''Add the day of week as a feature'''
    # extract_weekday = getOneWeekdayFomExtractedData(extractWeekday(part_data, skipNum))
    # train_x = np.append(train_x, extract_weekday, axis=1)
    # ''''''
    '''Scale features and target to [0, 1]'''
    x_scaler = MinMaxScaler().fit(train_x)
    y_scaler = MinMaxScaler().fit(train_y)
    train_x = x_scaler.transform(train_x)
    train_y = y_scaler.transform(train_y)
    '''End of scaling'''
    model.fit(train_x, train_y)

    format = "%Y-%m-%d"
    if trainAsTest:
        startTime = datetime.datetime.strptime("2016-10-18", format)
    else:
        startTime = datetime.datetime.strptime("2016-11-1", format)
    timedelta = datetime.timedelta(1)
    preficts = []
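    # Rolling 14-day forecast: append a placeholder row for the next day,
    # rebuild its features from the (now extended) history, predict, and
    # write the prediction back so later days can use it.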
    for i in range(14):
        currentTime = startTime + timedelta * i
        strftime = currentTime.strftime(format)
        # Use the same weekday's value from each of the previous {sameday_backNum} weeks as features
        part_data = part_data.append(
            {
                "count": 0,
                "shopid": shopid,
                "time": strftime,
                "weekday": getWeekday(strftime)
            },
            ignore_index=True)
        x = getOneWeekdayFomExtractedData(
            extractBackSameday(part_data, sameday_backNum,
                               part_data.shape[0] - 1,
                               nan_method_sameday_mean))
        for feature in other_features:
            x_value = getOneWeekdayFomExtractedData(
                extractBackWeekValue(part_data, week_backnum,
                                     part_data.shape[0] - 1,
                                     nan_method_sameday_mean, feature))
            x = np.append(x, x_value, axis=1)
        # '''Add the day of week as a feature'''
        # x = np.append(x, getOneWeekdayFomExtractedData(extractWeekday(part_data, part_data.shape[0]-1)), axis=1)
        # ''''''

        x = x_scaler.transform(x)
        predict = model.predict(x)
        # The model may return a 1-D or 2-D array; undo the scaling either way
        if predict.ndim == 2:
            predict = y_scaler.inverse_transform(predict)[0][0]
        elif predict.ndim == 1:
            predict = y_scaler.inverse_transform(predict)[0]

        if predict <= 0:
            predict = 1  # clamp non-positive predictions to 1
        preficts.append(predict)
        # Write the prediction back so subsequent days see it in the history
        part_data.set_value(part_data.shape[0] - 1, "count", predict)
    preficts = (removeNegetive(toInt(np.array(preficts)))).astype(int)
    if trainAsTest:
        last_14_real_y = (removeNegetive(toInt(
            np.array(last_14_real_y)))).astype(int)
        print str(shopid) + ',score:', scoreoneshop(preficts, last_14_real_y)
    return [preficts, last_14_real_y]
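A hypothetical call, assuming all_data is already loaded; RidgeCV matches the docstring's original intent, but any regressor with fit/predict in the sklearn style should work, and the shop id 5 is arbitrary:

from sklearn.linear_model import RidgeCV

predicts, reals = predictOneShop_model(5, all_data,
                                       trainAsTest=True,
                                       model=RidgeCV(alphas=[0.1, 1.0, 10.0]))
print predicts
print reals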