예제 #1
0
def fout(dict, embedDim, interval, distance):
    """Predict each song of each artist, plot the curves and export CSV rows.

    Args:
        dict: mapping of artist id -> artist object (the name shadows the
            builtin but is kept because it is part of the signature).
        embedDim: embedding dimension forwarded to the model builders.
        interval: sampling interval forwarded to the model builders.
        distance: length of the per-artist accumulators; also forwarded to
            ``itergenModel``.

    For every song two predictions are produced (GBRT and extremely
    randomized forest).  Songs whose ``pp.XTrainLength`` is below 10 go
    through ``itergenModel`` with the iterated models; all other songs go
    through ``genModel`` with the direct models.  One figure is saved per
    song and one per artist, and the per-artist sums are handed to
    ``writecsv`` for ``gbrt.csv`` and ``erf.csv``.
    """
    print('Run task %s...' % (os.getpid(),))
    plt.figure(figsize=(6, 4))
    for artistCount, (artistId, artist) in enumerate(dict.items(), 1):
        print(artistCount)

        # One output directory per artist.
        artistDir = os.path.join(utils.resultPath, artistId)
        if not os.path.exists(artistDir):
            os.makedirs(artistDir)

        gbrtTotal = np.zeros(distance)
        erfTotal = np.zeros(distance)
        for songId, song in artist.getSongsOwned().items():
            traceLength = np.array(song.getTrace()).shape[1]
            # The training-set length is evaluated for a fixed horizon of 7.
            trainLength = pp.XTrainLength(traceLength, embedDim, interval, 7)
            lengthInfo = str(traceLength) + ' ' + str(trainLength)

            if trainLength < 10:
                # Too little training data: use the iterated models.
                print('iterated ' + lengthInfo)
                gbrtSong = itergenModel(artist, song, itergbrtModel,
                                        embedDim, interval, distance)
                erfSong = itergenModel(artist, song, itererfModel,
                                       embedDim, interval, distance)
            else:
                print('mix2 ' + lengthInfo)
                gbrtSong = genModel(artist, song, gbrtModel, embedDim,
                                    interval)
                erfSong = genModel(artist, song, erfModel, embedDim,
                                   interval)

            plotResult(gbrtSong, erfSong)
            songFigure = os.path.join(artistDir, 'song ' + songId + ".png")
            plt.savefig(songFigure)
            plt.clf()

            # In-place accumulation of the per-artist totals.
            gbrtTotal += gbrtSong
            erfTotal += erfSong

        plotResult(gbrtTotal, erfTotal)
        artistFigure = os.path.join(artistDir, 'artist ' + artistId + ".png")
        plt.savefig(artistFigure)
        plt.clf()

        # Export the artist totals, GBRT first and then ERF.
        for csvName, totals in (('gbrt.csv', gbrtTotal),
                                ('erf.csv', erfTotal)):
            writecsv(os.path.join(finalResultPath, csvName), artistId, totals)
예제 #2
0
def train(artist, embedDim, interval):
    """Cluster an artist's songs, fit per-cluster GBRT models and sum forecasts.

    Steps, as implemented below:
      1. Songs whose ``pp.XTrainLength`` is below 10 are set aside in
         ``shortSongsList``; the remaining play traces are left-padded with
         zeros up to ``utils.days`` and clustered with affinity propagation.
      2. For every cluster, a randomized hyper-parameter search selects one
         GradientBoostingRegressor per column of ``yTrainCluster``.
      3. Nine rounds follow in which the models are refitted, every song of
         the cluster is predicted, the predictions are appended to the
         song's trace and ``embedDim`` grows by ``distance``.
      4. ``clusterSum`` of every cluster and ``shortSongsPredict`` of the
         short songs are added into one vector.

    Args:
        artist: object exposing ``getSongsOwned()`` (a dict of songs).
        embedDim: initial embedding dimension; restored for every cluster.
        interval: sampling interval passed to the fold helpers.

    Returns:
        numpy array of length 60 holding the accumulated prediction.
    """
    embedDimInit = embedDim
    intervalInit = interval
    distance = 7
    playArray = []
    songsList = []
    shortSongsList = []
    for song in artist.getSongsOwned().values():
        playTrace = song.getTrace()[0]
        traceLength = len(playTrace)
        trainLength = pp.XTrainLength(traceLength, embedDim, interval,
                                      distance)  # training-set length
        if trainLength < 10:  # short songs do not take part in cluster training
            shortSongsList.append(song)
            continue
        if traceLength < utils.days:
            # Left-pad with zeros so every clustered trace has utils.days entries.
            playTrace = np.hstack(
                (np.zeros(utils.days - traceLength), playTrace))
        playArray.append(playTrace)
        songsList.append(song)
    apc = cluster.AffinityPropagation(damping=0.5,
                                      max_iter=500,
                                      convergence_iter=20,
                                      preference=None,
                                      affinity='euclidean')
    # NOTE(review): playArray is empty when every song is short - confirm
    # that fit_predict is never reached with no samples.
    clusterIndex = apc.fit_predict(playArray)
    clusterDict = {}
    for index, song in zip(clusterIndex, songsList):  # group songs by cluster label
        if index not in clusterDict:
            clusterDict[index] = []
        songList = clusterDict.get(index)
        songList.append(song)

    # NOTE(review): 60 is hard-coded; presumably the forecast horizon that
    # clusterSum / shortSongsPredict return - confirm against those helpers.
    yPredictSum = np.zeros(60)
    for index, songList in clusterDict.items():
        print 'cluster' + str(index)
        # embedDim is increased in the rounds below, so restore it per cluster.
        embedDim = embedDimInit
        interval = intervalInit
        tracelist, meanList, varList = makeTraceList(songList)
        XTrainCluster, yTrainCluster = foldTrain(tracelist, embedDim, interval,
                                                 distance)
        kfold = cross_validation.KFold(len(XTrainCluster),
                                       n_folds=5,
                                       shuffle=False)
        # Search space for the randomized hyper-parameter search.
        params = {
            'n_estimators': randint(20, 200),
            'loss': ['ls', 'lad', 'huber'],
            'learning_rate': uniform(0.01, 0.19),
            'subsample': uniform(0.5, 0.5),
            'max_depth': randint(1, 5),
            'min_samples_split': randint(1, 3),
            'min_samples_leaf': randint(1, 3),
            'max_features': randint(1, len(XTrainCluster[0]))
        }
        # One tuned regressor per output column of yTrainCluster.
        bestModels = []
        for i in range(len(yTrainCluster[0])):
            gbrt = GradientBoostingRegressor()
            clf = grid_search.RandomizedSearchCV(gbrt,
                                                 param_distributions=params,
                                                 n_iter=30,
                                                 scoring='mean_squared_error',
                                                 cv=kfold,
                                                 n_jobs=-1)
            clf.fit(XTrainCluster, yTrainCluster[:, i])
            bestModels.append(clf.best_estimator_)

        # Nine rounds of predict-and-extend with a growing embedding dimension.
        for i in range(9):
            XTrainCluster, yTrainCluster = foldTrain(tracelist, embedDim,
                                                     interval, distance)
            XPredictCluster = foldPredict(tracelist, embedDim, interval,
                                          distance)
            for k in range(len(songList)):  # predict every song with the same cluster models
                XPredict = XPredictCluster[k]
                subyPredict = []
                for j in range(len(yTrainCluster[0])):
                    # NOTE(review): the training data is not reassigned inside
                    # the k loop, so this refit is repeated for every song.
                    bestModels[j].fit(XTrainCluster, yTrainCluster[:, j])
                    subyPredict.append(bestModels[j].predict(XPredict))
                tracelist[k] = np.hstack(
                    (tracelist[k],
                     np.array(copy(subyPredict))))  # predictions become known data for the next round
            embedDim += distance
        yPredictSum += clusterSum(tracelist, meanList, varList)
    yPredictSum += shortSongsPredict(shortSongsList, embedDimInit, interval)
    return yPredictSum
예제 #3
0
def fout(embedDim, interval, distance):
    """Predict every clustered song of every artist, plot and export CSVs.

    The artist dictionary is loaded from
    ``<utils.allResultPath>/artistsObjectDict.pkl``.  For each entry returned
    by ``utils.clusterSongs(artist)`` exactly one of three strategies runs:

    * trace not longer than ``embedDim``: iterated models with an embedding
      dimension of 1;
    * training-set length below 10: iterated models with ``embedDim``;
    * otherwise: the direct GBRT / extremely-randomized-forest models.

    The first two tests used to be independent ``if`` statements, so the
    result of the first strategy was always overwritten by the second
    ``if``/``else``; they are now a single ``if``/``elif``/``else`` chain.

    Args:
        embedDim: embedding dimension forwarded to ``genModel``.
        interval: sampling interval forwarded to ``genModel``.
        distance: length of the per-artist accumulator vectors.

    Side effects: saves one PNG per song and per artist below
    ``utils.resultPath/<artistId>`` and passes the per-artist sums to
    ``writecsv`` for ``gbrt.csv`` and ``erf.csv`` under ``finalResultPath``.
    """
    artistObjectFile = os.path.join(utils.allResultPath,
                                    'artistsObjectDict.pkl')
    # NOTE: pickle data must come from a trusted source only.
    # The context manager closes the handle, which the bare open() leaked.
    with open(artistObjectFile, 'r') as pickleFile:
        artistsObjectDict = cPickle.load(pickleFile)
    plt.figure(figsize=(6, 4))
    artistNum = 0
    for artistId, artist in artistsObjectDict.items():
        artistNum += 1
        print(artistNum)
        savePath = os.path.join(utils.resultPath, artistId)
        if not os.path.exists(savePath):
            os.makedirs(savePath)
        yPredictSum3 = np.zeros(distance)  # GBRT total for this artist
        yPredictSum4 = np.zeros(distance)  # ERF total for this artist
        for song in utils.clusterSongs(artist):
            traceLength = np.array(song.getTrace()).shape[1]
            trainLength = pp.XTrainLength(traceLength, embedDim, interval,
                                          7)  # training-set length
            lengthInfo = str(traceLength) + ' ' + str(trainLength)
            if traceLength <= embedDim:
                # Trace too short for the requested embedding: fall back to
                # an embedding dimension of 1.
                print('iterated ' + lengthInfo)
                yPredict3 = genModel(artist, song, itergbrtModel, 1, interval)
                yPredict4 = genModel(artist, song, itererfModel, 1, interval)
            elif trainLength < 10:
                # Fewer than 10 training samples: iterated models.
                print('iterated ' + lengthInfo)
                yPredict3 = genModel(artist, song, itergbrtModel, embedDim,
                                     interval)
                yPredict4 = genModel(artist, song, itererfModel, embedDim,
                                     interval)
            else:
                print('clustermix3 ' + lengthInfo)
                yPredict3 = genModel(artist, song, gbrtModel, embedDim,
                                     interval)
                yPredict4 = genModel(artist, song, erfModel, embedDim,
                                     interval)
            plotResult(yPredict3, yPredict4)
            # Only the first 20 items of the id are used for the file name.
            plt.savefig(
                os.path.join(
                    savePath,
                    'song ' + ''.join(list(song.getId())[:20]) + ".png"))
            plt.clf()
            yPredictSum3 += yPredict3
            yPredictSum4 += yPredict4
        plotResult(yPredictSum3, yPredictSum4)
        plt.savefig(os.path.join(savePath, 'artist ' + artistId + ".png"))
        plt.clf()
        finalResultFile3 = os.path.join(finalResultPath, 'gbrt.csv')
        writecsv(finalResultFile3, artistId, yPredictSum3)
        finalResultFile4 = os.path.join(finalResultPath, 'erf.csv')
        writecsv(finalResultFile4, artistId, yPredictSum4)
예제 #4
0
def test(embedDim, interval, distance):  # distance: forecast length, also the test-set length
    """Back-test the GBRT, ERF and xgboost models and log weighted scores.

    The artist dictionary is loaded from
    ``<utils.allResultPath>/artistsObjectDict.pkl``.  For every song:

    * a training-set length of 8 or less: the song is skipped;
    * a training-set length below ``8 + distance``: ``itergenModel`` with
      the iterated models;
    * otherwise: ``genModel`` with the direct models.

    Each model call returns ``(yPredict, yTest)``.  RMSE and
    ``utils.normalizedVariation`` are shown in the title of the per-song
    figure and, for the summed curves, of the per-artist figure.  Per
    artist a score ``weight * (1 - nvar)`` is collected, where ``weight`` is
    the square root of the sum of the last ``distance`` entries in row 0 of
    ``artist.getTotalTrace()``.  The scores are appended to
    ``<utils.allResultPath>/F1Score``.

    Args:
        embedDim: embedding dimension forwarded to the model builders.
        interval: sampling interval forwarded to the model builders.
        distance: number of predicted / held-out points.
    """

    def score(yTrue, yPred):
        """Return (RMSE, normalized variation) for one prediction."""
        rmse = np.sqrt(mean_squared_error(yTrue, yPred))
        nvar = utils.normalizedVariation(yTrue, yPred)
        return rmse, nvar

    def metricTitle(gbrt, erf, xgb):
        """Build the three-line figure title from (rmse, nvar) pairs."""
        return ('GBRT=RMSE:' + str(gbrt[0]) + '-nvar:' + str(gbrt[1]) + '\n' +
                'erf=RMSE:' + str(erf[0]) + '-nvar:' + str(erf[1]) + '\n' +
                'xgb=RMSE:' + str(xgb[0]) + '-nvar:' + str(xgb[1]))

    artistObjectFile = os.path.join(utils.allResultPath,
                                    'artistsObjectDict.pkl')
    # NOTE: pickle data must come from a trusted source only.
    # The context manager closes the handle, which the bare open() leaked.
    with open(artistObjectFile, 'r') as pickleFile:
        artistsObjectDict = cPickle.load(pickleFile)
    artistF1Score3 = []  # GBRT
    artistF1Score4 = []  # ERF
    artistF1Score5 = []  # xgboost
    plt.figure(figsize=(6, 8))
    artistNum = 0
    for artistId, artist in artistsObjectDict.items():
        artistNum += 1
        print(artistNum)
        savePath = os.path.join(utils.resultPath, artistId)
        if not os.path.exists(savePath):
            os.makedirs(savePath)
        yTestSum = np.zeros(distance)
        yPredictSum3 = np.zeros(distance)
        yPredictSum4 = np.zeros(distance)
        yPredictSum5 = np.zeros(distance)
        for songId, song in artist.getSongsOwned().items():
            traceLength = np.array(song.getTrace()).shape[1]
            trainLength = pp.XTrainLength(traceLength, embedDim, interval,
                                          distance)
            if trainLength <= 8:  # not enough history to back-test this song
                continue
            lengthInfo = str(traceLength) + ' ' + str(trainLength)
            if trainLength < 8 + distance:
                # Short training set: use the iterated model variants.
                print('iterated ' + lengthInfo)
                runModel = itergenModel
                gbrt, erf, xgb = itergbrtModel, itererfModel, iterxgbModel
            else:
                print('direct ' + lengthInfo)
                runModel = genModel
                gbrt, erf, xgb = gbrtModel, erfModel, xgbModel

            # Each score uses the yTest returned by its own model call.
            yPredict3, yTest = runModel(artist, song, gbrt, embedDim,
                                        interval, distance)
            songScore3 = score(yTest, yPredict3)
            yPredict4, yTest = runModel(artist, song, erf, embedDim,
                                        interval, distance)
            songScore4 = score(yTest, yPredict4)
            yPredict5, yTest = runModel(artist, song, xgb, embedDim,
                                        interval, distance)
            songScore5 = score(yTest, yPredict5)

            plotResult(yPredict3, yPredict4, yPredict5, yTest)
            plt.title(metricTitle(songScore3, songScore4, songScore5))
            plt.savefig(os.path.join(savePath, 'song ' + songId + ".png"))
            plt.clf()
            yTestSum += yTest
            yPredictSum3 += yPredict3
            yPredictSum4 += yPredict4
            yPredictSum5 += yPredict5

        # Artist-level metrics on the summed curves.
        rmseSum3, nvarSum3 = score(yTestSum, yPredictSum3)
        rmseSum4, nvarSum4 = score(yTestSum, yPredictSum4)
        rmseSum5, nvarSum5 = score(yTestSum, yPredictSum5)
        plotResult(yPredictSum3, yPredictSum4, yPredictSum5, yTestSum)
        plt.title(metricTitle((rmseSum3, nvarSum3),
                              (rmseSum4, nvarSum4),
                              (rmseSum5, nvarSum5)))
        plt.savefig(os.path.join(savePath, 'artist ' + artistId + ".png"))
        plt.clf()
        artistWeight = np.sqrt(np.sum(artist.getTotalTrace()[0, -distance:]))
        artistF1Score3.append(artistWeight * (1 - nvarSum3))
        artistF1Score4.append(artistWeight * (1 - nvarSum4))
        artistF1Score5.append(artistWeight * (1 - nvarSum5))

    f1ScoreFile = os.path.join(utils.allResultPath, 'F1Score')
    # Local renamed from `file` so the builtin is no longer shadowed.
    with open(f1ScoreFile, 'a') as scoreFile:
        scoreFile.write(time.asctime())
        scoreFile.write('embedDim=' + str(embedDim) + ', interval=' +
                        str(interval) + ', distance=' + str(distance))
        scoreFile.write('\n')
        scoreFile.write('-GBRT:' + str(np.sum(artistF1Score3)) + '\n' +
                        str(artistF1Score3) + '\n')
        scoreFile.write('-erf:' + str(np.sum(artistF1Score4)) + '\n' +
                        str(artistF1Score4) + '\n')
        scoreFile.write('-xgb:' + str(np.sum(artistF1Score5)) + '\n' +
                        str(artistF1Score5) + '\n')
        scoreFile.write('\n')
예제 #5
0
def fout(embedDim, interval, distance):
    """Predict every song with GBRT, ERF and xgboost, plot and export CSVs.

    The artist dictionary is loaded from
    ``<utils.allResultPath>/artistsObjectDict.pkl``.  Songs whose
    training-set length (``pp.XTrainLength``) is below 8 are predicted with
    ``itergenModel`` and the iterated models; all other songs use
    ``genModel`` with the direct models.  Nothing is skipped.

    Args:
        embedDim: embedding dimension forwarded to the model builders.
        interval: sampling interval forwarded to the model builders.
        distance: forecast length; also the length of the accumulators.

    Side effects: saves one PNG per song and per artist below
    ``utils.resultPath/<artistId>`` and passes the per-artist sums to
    ``writecsv`` for ``gbrt.csv``, ``erf.csv`` and ``xgb.csv`` under
    ``finalResultPath``.
    """
    artistObjectFile = os.path.join(utils.allResultPath, 'artistsObjectDict.pkl')
    # NOTE: pickle data must come from a trusted source only.
    # The context manager closes the handle, which the bare open() leaked.
    with open(artistObjectFile, 'r') as pickleFile:
        artistsObjectDict = cPickle.load(pickleFile)
    plt.figure(figsize=(6, 4))
    artistNum = 0
    for artistId, artist in artistsObjectDict.items():
        artistNum += 1
        print(artistNum)
        savePath = os.path.join(utils.resultPath, artistId)
        if not os.path.exists(savePath):
            os.makedirs(savePath)
        yPredictSum3 = np.zeros(distance)  # GBRT total
        yPredictSum4 = np.zeros(distance)  # ERF total
        yPredictSum5 = np.zeros(distance)  # xgboost total
        for songId, song in artist.getSongsOwned().items():
            traceLength = np.array(song.getTrace()).shape[1]
            trainLength = pp.XTrainLength(traceLength, embedDim, interval, distance)
            lengthInfo = str(traceLength) + ' ' + str(trainLength)
            if trainLength < 8:
                # Fewer than 8 training samples: iterated model variants.
                print('iterated ' + lengthInfo)
                runModel = itergenModel
                gbrt, erf, xgb = itergbrtModel, itererfModel, iterxgbModel
            else:
                print('direct ' + lengthInfo)
                runModel = genModel
                gbrt, erf, xgb = gbrtModel, erfModel, xgbModel

            yPredict3 = runModel(artist, song, gbrt, embedDim, interval, distance)
            yPredict4 = runModel(artist, song, erf, embedDim, interval, distance)
            yPredict5 = runModel(artist, song, xgb, embedDim, interval, distance)

            plotResult(yPredict3, yPredict4, yPredict5)
            plt.savefig(os.path.join(savePath, 'song ' + songId + ".png"))
            plt.clf()
            yPredictSum3 += yPredict3
            yPredictSum4 += yPredict4
            yPredictSum5 += yPredict5
        plotResult(yPredictSum3, yPredictSum4, yPredictSum5)
        plt.savefig(os.path.join(savePath, 'artist ' + artistId + ".png"))
        plt.clf()
        finalResultFile3 = os.path.join(finalResultPath, 'gbrt.csv')
        writecsv(finalResultFile3, artistId, yPredictSum3)
        finalResultFile4 = os.path.join(finalResultPath, 'erf.csv')
        writecsv(finalResultFile4, artistId, yPredictSum4)
        finalResultFile5 = os.path.join(finalResultPath, 'xgb.csv')
        writecsv(finalResultFile5, artistId, yPredictSum5)