예제 #1
0
파일: record.py 프로젝트: jasonfreak/almsc
def _analyze(recordIdList):
    """Compare the predictions of several records in '../report/record.pdf'.

    One page is written per artist and each page holds one line per record
    id.  Every result returned by getPredict() is indexed as a 2-D array
    whose column 0 is the artist id, column 1 a '%Y%m%d' date string and
    column 2 the plotted value, with n_days consecutive rows per artist.

    Args:
        recordIdList: iterable of record ids.  Each id is passed to
            getPredict() and used as the legend label of its line.  When it
            is empty nothing is plotted and no report is written.
    """
    # Materialise once: the ids are iterated again for every artist.
    recordIdList = list(recordIdList)
    if not recordIdList:
        # The page title is read from the record results, so without any
        # record there is neither a line nor a title to draw.
        return

    n_artists = get_n_artists()
    n_days = get_n_days(isX=False, isTrain=False)

    resultDict = dict()
    for recordId in recordIdList:
        resultDict[recordId] = getPredict(recordId)

    pdf = PdfPages('../report/record.pdf')
    try:
        for i in range(n_artists):
            fig = plt.figure()
            try:
                ax = plt.axes()
                ax.xaxis.set_major_formatter(DateFormatter('%m%d'))
                for recordId in recordIdList:
                    result = resultDict[recordId]
                    dsList = result[:,1]
                    firstDay = datetime.strptime(dsList[0], '%Y%m%d')
                    # The title below uses the id found in the last record.
                    artist_id = result[i*n_days,0]
                    # One x value per day, counted from the record's first date.
                    xData = np.arange(n_days) + date2num(firstDay)
                    yData = result[i*n_days:(i+1)*n_days,2]
                    plt.plot_date(xData, yData, fmt='-', label=recordId)
                plt.legend(loc='best', shadow=True)
                plt.xlabel('day')
                plt.ylabel('plays')
                plt.title(artist_id)
                pdf.savefig(fig)
            finally:
                # Release the figure even when drawing or saving fails.
                plt.close(fig)
    finally:
        # Always finalise the PDF so the file handle is not leaked.
        pdf.close()
예제 #2
0
def showPredict():
    """Plot real, predicted and training plays into '../report/analyze.pdf'.

    One page is written per artist with three lines labelled 'real',
    'predict' and 'train'.  The x axis holds n_days consecutive days that
    start at the first date of the list returned by predict().

    NOTE(review): the four-way unpacking of predict() needs a tuple result;
    confirm that ISOFFLINE always selects the mode that returns one.
    """
    n_artists = get_n_artists()
    n_days = get_n_days(isX=False, isTrain=False)
    artistIdList, dsList, yReal, yPredict = predict(isOffline=ISOFFLINE)
    yTrain = getPlays(isTrain=True)
    # One row per artist so that yTrain[i] lines up with yReal[i].
    yTrain = yTrain.reshape(n_artists, n_days)

    firstDay = datetime.strptime(dsList[0], '%Y%m%d')
    xData = np.arange(n_days) + date2num(firstDay)
    pdf = PdfPages('../report/analyze.pdf')
    try:
        for i in range(n_artists):
            fig = plt.figure()
            try:
                ax = plt.axes()
                ax.xaxis.set_major_formatter(DateFormatter('%m%d'))
                yRealData = yReal[i]
                yPredictData = yPredict[i]
                yTrainData = yTrain[i]
                # artistIdList is flat; the artist's first entry names the page.
                artist_id = artistIdList[i*n_days]
                plt.plot_date(xData, yRealData, fmt='-^g', label='real')
                plt.plot_date(xData, yPredictData, fmt='-vr', label='predict')
                plt.plot_date(xData, yTrainData, fmt='-ob', label='train')
                plt.legend(loc='best', shadow=True)
                plt.xlabel('day')
                plt.ylabel('plays')
                plt.title(artist_id)
                pdf.savefig(fig)
            finally:
                # Release the figure even when drawing or saving fails.
                plt.close(fig)
    finally:
        # Always finalise the PDF so the file handle is not leaked.
        pdf.close()
예제 #3
0
파일: model.py 프로젝트: jasonfreak/almsc
def predict(isOffline=True):
    """Run the dumped model on the test features, then score or export.

    Predictions are floored at zero and rounded to int64.

    Args:
        isOffline: when true, the predictions are compared with the real
            values per artist, a ranking is printed and the tuple
            (artistIdList, dsList, yReal, yPredict) is returned, both y
            arrays reshaped to (n_artists, n_days).  Otherwise the
            predictions are written to
            '../data/mars_tianchi_artist_plays_predict.csv' as
            artist id, prediction, date rows and nothing is returned.
    """
    artistIdList, dsList, X, y = getFeatures(isTrain=False)
    model = load('dump/model')
    n_artists = get_n_artists()
    n_days = get_n_days(isX=False, isTrain=False)

    rawPrediction = model.predict(X)
    negative = rawPrediction < 0
    rawPrediction[negative] = 0
    rawPrediction = np.round(rawPrediction).astype('int64')

    if not isOffline:
        # Online mode: only export the flat predictions.
        columns = [column.reshape(-1, 1)
                   for column in (artistIdList, rawPrediction, dsList)]
        np.savetxt('../data/mars_tianchi_artist_plays_predict.csv',
                   np.hstack(columns), fmt='%s', delimiter=',')
        return

    yPredict = rawPrediction.reshape(n_artists, n_days)
    yReal = y.reshape(n_artists, n_days)
    # Zeros are declared missing and imputed (Imputer's default strategy)
    # before yImpute is used as the denominator of the relative error.
    imputer = Imputer(missing_values=0)
    yImpute = imputer.fit_transform(yReal.T).T
    relativeError = (yPredict - yReal) / yImpute
    std = np.sqrt(np.mean(np.power(relativeError, 2), axis=1))
    precision = 1 - std
    weight = np.sqrt(np.sum(yReal, axis=1))
    realScore = np.round(np.dot(precision, weight)).astype('int64')
    idealScore = np.round(np.sum(weight)).astype('int64')
    percenctScore = realScore * 1.0 / idealScore

    # Artists are listed from the highest precision to the lowest.
    ranking = sorted(range(n_artists), key=lambda k: precision[k], reverse=True)
    rowFormat = '[predict] [%2d] ARTIST_ID[%32s], WEIGHT[%12.4f], PRECISION[%12.4f]'
    for artistIndex in ranking:
        print(rowFormat % (artistIndex + 1, artistIdList[artistIndex * n_days],
                           weight[artistIndex], precision[artistIndex]))
    # Space-joined str() values: the same text a multi-item print emits.
    print(' '.join(map(str, ('[CONCLUTION]', realScore, idealScore, percenctScore))))
    return artistIdList, dsList, yReal, yPredict
예제 #4
0
def _analyze(recordIdList):
    """Write '../report/record.pdf' comparing the predictions of the records.

    The report has one page per artist with one line per record id.  Each
    result returned by getPredict() is indexed as a 2-D array: column 0 is
    the artist id, column 1 a '%Y%m%d' date string, column 2 the plotted
    value, and every artist occupies n_days consecutive rows.

    Args:
        recordIdList: iterable of record ids, each passed to getPredict()
            and used as the legend label of its line.  An empty iterable
            makes the function return without writing a report.
    """
    # Materialise once: the ids are iterated again for every artist.
    recordIdList = list(recordIdList)
    if not recordIdList:
        # The page title comes from the record results, so there is
        # nothing meaningful to draw without at least one record.
        return

    n_artists = get_n_artists()
    n_days = get_n_days(isX=False, isTrain=False)

    resultDict = dict()
    for recordId in recordIdList:
        resultDict[recordId] = getPredict(recordId)

    pdf = PdfPages('../report/record.pdf')
    try:
        for i in range(n_artists):
            fig = plt.figure()
            try:
                ax = plt.axes()
                ax.xaxis.set_major_formatter(DateFormatter('%m%d'))
                for recordId in recordIdList:
                    result = resultDict[recordId]
                    dsList = result[:, 1]
                    firstDay = datetime.strptime(dsList[0], '%Y%m%d')
                    # The title below uses the id found in the last record.
                    artist_id = result[i * n_days, 0]
                    # One x value per day, counted from the first date.
                    xData = np.arange(n_days) + date2num(firstDay)
                    yData = result[i * n_days:(i + 1) * n_days, 2]
                    plt.plot_date(xData, yData, fmt='-', label=recordId)
                plt.legend(loc='best', shadow=True)
                plt.xlabel('day')
                plt.ylabel('plays')
                plt.title(artist_id)
                pdf.savefig(fig)
            finally:
                # Release the figure even when drawing or saving fails.
                plt.close(fig)
    finally:
        # Always finalise the PDF so the file handle is not leaked.
        pdf.close()
예제 #5
0
def predict(isOffline=True):
    """Predict with the model stored at 'dump/model' and score or export.

    Negative predictions are set to zero and all predictions are rounded to
    int64 before use.

    Args:
        isOffline: if true, print a per-artist precision ranking plus the
            overall score and return (artistIdList, dsList, yReal,
            yPredict) with both y arrays shaped (n_artists, n_days).  If
            false, write artist id, prediction, date rows to
            '../data/mars_tianchi_artist_plays_predict.csv' and return
            nothing.
    """
    artistIdList, dsList, X, y = getFeatures(isTrain=False)
    estimator = load('dump/model')
    n_artists = get_n_artists()
    n_days = get_n_days(isX=False, isTrain=False)

    predicted = estimator.predict(X)
    belowZero = predicted < 0
    predicted[belowZero] = 0
    predicted = np.round(predicted).astype('int64')

    if isOffline:
        shape = (n_artists, n_days)
        yPredict = predicted.reshape(*shape)
        yReal = y.reshape(*shape)

        # Zero entries are treated as missing and imputed (Imputer's
        # default strategy) so they can serve as denominators below.
        yImpute = Imputer(missing_values=0).fit_transform(yReal.T).T
        squaredError = np.power((yPredict - yReal) / yImpute, 2)
        precision = 1 - np.sqrt(np.mean(squaredError, axis=1))
        weight = np.sqrt(np.sum(yReal, axis=1))

        realScore = np.round(np.dot(precision, weight)).astype('int64')
        idealScore = np.round(np.sum(weight)).astype('int64')
        percenctScore = realScore * 1.0 / idealScore

        # Best precision first.
        order = sorted(range(n_artists),
                       key=lambda artist: precision[artist],
                       reverse=True)
        for artist in order:
            fields = (artist + 1, artistIdList[artist * n_days],
                      weight[artist], precision[artist])
            print('[predict] [%2d] ARTIST_ID[%32s], WEIGHT[%12.4f], PRECISION[%12.4f]' % fields)
        # Space-joined str() values: the same text a multi-item print emits.
        summary = ['[CONCLUTION]', str(realScore), str(idealScore),
                   str(percenctScore)]
        print(' '.join(summary))
        return artistIdList, dsList, yReal, yPredict
    else:
        idColumn = artistIdList.reshape(-1, 1)
        playsColumn = predicted.reshape(-1, 1)
        dateColumn = dsList.reshape(-1, 1)
        table = np.hstack((idColumn, playsColumn, dateColumn))
        np.savetxt('../data/mars_tianchi_artist_plays_predict.csv',
                   table,
                   fmt='%s',
                   delimiter=',')
예제 #6
0
파일: record.py 프로젝트: jasonfreak/almsc
def _checkin(recordId):
    """Store the exported predictions in the database under recordId.

    Rows of mars_tianchi_artist_plays_predict whose record_id equals
    recordId are deleted, then one row is inserted per line of
    '../data/mars_tianchi_artist_plays_predict.csv'.  Everything is
    committed at the end; the connection is closed on success and on
    failure alike.

    Args:
        recordId: value written to the first column of every inserted row
            and matched against record_id in the delete.

    Raises:
        Whatever the database driver or the CSV load raises; nothing is
        committed in that case.
    """
    db = connect()
    try:
        cursor = db.cursor()
        # NOTE(review): the statements are built by string formatting, so a
        # recordId containing a quote breaks (or alters) the SQL.  Switch to
        # driver-side parameters once the paramstyle of connect() is confirmed.
        sql = 'delete from mars_tianchi_artist_plays_predict where record_id = \'%s\'' % recordId
        cursor.execute(sql)

        # Prefix every CSV row with recordId; hstack raises when the file
        # does not hold exactly n_artists * n_days rows.
        n_rows = get_n_artists() * get_n_days(isX=False, isTrain=False)
        predictions = np.loadtxt('../data/mars_tianchi_artist_plays_predict.csv', dtype='str', delimiter=',')
        result = np.hstack((np.repeat(recordId, n_rows).reshape(-1,1), predictions))
        for line in result:
            # The last two CSV columns are swapped on insert, and the final
            # value is emitted unquoted.
            sql = 'insert into mars_tianchi_artist_plays_predict values(\'%s\', \'%s\', \'%s\', %s)' % (line[0], line[1], line[3], line[2])
            cursor.execute(sql)

        db.commit()
    finally:
        # Close even on failure so the connection is not leaked; assuming a
        # DB-API connection, closing without commit discards the changes.
        db.close()
예제 #7
0
def _checkin(recordId):
    """Replace the stored predictions of recordId with the exported CSV.

    Deletes the rows of mars_tianchi_artist_plays_predict whose record_id
    equals recordId, inserts one row per line of
    '../data/mars_tianchi_artist_plays_predict.csv' and commits.  The
    connection is closed whether or not the update succeeds.

    Args:
        recordId: value written to the first column of every inserted row
            and matched against record_id in the delete.

    Raises:
        Whatever the database driver or the CSV load raises; nothing is
        committed in that case.
    """
    db = connect()
    try:
        cursor = db.cursor()
        # NOTE(review): the statements are built by string formatting, so a
        # recordId containing a quote breaks (or alters) the SQL.  Switch to
        # driver-side parameters once the paramstyle of connect() is confirmed.
        sql = 'delete from mars_tianchi_artist_plays_predict where record_id = \'%s\'' % recordId
        cursor.execute(sql)

        # Prefix every CSV row with recordId; hstack raises when the file
        # does not hold exactly n_artists * n_days rows.
        n_rows = get_n_artists() * get_n_days(isX=False, isTrain=False)
        predictions = np.loadtxt('../data/mars_tianchi_artist_plays_predict.csv',
                                 dtype='str',
                                 delimiter=',')
        result = np.hstack(
            (np.repeat(recordId, n_rows).reshape(-1, 1), predictions))
        for line in result:
            # The last two CSV columns are swapped on insert, and the final
            # value is emitted unquoted.
            sql = 'insert into mars_tianchi_artist_plays_predict values(\'%s\', \'%s\', \'%s\', %s)' % (
                line[0], line[1], line[3], line[2])
            cursor.execute(sql)

        db.commit()
    finally:
        # Close even on failure so the connection is not leaked; assuming a
        # DB-API connection, closing without commit discards the changes.
        db.close()
예제 #8
0
    cursor = db.cursor()
    sql = 'alter table mars_tianchi_features drop column %s' % name
    try:
        cursor.execute(sql)
    except Exception, e:
        print 'ignore drop column error !!!'
    sql = 'alter table mars_tianchi_features add column (%s float)' % name
    cursor.execute(sql)

    beginXTrain = getBorder(isBegin=True, isX=True, isTrain=True)
    endXTrain = getBorder(isBegin=False, isX=True, isTrain=True)
    beginXTest = getBorder(isBegin=True, isX=True, isTrain=False)
    endXTest = getBorder(isBegin=False, isX=True, isTrain=False)
    beginYTrain = getBorder(isBegin=True, isX=False, isTrain=True)
    endYTrain = getBorder(isBegin=False, isX=False, isTrain=True)
    n_X_days = get_n_days(isX=True, isTrain=True)
    n_y_days = get_n_days(isX=False, isTrain=True)
    n_artists = get_n_artists()
    n_series = get_n_series()
    artistIdList, playsTrainList = getSeries('s_plays',
                                             begin=beginYTrain,
                                             end=endYTrain)
    artistIdList, valTrainList = getSeries(name,
                                           begin=beginXTrain,
                                           end=endXTrain)
    artistIdList, valTestList = getSeries(name, begin=beginXTest, end=endXTest)

    for i in range(n_artists):
        artistId = artistIdList[i * n_y_days]
        print '[artist]', name, artistId, i