def _analyze(recordIdList): n_artists = get_n_artists() n_days = get_n_days(isX=False, isTrain=False) resultDict = dict() for recordId in recordIdList: resultDict[recordId] = getPredict(recordId) pdf = PdfPages('../report/record.pdf') for i in range(n_artists): fig = plt.figure() ax = plt.axes() ax.xaxis.set_major_formatter(DateFormatter('%m%d')) for recordId in recordIdList: result = resultDict[recordId] dsList = result[:,1] firstDay = datetime.strptime(dsList[0], '%Y%m%d') artist_id = result[i*n_days,0] xData = np.arange(n_days) + date2num(firstDay) yData = result[i*n_days:(i+1)*n_days,2] plt.plot_date(xData, yData, fmt='-', label=recordId) plt.legend(loc='best', shadow=True) plt.xlabel('day') plt.ylabel('plays') plt.title(artist_id) pdf.savefig(fig) plt.close() pdf.close()
def showPredict():
    """Plot real, predicted and training play counts, one PDF page per artist.

    The data comes from ``predict(isOffline=ISOFFLINE)`` and
    ``getPlays(isTrain=True)``; the pages are written to
    ``../report/analyze.pdf``. On every page the real values are drawn in
    green, the predictions in red and the training values in blue.

    ``predict`` only returns data in offline mode (it returns nothing
    otherwise), so ``ISOFFLINE`` has to be true for the unpacking below to
    succeed.

    Returns:
        None.
    """
    n_artists = get_n_artists()
    n_days = get_n_days(isX=False, isTrain=False)
    artistIdList, dsList, yReal, yPredict = predict(isOffline=ISOFFLINE)
    # NOTE(review): the training plays are reshaped with the day count of the
    # prediction window - assumes both windows have the same length; confirm.
    yTrain = getPlays(isTrain=True)
    yTrain = yTrain.reshape(n_artists, n_days)

    # Consecutive day numbers starting at the first date of the window.
    firstDay = datetime.strptime(dsList[0], '%Y%m%d')
    xData = np.arange(n_days) + date2num(firstDay)

    pdf = PdfPages('../report/analyze.pdf')
    try:
        for i in range(n_artists):
            fig = plt.figure()
            try:
                ax = plt.axes()
                ax.xaxis.set_major_formatter(DateFormatter('%m%d'))
                yRealData = yReal[i]
                yPredictData = yPredict[i]
                yTrainData = yTrain[i]
                # artistIdList holds one entry per (artist, day) row, so the
                # first row of artist i sits at index i * n_days.
                artist_id = artistIdList[i * n_days]
                plt.plot_date(xData, yRealData, fmt='-^g', label='real')
                plt.plot_date(xData, yPredictData, fmt='-vr', label='predict')
                plt.plot_date(xData, yTrainData, fmt='-ob', label='train')
                plt.legend(loc='best', shadow=True)
                plt.xlabel('day')
                plt.ylabel('plays')
                plt.title(artist_id)
                pdf.savefig(fig)
            finally:
                # Release the figure even if plotting or saving failed.
                plt.close(fig)
    finally:
        # Always finalize the PDF so the file handle is not leaked.
        pdf.close()
def predict(isOffline=True): artistIdList, dsList, X, y = getFeatures(isTrain=False) pipeline = load('dump/model') n_artists = get_n_artists() n_days = get_n_days(isX=False, isTrain=False) yPredictRaw = pipeline.predict(X) yPredictRaw[yPredictRaw < 0] = 0 yPredictRaw = np.round(yPredictRaw).astype('int64') if isOffline: yPredict = yPredictRaw.reshape(n_artists, n_days) yReal = y.reshape(n_artists, n_days) yImpute = Imputer(missing_values=0).fit_transform(yReal.T).T std = np.sqrt(np.mean(np.power((yPredict - yReal) / yImpute, 2), axis=1)) precision = 1 - std weight = np.sqrt(np.sum(yReal, axis=1)) realScore = np.round(np.dot(precision, weight)).astype('int64') idealScore = np.round(np.sum(weight)).astype('int64') percenctScore = realScore * 1.0 / idealScore indexList = range(n_artists) indexList = sorted(indexList, key=lambda x:precision[x], reverse=True) for i in range(n_artists): print '[predict] [%2d] ARTIST_ID[%32s], WEIGHT[%12.4f], PRECISION[%12.4f]' % (indexList[i]+1, artistIdList[indexList[i]*n_days], weight[indexList[i]], precision[indexList[i]]) print '[CONCLUTION]', realScore, idealScore, percenctScore return artistIdList, dsList, yReal, yPredict else: result = np.hstack((artistIdList.reshape(-1,1), yPredictRaw.reshape(-1,1), dsList.reshape(-1,1))) np.savetxt('../data/mars_tianchi_artist_plays_predict.csv', result, fmt='%s', delimiter=',')
def _analyze(recordIdList): n_artists = get_n_artists() n_days = get_n_days(isX=False, isTrain=False) resultDict = dict() for recordId in recordIdList: resultDict[recordId] = getPredict(recordId) pdf = PdfPages('../report/record.pdf') for i in range(n_artists): fig = plt.figure() ax = plt.axes() ax.xaxis.set_major_formatter(DateFormatter('%m%d')) for recordId in recordIdList: result = resultDict[recordId] dsList = result[:, 1] firstDay = datetime.strptime(dsList[0], '%Y%m%d') artist_id = result[i * n_days, 0] xData = np.arange(n_days) + date2num(firstDay) yData = result[i * n_days:(i + 1) * n_days, 2] plt.plot_date(xData, yData, fmt='-', label=recordId) plt.legend(loc='best', shadow=True) plt.xlabel('day') plt.ylabel('plays') plt.title(artist_id) pdf.savefig(fig) plt.close() pdf.close()
def predict(isOffline=True): artistIdList, dsList, X, y = getFeatures(isTrain=False) pipeline = load('dump/model') n_artists = get_n_artists() n_days = get_n_days(isX=False, isTrain=False) yPredictRaw = pipeline.predict(X) yPredictRaw[yPredictRaw < 0] = 0 yPredictRaw = np.round(yPredictRaw).astype('int64') if isOffline: yPredict = yPredictRaw.reshape(n_artists, n_days) yReal = y.reshape(n_artists, n_days) yImpute = Imputer(missing_values=0).fit_transform(yReal.T).T std = np.sqrt( np.mean(np.power((yPredict - yReal) / yImpute, 2), axis=1)) precision = 1 - std weight = np.sqrt(np.sum(yReal, axis=1)) realScore = np.round(np.dot(precision, weight)).astype('int64') idealScore = np.round(np.sum(weight)).astype('int64') percenctScore = realScore * 1.0 / idealScore indexList = range(n_artists) indexList = sorted(indexList, key=lambda x: precision[x], reverse=True) for i in range(n_artists): print '[predict] [%2d] ARTIST_ID[%32s], WEIGHT[%12.4f], PRECISION[%12.4f]' % ( indexList[i] + 1, artistIdList[indexList[i] * n_days], weight[indexList[i]], precision[indexList[i]]) print '[CONCLUTION]', realScore, idealScore, percenctScore return artistIdList, dsList, yReal, yPredict else: result = np.hstack((artistIdList.reshape(-1, 1), yPredictRaw.reshape(-1, 1), dsList.reshape(-1, 1))) np.savetxt('../data/mars_tianchi_artist_plays_predict.csv', result, fmt='%s', delimiter=',')
def _checkin(recordId):
    """Store the exported predictions in the database under ``recordId``.

    Rows already stored for ``recordId`` are deleted first; then one row is
    inserted per line of ``../data/mars_tianchi_artist_plays_predict.csv``.
    The delete and all inserts are committed together at the very end, and
    the connection is closed whether or not the operation succeeded.

    Args:
        recordId: value written to the ``record_id`` column of every row.

    Returns:
        None.
    """
    db = connect()
    try:
        cursor = db.cursor()
        # NOTE(review): the statements are assembled by string formatting, so
        # a quote inside recordId or the CSV data breaks the SQL. Switch to
        # driver-side parameters once the paramstyle of connect() is confirmed.
        sql = 'delete from mars_tianchi_artist_plays_predict where record_id = \'%s\'' % recordId
        cursor.execute(sql)

        # Prefix every CSV row with the record id (one id per artist/day row).
        n_rows = get_n_artists() * get_n_days(isX=False, isTrain=False)
        recordColumn = np.repeat(recordId, n_rows).reshape(-1, 1)
        predictions = np.loadtxt('../data/mars_tianchi_artist_plays_predict.csv',
                                 dtype='str', delimiter=',')
        result = np.hstack((recordColumn, predictions))

        for line in result:
            # The last two CSV columns are swapped on insert: the third CSV
            # column is written quoted, the second one as a bare number.
            sql = 'insert into mars_tianchi_artist_plays_predict values(\'%s\', \'%s\', \'%s\', %s)' % (
                line[0], line[1], line[3], line[2])
            cursor.execute(sql)
        db.commit()
    finally:
        # Close even on failure so the connection is not leaked; nothing has
        # been committed at that point.
        db.close()
def _checkin(recordId):
    """Store the exported predictions in the database under ``recordId``.

    Rows already stored for ``recordId`` are deleted first; then one row is
    inserted per line of ``../data/mars_tianchi_artist_plays_predict.csv``.
    The delete and all inserts are committed together at the very end, and
    the connection is closed whether or not the operation succeeded.

    Args:
        recordId: value written to the ``record_id`` column of every row.

    Returns:
        None.
    """
    db = connect()
    try:
        cursor = db.cursor()
        # NOTE(review): the statements are assembled by string formatting, so
        # a quote inside recordId or the CSV data breaks the SQL. Switch to
        # driver-side parameters once the paramstyle of connect() is confirmed.
        sql = 'delete from mars_tianchi_artist_plays_predict where record_id = \'%s\'' % recordId
        cursor.execute(sql)

        # Prefix every CSV row with the record id (one id per artist/day row).
        n_rows = get_n_artists() * get_n_days(isX=False, isTrain=False)
        recordColumn = np.repeat(recordId, n_rows).reshape(-1, 1)
        predictions = np.loadtxt('../data/mars_tianchi_artist_plays_predict.csv',
                                 dtype='str', delimiter=',')
        result = np.hstack((recordColumn, predictions))

        for line in result:
            # The last two CSV columns are swapped on insert: the third CSV
            # column is written quoted, the second one as a bare number.
            sql = 'insert into mars_tianchi_artist_plays_predict values(\'%s\', \'%s\', \'%s\', %s)' % (
                line[0], line[1], line[3], line[2])
            cursor.execute(sql)
        db.commit()
    finally:
        # Close even on failure so the connection is not leaked; nothing has
        # been committed at that point.
        db.close()
cursor = db.cursor() sql = 'alter table mars_tianchi_features drop column %s' % name try: cursor.execute(sql) except Exception, e: print 'ignore drop column error !!!' sql = 'alter table mars_tianchi_features add column (%s float)' % name cursor.execute(sql) beginXTrain = getBorder(isBegin=True, isX=True, isTrain=True) endXTrain = getBorder(isBegin=False, isX=True, isTrain=True) beginXTest = getBorder(isBegin=True, isX=True, isTrain=False) endXTest = getBorder(isBegin=False, isX=True, isTrain=False) beginYTrain = getBorder(isBegin=True, isX=False, isTrain=True) endYTrain = getBorder(isBegin=False, isX=False, isTrain=True) n_X_days = get_n_days(isX=True, isTrain=True) n_y_days = get_n_days(isX=False, isTrain=True) n_artists = get_n_artists() n_series = get_n_series() artistIdList, playsTrainList = getSeries('s_plays', begin=beginYTrain, end=endYTrain) artistIdList, valTrainList = getSeries(name, begin=beginXTrain, end=endXTrain) artistIdList, valTestList = getSeries(name, begin=beginXTest, end=endXTest) for i in range(n_artists): artistId = artistIdList[i * n_y_days] print '[artist]', name, artistId, i