def _new_base_regressor():
    """Return an unfitted base regressor with the fixed hyper-parameters used throughout."""
    return RandomForestRegressor(
        min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)


def _closest_model_labels(targets, predictions_by_tag):
    """Return, per sample, the index into ``topTags`` of the closest prediction.

    "Closest" means smallest absolute difference to the target value.  Ties go
    to the earliest entry of ``topTags``.
    """
    labels = []
    for i, target in enumerate(targets):
        errors = [abs(target - predictions_by_tag[tag][i]) for tag in topTags]
        labels.append(errors.index(min(errors)))
    return labels


def _with_appended_predictions(rows, predictions_by_tag):
    """Return copies of *rows*, each extended with one prediction per ``topTags`` entry.

    The rows are copied first so the lists handed back by the split helpers
    are not modified.
    """
    extended = [list(row) for row in rows]
    for tag in topTags:
        for i, value in enumerate(predictions_by_tag[tag]):
            extended[i].append(value)
    return extended


def _apply_column_mask(rows, columns):
    """Keep, in every row, only the positions of ``all_columns`` flagged truthy in *columns*."""
    kept = [i for i in range(len(all_columns)) if columns[i]]
    return [[row[i] for i in kept] for row in rows]


def evalColumns(columns):
    """Evaluate a column mask for the model-selection classifier.

    For every entry of ``locations`` (used as the held-out location):

    1. One base regressor per ``topDatagroups`` entry is fitted on the
       ``splitDataForXValidation`` training split and predicts the test split.
    2. For every other location, a base regressor is fitted on the first
       training part returned by ``splitDataForXValidationSampled2`` and its
       predictions on the second part are collected per tag.
    3. Each second-part sample is labelled with the index (into ``topTags``)
       of the base model whose prediction was closest to the target.
    4. A ``RandomForestClassifier`` is fitted on the ``all_features`` rows,
       extended with the base predictions and reduced by *columns*, to
       predict that label.
    5. On the test split the classifier picks, per sample, which base model's
       prediction is used as the final prediction.

    Progress and intermediate scores are printed along the way.

    Args:
        columns: Sequence of flags indexed in parallel with ``all_columns``;
            a truthy entry keeps that column as classifier input.
            NOTE(review): ``all_columns`` is presumably ``all_features``
            followed by one column per ``topTags`` entry -- confirm.

    Returns:
        Element ``[1]`` of ``rmseEval`` (named ``rmse`` by this code),
        computed over the held-out targets and final predictions pooled
        across all locations.
    """
    overallY = []
    overallPred = []

    for location in locations:
        location2s = [other for other in locations if other != location]
        print("Location: " + str(location) + ", location2: " + str(location2s))

        # generating testPreds: one base model per data group, predicting the
        # held-out location
        testPreds = {}
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            trainX, testX, trainY, testY = splitDataForXValidation(
                location, "location", data, features, "target")
            model = _new_base_regressor()
            model.fit(trainX, trainY)
            testPreds[tag] = model.predict(testX)

        # generating trainPreds: predictions on the second training part,
        # concatenated over location2s in iteration order
        trainPreds = defaultdict(list)
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            print("\ttag: " + str(tag) + ", features: " + str(features))
            for location2 in location2s:
                trainX1, trainX2, trainY1, trainY2, testX, testY = (
                    splitDataForXValidationSampled2(
                        location, location2, "location", data, features, "target"))
                model = _new_base_regressor()
                model.fit(trainX1, trainY1)
                train1Prediction = model.predict(trainX1)
                train2Prediction = model.predict(trainX2)
                testPrediction = model.predict(testX)
                train1Rmse = str(rmseEval(trainY1, train1Prediction)[1])
                train2Rmse = str(rmseEval(trainY2, train2Prediction)[1])
                testRmse = str(rmseEval(testY, testPrediction)[1])
                print("\t\ttrain1 rmse: " + train1Rmse)
                print("\t\ttrain2 rmse: " + train2Rmse)
                print("\t\ttest rmse: " + testRmse)
                trainPreds[tag].extend(train2Prediction)

        # get combined train2y
        # NOTE(review): this pass, the trainPreds pass above and the trainX2
        # pass below are indexed against each other, so the split helper must
        # hand back the same samples in the same order on every call --
        # confirm it is deterministic.
        combinedTrain2Y = []
        for location2 in location2s:
            # testY from the last split here is what the final scoring uses.
            _, _, _, trainY2, _, testY = splitDataForXValidationSampled2(
                location, location2, "location", data, all_features, "target")
            combinedTrain2Y.extend(trainY2)

        # calculate labels
        labelTrain2Y = _closest_model_labels(combinedTrain2Y, trainPreds)

        # generating testX
        _, testX, _, _ = splitDataForXValidation(
            location, "location", data, all_features, "target")

        # trainX2
        tX2 = []
        for location2 in location2s:
            _, trainX2, _, _, _, _ = splitDataForXValidationSampled2(
                location, location2, "location", data, all_features, "target")
            tX2.extend(trainX2)

        reducedTrainX2 = _apply_column_mask(
            _with_appended_predictions(tX2, trainPreds), columns)
        model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=15)
        model.fit(reducedTrainX2, labelTrain2Y)

        reducedTestX = _apply_column_mask(
            _with_appended_predictions(testX, testPreds), columns)
        pred = model.predict(reducedTestX)

        # Per test sample, take the prediction of the base model the
        # classifier selected.
        finalPrediction = [
            testPreds[topTags[pred[i]]][i] for i in range(len(testY))
        ]

        rmse = rmseEval(testY, finalPrediction)[1]
        print("\tRMSE: " + str(rmse))

        overallY.extend(testY)
        overallPred.extend(finalPrediction)

    # Observations first, matching every other rmseEval call in this function.
    return rmseEval(overallY, overallPred)[1]
pred.append(prediction_twa[i]) return pred allObservations = [] allPredictions = [] allPredictionsTW = [] allPredictionsTWA = [] for location in locations: location2s = [l for l in locations if l != location] log("Location: " + str(location) + ", location2: " + str(location2s)) # tw_4stations trainX, testX, trainY, testY = splitDataForXValidation( location, "location", data, tw_features, "target") allObservations.extend(testY) model = create_model() model.fit(trainX, trainY) predictionTW = model.predict(testX) rmse = rmseEval(testY, predictionTW)[1] log("\tTW:" + str(rmse)) allPredictionsTW.extend(predictionTW) # tw_4stations trainX, testX, trainY, testY = splitDataForXValidation( location, "location", data, twa_features, "target") model = create_model() model.fit(trainX, trainY) predictionTWA = model.predict(testX) rmse = rmseEval(testY, predictionTWA)[1]
for datagroup in data_groups: dgtag, _ = getTagAndFeatures(datagroup) if dgtag == tag: top10datagroups.append(datagroup) break for location in locations: location2 = [l for l in locations if l != location][0] print("Location: " + str(location) + ", location2: " + str(location2)) # generating testPreds testPreds = {} for datagroup in top10datagroups: tag, features = getTagAndFeatures(datagroup) trainX, testX, trainY, testY = splitDataForXValidation( location, "location", data, features, "target") model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42) model.fit(trainX, trainY) prediction = model.predict(testX) testPreds[tag] = prediction trainPreds = {} t2Y = None tY = None for datagroup in top10datagroups: tag, features = getTagAndFeatures(datagroup)
locations = [2.0, 3.0, 4.0, 6.0, 8.0] data = {} columns = ['timestamp', 'location'] loadData(DATA_FILE, [], data, columns) columnsTW = [ 'hour', 'day_of_week', 'month', 'bank_holiday', 'race_day', 'winddirection', 'windspeed', 'temperature', 'rain', 'pressure' ] for location in locations: print("location: " + str(location)) # save down trainX, trainY, testX, testY trainX, testX, trainY, testY = splitDataForXValidation( location, "location", data, columns, "target") print("\t#train: " + str(len(trainY)) + ", #test:" + str(len(testY))) model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42) model.fit(trainX, trainY) testPrediction = model.predict(testX) testRmse = str(rmseEval(testY, testPrediction)[1]) print("\tRFR+All rmse: " + str(testRmse)) trainX, testX, trainY, testY = splitDataForXValidation( location, "location", data, columnsTW, "target") print("\t#train: " + str(len(trainY)) + ", #test:" + str(len(testY))) model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59,
def _fit_and_predict(trainX, trainY, testX):
    """Fit a fresh ``create_model()`` on the training split and return its predictions for *testX*."""
    model = create_model()
    model.fit(trainX, trainY)
    return model.predict(testX)


def eval_one(step):
    """Score one feature selection for the TW/TWA combination classifier.

    For every entry of ``locations`` (used as the held-out location):

    * two regressors (``tw_features`` and ``twa_features``) are fitted on the
      ``splitDataForXValidation`` training split and predict the test split;
    * for every other location, the same two regressors are fitted on the
      split returned by ``splitDataForXValidationForCombination`` and their
      predictions are turned into labels by ``generate_label``; the matching
      rows restricted to the selected features become the classifier's
      training set;
    * a ``create_classifier_model()`` classifier is fitted on that set and
      predicts a label for each held-out sample;
    * ``generate_combined_prediction`` merges the two held-out predictions
      according to the predicted labels.

    Args:
        step: Sequence of flags indexed in parallel with ``all_features``;
            a truthy entry selects that feature as classifier input.

    Returns:
        Tuple ``(rmse, accuracy)``: ``rmse`` is element ``[1]`` of
        ``rmseEval(observations, combined predictions)`` pooled over all
        locations. ``accuracy`` is ``calculate_accuracy`` of the
        ``generate_label`` labels for the held-out samples against the
        classifier's predicted labels.
    """
    eval_features = [
        feature for i, feature in enumerate(all_features) if step[i]
    ]

    allObservations = []
    allPredictions = []
    allLabel = []
    allLabelPrediction = []

    for location in locations:
        location2s = [other for other in locations if other != location]

        # tw_4stations
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, tw_features, "target")
        allObservations.extend(testY)
        predictionTW = _fit_and_predict(trainX, trainY, testX)

        # twa_4stations
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, twa_features, "target")
        predictionTWA = _fit_and_predict(trainX, trainY, testX)

        # combination: build the classifier's training set from the other
        # locations
        classifier_X = []
        classifier_Y = []
        for loc in location2s:
            # tw_3stations
            trainX, testX, trainY, testY = splitDataForXValidationForCombination(
                loc, location, "location", data, tw_features, "target")
            prediction_3station_TW = _fit_and_predict(trainX, trainY, testX)

            # twa_3stations
            trainX, testX, trainY, testY = splitDataForXValidationForCombination(
                loc, location, "location", data, twa_features, "target")
            prediction_3station_TWA = _fit_and_predict(trainX, trainY, testX)

            # Same split again, restricted to the features under evaluation.
            _, testX, _, testY = splitDataForXValidationForCombination(
                loc, location, "location", data, eval_features, "target")
            classifier_X.extend(testX)
            classifier_Y.extend(generate_label(
                testY, prediction_3station_TW, prediction_3station_TWA))

        classifier = create_classifier_model()
        classifier.fit(classifier_X, classifier_Y)

        _, testX, _, testY = splitDataForXValidation(
            location, "location", data, eval_features, "target")
        classifier_prediction = classifier.predict(testX)

        allLabel.extend(generate_label(testY, predictionTW, predictionTWA))
        allLabelPrediction.extend(classifier_prediction)
        allPredictions.extend(generate_combined_prediction(
            classifier_prediction, predictionTW, predictionTWA))

    rmse = rmseEval(allObservations, allPredictions)[1]
    accuracy = calculate_accuracy(allLabel, allLabelPrediction)
    return rmse, accuracy