def _newRegressor():
    """Build a base regressor with the fixed hyper-parameters used by this experiment."""
    return RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)


def _withPredictions(rows, predsByTag):
    """Return copies of ``rows`` with one prediction per tag (in ``topTags`` order) appended.

    New lists are built so the rows handed back by the split helpers are never
    mutated in place.
    """
    return [list(row) + [predsByTag[tag][i] for tag in topTags] for i, row in enumerate(rows)]


def _selectColumns(rows, columns):
    """Keep, in every row, only the positions of ``all_columns`` that are enabled in ``columns``."""
    keep = [i for i in range(len(all_columns)) if columns[i]]
    return [[row[i] for i in keep] for row in rows]


def _bestModelLabels(targets, predsByTag):
    """For each target, return the index into ``topTags`` of the closest prediction.

    Ties go to the earliest tag, matching a strict ``<`` comparison scan.
    """
    labels = []
    for i, target in enumerate(targets):
        errors = [abs(target - predsByTag[tag][i]) for tag in topTags]
        labels.append(min(range(len(errors)), key=errors.__getitem__))
    return labels


def evalColumns(columns):
    """Evaluate a column mask for the model-selecting classifier, leaving one location out at a time.

    For every location in ``locations``:
      1. one regressor per datagroup in ``topDatagroups`` is fitted on the
         cross-validation training split and predicts the held-out location
         (``testPreds``);
      2. for every other location (``location2``) a regressor per datagroup is
         fitted on the ``trainX1`` split and predicts ``trainX2``; those
         predictions are pooled per tag (``trainPreds``);
      3. each pooled ``trainX2`` row is labelled with the index of the tag whose
         prediction is closest to the target;
      4. a classifier is fitted on the pooled rows (features + per-tag
         predictions, restricted to the enabled ``columns``) to predict that
         label, and its choice on the held-out rows selects which regressor's
         prediction is used as the final prediction.

    Args:
        columns: sequence of truthy/falsy flags, indexed in parallel with
            ``all_columns``; a truthy entry keeps that column as classifier input.

    Returns:
        Element ``[1]`` of ``rmseEval`` computed over the predictions and
        targets pooled across all locations (presumably the RMSE value;
        confirm against ``rmseEval``).
    """
    overallY = []
    overallPred = []

    for location in locations:
        location2s = [l for l in locations if l != location]
        print("Location: " + str(location) + ", location2: " + str(location2s))

        # Base-model predictions for the held-out location, one model per datagroup.
        testPreds = {}
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            trainX, heldOutX, trainY, _ = splitDataForXValidation(
                location, "location", data, features, "target")
            model = _newRegressor()
            model.fit(trainX, trainY)
            testPreds[tag] = model.predict(heldOutX)

        # Base-model predictions on the second training split, pooled over location2.
        trainPreds = defaultdict(list)
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            print("\ttag: " + str(tag) + ", features: " + str(features))
            for location2 in location2s:
                trainX1, trainX2, trainY1, trainY2, diagTestX, diagTestY = splitDataForXValidationSampled2(
                    location, location2, "location", data, features, "target")
                model = _newRegressor()
                model.fit(trainX1, trainY1)
                train1Prediction = model.predict(trainX1)
                train2Prediction = model.predict(trainX2)
                testPrediction = model.predict(diagTestX)
                print("\t\ttrain1 rmse: " + str(rmseEval(trainY1, train1Prediction)[1]))
                print("\t\ttrain2 rmse: " + str(rmseEval(trainY2, train2Prediction)[1]))
                print("\t\ttest rmse: " + str(rmseEval(diagTestY, testPrediction)[1]))
                trainPreds[tag].extend(train2Prediction)

        # Pool the full-feature trainX2 rows and their targets in a single pass
        # (same location2 order as trainPreds, so indices line up).
        combinedTrain2Y = []
        tX2 = []
        for location2 in location2s:
            _, trainX2, _, trainY2, _, _ = splitDataForXValidationSampled2(
                location, location2, "location", data, all_features, "target")
            tX2.extend(trainX2)
            combinedTrain2Y.extend(trainY2)

        labelTrain2Y = _bestModelLabels(combinedTrain2Y, trainPreds)

        # Take testY from the same split that produced testX and testPreds, rather
        # than relying on a value left over from an earlier, unrelated loop.
        _, testX, _, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")

        reducedTrainX2 = _selectColumns(_withPredictions(tX2, trainPreds), columns)
        reducedTestX = _selectColumns(_withPredictions(testX, testPreds), columns)

        model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=15)
        model.fit(reducedTrainX2, labelTrain2Y)
        pred = model.predict(reducedTestX)

        # The classifier output is an index into topTags: use that model's prediction.
        finalPrediction = [testPreds[topTags[pred[i]]][i] for i in range(len(testY))]

        rmse = str(rmseEval(testY, finalPrediction)[1])
        print("\tRMSE: " + str(rmse))

        overallY.extend(testY)
        overallPred.extend(finalPrediction)

    rmse = rmseEval(overallPred, overallY)[1]
    return rmse
# NOTE(review): this line opens with the tail of a writer routine whose header is
# outside this view (presumably writeOutData - confirm). The visible part writes a
# list row as comma-separated str() values, any other row as str(row), ends each
# row with "\n", and closes the output handle.
# The script that follows:
#   * collects, for each tag in top16tags, the first datagroup returned by
#     generateAllDataGroups() whose tag matches (top16datagroups);
#   * resolves all_tags / all_features for the group list ['T', 'W', 'A', 'R', 'L', 'B'];
#   * for each location, splits the data with splitDataForXValidationSampled and
#     writes trainX2 and testX to "z_<int(location)>_trainX.csv" and
#     "z_<int(location)>_testX.csv" under OUTPUT_DIRECTORY via writeOutData.
if isinstance(data[i], list): for j in range(0, len(data[i])): if j != 0: output.write(",") output.write(str(data[i][j])) else: output.write(str(data[i])) output.write("\n") output.close() top16datagroups = [] data_groups = generateAllDataGroups() for tag in top16tags: for datagroup in data_groups: dgtag, _ = getTagAndFeatures(datagroup) if dgtag == tag: top16datagroups.append(datagroup) break all_tags, all_features = getTagAndFeatures(['T', 'W', 'A', 'R', 'L', 'B']) for location in locations: print("Location: " + str(location)) trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled( location, "location", sampleRate, 42, data, all_features, "target") writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_trainX.csv", all_features, trainX2) writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testX.csv", all_features, testX)
# Module-level configuration used by evalColumns:
#   all_features  - the 17 input feature column names
#   topTags       - the four datagroup tags whose models are combined
#   topPreds      - "pred_<tag>" column names, one per entry of topTags
#   locations     - location ids iterated by evalColumns
#   all_columns   - all_features followed by topPreds
#   topDatagroups - for each tag in topTags, the first datagroup returned by
#                   generateAllDataGroups() whose tag matches
# NOTE(review): the evalColumns definition that begins at the end of this line is
# cut off in this view.
all_features = ['hour', 'day_of_week', 'month', 'bank_holiday', 'race_day', 'winddirection', 'windspeed', 'temperature', 'rain', 'pressure', 'atc', 'lane_length', 'length', 'landuse_area', 'leisure_area', 'buildings_area', 'buildings_number'] topTags = ['TW','TWA', 'TWL', 'WA'] topPreds = ["pred_" + tag for tag in topTags] locations = [2.0, 3.0, 4.0, 6.0, 8.0] all_columns = all_features + topPreds topDatagroups = [] data_groups = generateAllDataGroups() for tag in topTags: for datagroup in data_groups: dgtag, _ = getTagAndFeatures(datagroup) if dgtag == tag: topDatagroups.append(datagroup) break def evalColumns(columns): overallY = [] overallPred = [] for location in locations: location2s = [l for l in locations if l != location] print("Location: " + str(location) + ", location2: " + str(location2s)) # generating testPreds
# Script setup: imports the datagroup helpers, fixes the input file and output
# directory, and loads DATA_FILE into `data` / `columns` via loadData (called with
# ['timestamp']; the meaning of that argument is not visible here - TODO confirm).
# It then sets sampleRate to 0.75, derives the tag of every datagroup returned by
# generateAllDataGroups(), lists ten tags in top10tags, and creates two counters
# that default to 0 (overAllFreq, overAllFreqT16).
# NOTE(review): the per-location loop that starts at the end of this line is cut
# off in this view.
from ex27.ex27_lib import generateAllDataGroups, getTagAndFeatures from collections import defaultdict DATA_FILE = "/data/york3_hour_2013.csv" OUTPUT_DIRECTORY = "/experiments/ex27/" locations = [2.0, 3.0, 4.0, 6.0, 8.0] data = {} columns = [] loadData(DATA_FILE, ['timestamp'], data, columns) sampleRate = 0.75 data_groups = generateAllDataGroups() tags = [getTagAndFeatures(datagroup)[0] for datagroup in data_groups] top10tags = ['TW', 'TWA', 'W', 'TWL', 'TWB', 'T', 'WA', 'WB', 'TA', 'A'] overAllFreq = defaultdict(lambda: 0) overAllFreqT16 = defaultdict(lambda: 0) for location in locations: print("Location: " + str(location)) trainPreds = {} testPreds = {} t2Y = None tY = None for datagroup in data_groups: tag, features = getTagAndFeatures(datagroup)