def eval_one(min_samples_leaf, n_estimators):
    log("min_samples_leaf: " + str(min_samples_leaf) + ", n_estimators: " + str(n_estimators))
    all_observations = []
    all_pred_ALL = []
    for group in range(0, len(groups)):
        trainStations = []
        for i in range(0, len(groups)):
            if i != group:
                trainStations.extend(groups[i])
        testStations = groups[group]
        train_station_set = set([float(s) for s in trainStations])
        test_station_set = set([float(s) for s in testStations])
        trainX, testX, trainY, testY = splitDataForXValidation(
            train_station_set, test_station_set, "location", data, all_features, "target")
        model = RandomForestRegressor(min_samples_leaf=min_samples_leaf,
                                      n_estimators=n_estimators,
                                      n_jobs=-1,
                                      random_state=42)
        model.fit(trainX, trainY)
        prediction_ALL = model.predict(testX)
        rmse = rmseEval(testY, prediction_ALL)[1]
        log("\tALL rmse: " + str(rmse))
        all_observations.extend(testY)
        all_pred_ALL.extend(prediction_ALL)
    rmse = rmseEval(all_observations, all_pred_ALL)[1]
    log("\toverall ALL rmse: " + str(rmse))
    return rmse
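# Hedged usage sketch (not part of the original source): eval_one above takes the
# two random-forest hyper-parameters directly, so a simple grid search could drive
# it like this. The candidate values are illustrative only.
best = None
for min_samples_leaf in [1, 2, 5, 10, 20]:
    for n_estimators in [50, 100, 200, 400, 650]:
        rmse = eval_one(min_samples_leaf, n_estimators)
        if best is None or rmse < best[0]:
            best = (rmse, min_samples_leaf, n_estimators)
log("best (rmse, min_samples_leaf, n_estimators): " + str(best))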
def doPrediction(locations, data, columns, features, columns2, outputFileName):
    predictionData = {}
    for c in columns2:
        predictionData[c] = []
    # modelling
    for location in locations:
        trainX, testX, trainY, testY, dataY = splitDataForXValidation(
            location, "location", data, features, columns, "target")
        print("\tT+W #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
        model = RandomForestRegressor(min_samples_leaf=2,
                                      n_estimators=650,
                                      n_jobs=-1,
                                      random_state=42)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        rmse = rmseEval(testY, prediction)[1]
        print("\trmse: " + str(rmse))
        for c in columns2:
            if c == 'prediction':
                predictionData[c].extend(prediction)
            else:
                predictionData[c].extend(dataY[c])
    for c in predictionData:
        print("\t" + c + " -> #" + str(len(predictionData[c])))
    rmse = rmseEval(predictionData['target'], predictionData['prediction'])[1]
    print("overall RMSE: " + str(rmse))
    print("Writing out results...")
    output = open(outputFileName, 'w')
    output.write(','.join([str(x) for x in columns2]))
    output.write("\n")
    for i in range(0, len(predictionData['target'])):
        output.write(str(predictionData[columns2[0]][i]))
        for j in range(1, len(columns2)):
            output.write(",")
            output.write(str(predictionData[columns2[j]][i]))
        output.write("\n")
    output.close()
    print("Done...")
def eval_one(step):
    if step in cached_results:
        return cached_results[step]
    eval_features = []
    for i in range(0, len(all_features)):
        if step[i]:
            eval_features.append(all_features[i])
    all_predictions = []
    all_observations = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, eval_features, "target")
        model = RandomForestRegressor(min_samples_leaf=2,
                                      random_state=42,
                                      n_estimators=650,
                                      n_jobs=-1)
        model.fit(trainX, trainY)
        predictions = model.predict(testX)
        all_observations.extend(testY)
        all_predictions.extend(predictions)
    rmse = rmseEval(all_observations, all_predictions)[1]
    cached_results[step] = rmse
    # save down the cached result
    cache_output = open(CACHE_FILE, "a")
    step_list = [str(s) for s in step]
    step_str = ",".join(step_list)
    cache_output.write(str(rmse) + ";" + step_str + "\n")
    cache_output.close()
    return rmse
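# Hedged sketch (not shown in the original source): one plausible way to warm
# cached_results from CACHE_FILE at start-up, assuming each line follows the
# "rmse;flag,flag,..." layout written by eval_one above and that steps are used
# as tuples. The flag parsing accepts both "True"/"False" and "1"/"0" encodings,
# since the exact type of the step elements is not visible here.
import os

def load_cached_results(cache_file, cached_results):
    if not os.path.exists(cache_file):
        return
    with open(cache_file) as cache_input:
        for line in cache_input:
            line = line.strip()
            if not line:
                continue
            rmse_str, step_str = line.split(";")
            step = tuple(flag in ("True", "1") for flag in step_str.split(","))
            cached_results[step] = float(rmse_str)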
def evalOne(parameters):
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        if "depth" in parameters:
            model = RandomForestRegressor(max_depth=parameters["depth"],
                                          random_state=42,
                                          n_estimators=parameters["n_estimators"],
                                          n_jobs=-1)
        elif "leaf" in parameters:
            model = RandomForestRegressor(min_samples_leaf=parameters["leaf"],
                                          random_state=42,
                                          n_estimators=parameters["n_estimators"],
                                          n_jobs=-1)
        elif "max_leaf" in parameters:
            model = RandomForestRegressor(max_leaf_nodes=parameters["max_leaf"],
                                          random_state=42,
                                          n_estimators=parameters["n_estimators"],
                                          n_jobs=-1)
        else:
            raise ValueError("parameters must contain 'depth', 'leaf' or 'max_leaf'")
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        all_obs.extend(testY)
        all_pred.extend(prediction)
    return rmseEval(all_obs, all_pred)[1]
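# Hedged usage sketch: evalOne above expects exactly one of "depth", "leaf" or
# "max_leaf" alongside "n_estimators". The values below are illustrative only.
for parameters in [{"depth": 10, "n_estimators": 650},
                   {"leaf": 2, "n_estimators": 650},
                   {"max_leaf": 500, "n_estimators": 650}]:
    print(str(parameters) + " -> RMSE: " + str(evalOne(parameters)))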
def evalTrainStationTestStation(trainStation, testStation, features):
    trainX, _, trainY, _ = splitDataForXValidation(
        set([trainStation]), set(), "location", dataByStation[trainStation], features, "target")
    _, testX2, _, testY2 = splitDataForXValidation(
        set(), set([testStation]), "location", dataByStation[testStation], features, "target")
    model = RandomForestRegressor(max_depth=10, n_estimators=60, n_jobs=-1, random_state=42)
    model.fit(trainX, trainY)
    prediction = model.predict(testX2)
    rmse = rmseEval(testY2, prediction)[1]
    print("Training on station " + str(trainStation) + ", applying on station "
          + str(testStation) + ": rmse: " + str(rmse))
    return rmse
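# Hedged sketch (not in the original source): evaluating every ordered
# (train, test) station pair with the function above yields a simple
# transferability matrix of RMSE values. It assumes the dataByStation keys list
# the available stations and that a features list is already in scope.
pairwise_rmse = {}
for trainStation in dataByStation:
    for testStation in dataByStation:
        if trainStation == testStation:
            continue
        pairwise_rmse[(trainStation, testStation)] = evalTrainStationTestStation(
            trainStation, testStation, features)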
def evalOne(parameters):
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        normalizer_X = StandardScaler()
        trainX = normalizer_X.fit_transform(trainX)
        testX = normalizer_X.transform(testX)
        normalizer_Y = StandardScaler()
        trainY = normalizer_Y.fit_transform(trainY)
        testY = normalizer_Y.transform(testY)
        layers = []
        for _ in range(0, parameters["hidden_layers"]):
            layers.append(Layer(parameters["hidden_type"], units=parameters["hidden_neurons"]))
        layers.append(Layer("Linear"))
        model = Regressor(layers=layers,
                          learning_rate=parameters["learning_rate"],
                          n_iter=parameters["iteration"],
                          random_state=42)
        X = np.array(trainX)
        y = np.array(trainY)
        model.fit(X, y)
        prediction = model.predict(testX)
        prediction = normalizer_Y.inverse_transform(prediction)
        testY = normalizer_Y.inverse_transform(testY)
        print("location: " + str(location) + " -> " + str(rmseEval(prediction, testY)[1]))
        all_obs.extend(testY)
        all_pred.extend(prediction)
    return rmseEval(all_obs, all_pred)[1]
def rf(week, timestampWeekCategory, stationNames, ospmData2013, ospmData2014, data2013, data2014):
    columns = []
    for c in data2013:
        columns.append(c)
    columns.remove("location")
    columns.remove("timestamp")
    columns.remove("target")
    X = []
    y = []
    for i in range(0, len(data2013["target"])):
        timestamp = str(int(data2013["timestamp"][i]))
        weekC = timestampWeekCategory[timestamp]
        if int(weekC) >= week:
            y.append(data2013["target"][i])
            x = []
            for c in columns:
                x.append(data2013[c][i])
            X.append(x)
    model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
    model.fit(X, y)
    # print(str(len(X)))
    X = []
    y = []
    for i in range(0, len(data2014["target"])):
        y.append(data2014["target"][i])
        x = []
        for c in columns:
            x.append(data2014[c][i])
        X.append(x)
    prediction = model.predict(X)
    rmse = rmseEval(y, prediction)
    return rmse
def evalOne(parameters):
    all_obs = []
    all_pred = []
    # all_obs_train = []
    # all_pred_train = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        model = KNeighborsRegressor(weights=parameters["weights"],
                                    n_neighbors=parameters["neighbors"],
                                    p=parameters["p"])
        model.fit(trainX, trainY)
        # train_prediction = model.predict(trainX)
        prediction = model.predict(testX)
        all_obs.extend(testY)
        all_pred.extend(prediction)
        # all_obs_train.extend(trainY)
        # all_pred_train.extend(train_prediction)
    return rmseEval(all_obs, all_pred)[1]
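# Hedged usage sketch: this evalOne variant reads "weights", "neighbors" and "p"
# from the parameters dict; the candidate values below are illustrative only.
for parameters in [{"weights": "uniform", "neighbors": 5, "p": 2},
                   {"weights": "distance", "neighbors": 10, "p": 1}]:
    print(str(parameters) + " -> RMSE: " + str(evalOne(parameters)))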
def eval_one(features):
    all_predictions = []
    all_observations = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, features, "target")
        model = RandomForestRegressor(min_samples_leaf=2,
                                      random_state=42,
                                      n_estimators=650,
                                      n_jobs=-1)
        model.fit(trainX, trainY)
        predictions = model.predict(testX)
        all_observations.extend(testY)
        all_predictions.extend(predictions)
    rmse = rmseEval(all_observations, all_predictions)[1]
    log("\tRMSE: " + str(rmse))
def evalOne(parameters):
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        normalizer_X = StandardScaler()
        trainX = normalizer_X.fit_transform(trainX)
        testX = normalizer_X.transform(testX)
        normalizer_Y = StandardScaler()
        trainY = normalizer_Y.fit_transform(trainY)
        testY = normalizer_Y.transform(testY)
        model = BaggingRegressor(base_estimator=SVR(kernel='rbf',
                                                    C=parameters["C"],
                                                    cache_size=5000),
                                 max_samples=parameters["max_samples"],
                                 n_estimators=parameters["n_estimators"],
                                 verbose=0,
                                 n_jobs=-1)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        prediction = normalizer_Y.inverse_transform(prediction)
        testY = normalizer_Y.inverse_transform(testY)
        all_obs.extend(testY)
        all_pred.extend(prediction)
    return rmseEval(all_obs, all_pred)[1]
def evaluateFeatures(vector, features, data):
    featureToUse = []
    for i in range(len(vector)):
        if vector[i] == 1:
            featureToUse.append(features[i])
    combinedRmse = []
    # modelling
    for location in locationValues:
        trainX, testX, trainY, testY = splitDataForXValidation2(
            location, "location", data, featureToUse, "target")
        model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        rmse = rmseEval(testY, prediction)
        combinedRmse.append(rmse[1])
    # calculate avg rmse
    avgRmse = 0.0
    for rmse in combinedRmse:
        avgRmse = avgRmse + rmse
    avgRmse = avgRmse / len(combinedRmse)
    return avgRmse
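# Hedged usage sketch: evaluateFeatures above takes a 0/1 mask over the feature
# list, so comparing "all features" with a hand-picked subset could look like
# this (the subset names are illustrative only).
all_on = [1] * len(features)
subset = [1 if f in ("hour", "temperature", "windspeed") else 0 for f in features]
print("all features -> avg RMSE: " + str(evaluateFeatures(all_on, features, data)))
print("subset       -> avg RMSE: " + str(evaluateFeatures(subset, features, data)))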
allPredictionsTW = []
allPredictionsTWA = []
for location in locations:
    location2s = [l for l in locations if l != location]
    log("Location: " + str(location) + ", location2: " + str(location2s))

    # tw_4stations
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, tw_features, "target")
    allObservations.extend(testY)
    model = create_model()
    model.fit(trainX, trainY)
    predictionTW = model.predict(testX)
    rmse = rmseEval(testY, predictionTW)[1]
    log("\tTW:" + str(rmse))
    allPredictionsTW.extend(predictionTW)

    # twa_4stations
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, twa_features, "target")
    model = create_model()
    model.fit(trainX, trainY)
    predictionTWA = model.predict(testX)
    rmse = rmseEval(testY, predictionTWA)[1]
    log("\tTWA:" + str(rmse))
    allPredictionsTWA.extend(predictionTWA)

    # combination
    classifier_X = []
columns = []
loadData("/data/york_hour_2013.csv", ["timestamp", "atc"], data, columns)
print(str(columns))

all_features = deepcopy(columns)
all_features.remove("target")
all_features.remove("location")
# remove to decrease rmse from 10000000.0...
all_features.remove('buildings_area')
all_features.remove('leisure_area')

all_obs = []
all_pred = []

for location in locations:
    print("Location: " + str(location))
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, all_features, "target")
    model = linear_model.LinearRegression(fit_intercept=True, normalize=True, copy_X=True, n_jobs=-1)
    model.fit(trainX, trainY)
    prediction = model.predict(testX)
    rmse = str(rmseEval(testY, prediction)[1])
    print("\tRmse:" + rmse)
    all_obs.extend(testY)
    all_pred.extend(prediction)

print("Overall:")
rmse = str(rmseEval(all_obs, all_pred)[1])
print("Rmse:" + rmse)
def eval_one(step):
    if step in cached_results:
        return cached_results[step]
    eval_features = []
    for i in range(0, len(all_features)):
        if step[i]:
            eval_features.append(all_features[i])
    all_observations = []
    all_pred_combined = []
    for group in range(0, len(groups)):
        train_stations, test_stations = generate_train_test_station_list(group, groups)
        train_station_set = set([float(s) for s in train_stations])
        test_station_set = set([float(s) for s in test_stations])
        train_lower = [float(train_stations[i]) for i in range(0, len(train_stations))
                       if i < (len(train_stations) / 2.0)]
        train_lower_set = set(train_lower)
        train_upper = [float(train_stations[i]) for i in range(0, len(train_stations))
                       if i >= (len(train_stations) / 2.0)]
        train_upper_set = set(train_upper)
        test_lower = [float(test_stations[i]) for i in range(0, len(test_stations))
                      if i < (len(test_stations) / 2.0)]

        # tw_lower
        trainX, testX, trainY, testY = splitDataForXValidation(
            train_lower_set, test_station_set, "location", data, tw_features, "target")
        model = create_model()
        model.fit(trainX, trainY)
        prediction_lower = model.predict(testX)

        # tw_upper
        trainX, testX, trainY, testY = splitDataForXValidation(
            train_upper_set, test_station_set, "location", data, tw_features, "target")
        model = create_model()
        model.fit(trainX, trainY)
        prediction_upper = model.predict(testX)

        trainX, testX, trainY, testY, train_location, test_location = splitDataForXValidationWithLocation(
            train_station_set, test_station_set, "location", data, eval_features, "target")
        train_label = generate_label(train_location, train_lower)
        test_label = generate_label(test_location, test_lower)
        model = create_classifier_model()
        model.fit(trainX, train_label)
        prediction_label = model.predict(testX)
        pred_combined = generate_combined_prediction(prediction_label, prediction_lower, prediction_upper)
        all_pred_combined.extend(pred_combined)
        all_observations.extend(testY)

    rmse = rmseEval(all_observations, all_pred_combined)[1]
    cached_results[step] = rmse
    # save down the cached result
    cache_output = open(CACHE_FILE, "a")
    step_list = [str(s) for s in step]
    step_str = ",".join(step_list)
    cache_output.write(str(rmse) + ";" + step_str + "\n")
    cache_output.close()
    return rmse
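# Hypothetical sketches of two helpers used above; their definitions are not
# shown in this section, so both the signatures and the label encoding are
# assumptions inferred from how they are called here (a 1/0 label marking whether
# a row's station belongs to the "lower" half of the training stations).
def generate_label(location_list, lower_stations):
    # 1 if the row's station is among the lower-half stations, else 0
    return [1 if loc in lower_stations else 0 for loc in location_list]

def generate_combined_prediction(prediction_label, prediction_lower, prediction_upper):
    # take the lower-half model's prediction where the classifier says "lower",
    # otherwise fall back to the upper-half model
    return [prediction_lower[i] if prediction_label[i] == 1 else prediction_upper[i]
            for i in range(len(prediction_label))]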
log("\ttrainStations: " + str(trainStations)) log("\ttestStations: " + str(testStations)) train_station_set = set([float(s) for s in trainStations]) test_station_set = set([float(s) for s in testStations]) trainX, testX, trainY, testY = splitDataForXValidation( train_station_set, test_station_set, "location", data, tw_features, "target") model = RandomForestRegressor(min_samples_leaf=29, n_estimators=64, n_jobs=-1, random_state=42) model.fit(trainX, trainY) prediction_TW = model.predict(testX) rmse = rmseEval(testY, prediction_TW)[1] log("\tTW rmse: " + str(rmse)) all_observations.extend(testY) all_pred_TW.extend(prediction_TW) trainX, testX, trainY, testY = splitDataForXValidation( train_station_set, test_station_set, "location", data, twa_features, "target") model = RandomForestRegressor(min_samples_leaf=29, n_estimators=64, n_jobs=-1, random_state=42) model.fit(trainX, trainY) prediction_TWA = model.predict(testX) rmse = rmseEval(testY, prediction_TWA)[1] log("\tTWA rmse: " + str(rmse))
errorsTimestamps = {}
eData = {}
for m in models:
    errors = {}
    for l in locations:
        errors[l] = 0
    errorsTimestamps[m] = set()
    records = len(data[m]['target'])
    print("Overall:")
    print("\t" + "#records: " + str(records))
    rmse = rmseEval(data[m]['target'], data[m]['prediction'])[1]
    print("\t" + "rmse: " + str(rmse))
    absoluteError = ae(data[m]['target'], data[m]['prediction'])
    absoluteError.sort()
    eData[m] = absoluteError
    # error without records that have ae > 20.0
    data2 = {}
    for c in columns[m]:
        data2[c] = []
    for i in range(0, records):
        if abs(data[m]['target'][i] - data[m]['prediction'][i]) < 20.0:
            for c in columns[m]:
trainStationList = [s for s in all_stations if float(s) in trainStations]
log(output_log, "\ttrainStationList:" + str(trainStationList))
testStations = set(float(station) for station in testStationList)

trainX, testX, trainY, testY = splitDataForXValidation(
    trainStations, testStations, "location", data, features_TW, "target")
log(output_log, "\tTW #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
model.fit(trainX, trainY)
prediction_TW = model.predict(testX)
rmse = rmseEval(testY, prediction_TW)[1]
log(output_log, "\trmse: " + str(rmse))
obs.extend(testY)
all_pred_TW.extend(prediction_TW)

trainX, testX, trainY, testY, trainLocation, testLocation = splitDataForXValidationWithLocation(
    trainStations, testStations, "location", data, columns, "target")
train_lower = [float(trainStationList[i]) for i in range(0, len(trainStationList))
               if i < (len(trainStationList) / 2.0)]
train_upper = [float(trainStationList[i]) for i in range(0, len(trainStationList))
               if i >= (len(trainStationList) / 2.0)]
        predictionsNormal[method].append(p)
        observationsNormal[method].append(o)

rmseLevels = {}
maeLevels = {}
rLevels = {}
fac2Levels = {}
nmseLevels = {}
fbLevels = {}
rsLevels = {}
mgLevels = {}
vgLevels = {}

for method in methods:
    print("Method: " + method)
    rmse = rmseEval(observations[method], predictions[method])[1]
    print("\trmse: " + str(rmse))
    mae = maeEval(observations[method], predictions[method])[1]
    print("\tmae: " + str(mae))
    r = correlationEval(observations[method], predictions[method])[1]
    print("\tr: " + str(r))
    print("\tr2: " + str(rsquaredEval(observations[method], predictions[method])[1]))
    print("\tr2: " + str(r2_score(observations[method], predictions[method])))
    fac2 = fac2Eval(observations[method], predictions[method])
    print("\tfac2: " + str(fac2))
    print("\tmg: " + str(mgEval(observations[method], predictions[method])))
    nmse = nmse_from_paper(observations[method], predictions[method])
    print("\tnmse: " + str(nmse))
    fb = fbEval(observations[method], predictions[method])[1]
    print("\tfb: " + str(fb))
def doEval(landuse, topo, traffic_static, traffic_dynamic, weather, time, output):
    if not (landuse or topo or traffic_dynamic or traffic_static or weather or time):
        return
    groupName = ("lu" + ("1" if landuse else "0")
                 + "to" + ("1" if topo else "0")
                 + "ts" + ("1" if traffic_static else "0")
                 + "td" + ("1" if traffic_dynamic else "0")
                 + "we" + ("1" if weather else "0")
                 + "ti" + ("1" if time else "0"))
    print("Group: " + groupName)
    features = []
    if landuse:
        features.extend(['leisure_area', 'landuse_area'])
    if topo:
        features.extend(['buildings_number', 'buildings_area'])
    if traffic_static:
        features.extend(['lane_length', 'length'])
    if traffic_dynamic:
        features.extend(['traffic_length_car', 'traffic_length_lgv', 'traffic_length_hgv'])
    if weather:
        features.extend(['winddirection', 'windspeed', 'temperature', 'rain', 'pressure'])
    if time:
        features.extend(['hour', 'day_of_week', 'month', 'bank_holiday', 'race_day'])
    all_obs = []
    all_prediction = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, features, "target")
        model = RandomForestRegressor(min_samples_leaf=2, random_state=42, n_estimators=650, n_jobs=-1)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        all_obs.extend(testY)
        all_prediction.extend(prediction)
    rmse = rmseEval(all_obs, all_prediction)[1]
    output.write(str(groupName) + "," + str(rmse) + "\n")
    output.flush()
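# Hedged driver sketch (not shown in the original source): doEval takes six
# booleans, so all land-use/topography/traffic/weather/time combinations can be
# swept by enumerating the 2^6 flag tuples. The output file name is hypothetical.
from itertools import product

with open("group_rmse.csv", "w") as output:
    for flags in product([False, True], repeat=6):
        doEval(*flags, output=output)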
def doEval(landuse, topo, traffic_static, traffic_dynamic, weather, time, output):
    if not (landuse or topo or traffic_dynamic or traffic_static or weather or time):
        return
    groupName = ("lu" + ("1" if landuse else "0")
                 + "to" + ("1" if topo else "0")
                 + "ts" + ("1" if traffic_static else "0")
                 + "td" + ("1" if traffic_dynamic else "0")
                 + "we" + ("1" if weather else "0")
                 + "ti" + ("1" if time else "0"))
    print("Group: " + groupName)
    columnsToSkip = ['timestamp']
    if not landuse:
        columnsToSkip.extend(['leisure_area', 'landuse_area'])
    if not topo:
        columnsToSkip.extend(['buildings_number', 'buildings_area'])
    if not traffic_static:
        columnsToSkip.extend(['lane_length', 'length'])
    if not traffic_dynamic:
        columnsToSkip.append('atc')
    if not weather:
        columnsToSkip.extend(['winddirection', 'windspeed', 'temperature', 'rain', 'pressure'])
    if not time:
        columnsToSkip.extend(['hour', 'day_of_week', 'month', 'bank_holiday', 'race_day'])
    columns = []
    data = {}
    loadData(DATA_FILE, columnsToSkip, data, columns)
    # modelling
    for location in locations:
        print("Location: " + str(location))
        trainX, testX, trainY, testY = splitDataForXValidation1(
            location, "location", data, columns, "target")
        print("\tRFR #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
        model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        rmse = rmseEval(testY, prediction)[1]
        print("\trmse: " + str(rmse))
        output.write(str(groupName) + "," + str(rmse) + "\n")
def evalColumns(columns):
    # log("Evaluating " + str([all_columns[i] for i in range(0, len(all_columns)) if columns[i]]))
    overallY = []
    overallPred = []
    for location in locations:
        trainX = loadX(INPUT_DIRECTORY + "z_" + str(int(location)) + "_trainX.csv", all_features)
        trainY = loadSingleColumnsFile(INPUT_DIRECTORY + "z_" + str(int(location)) + "_trainY.csv")
        trainPreds = []
        for tag in top4tags:
            p = loadSingleColumnsFile(INPUT_DIRECTORY + "z_" + str(int(location)) + "_trainPred_" + tag + ".csv")
            for i in range(0, len(p)):
                trainX[i].append(p[i])
            trainPreds.append(p)
        labelY = []
        for i in range(0, len(trainY)):
            bestAbs = abs(trainY[i] - trainPreds[0][i])
            bestIndex = 0
            for j in range(0, len(top4tags)):
                modelAbs = abs(trainY[i] - trainPreds[j][i])
                if modelAbs < bestAbs:
                    bestAbs = modelAbs
                    bestIndex = j
            labelY.append(bestIndex)
        # reduce trainX
        reducedTrainX = []
        for d in trainX:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTrainX.append(reducedD)
        model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=15)
        model.fit(reducedTrainX, labelY)
        testX = loadX(INPUT_DIRECTORY + "z_" + str(int(location)) + "_testX.csv", all_features)
        testY = loadSingleColumnsFile(INPUT_DIRECTORY + "z_" + str(int(location)) + "_testY.csv")
        testPreds = []
        for tag in top4tags:
            p = loadSingleColumnsFile(INPUT_DIRECTORY + "z_" + str(int(location)) + "_testPred_" + tag + ".csv")
            for i in range(0, len(p)):
                testX[i].append(p[i])
            testPreds.append(p)
        reducedTestX = []
        for d in testX:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTestX.append(reducedD)
        testPredY = model.predict(reducedTestX)
        prediction = []
        for i in range(0, len(testPredY)):
            p = testPreds[testPredY[i]][i]
            prediction.append(p)
        overallY = overallY + testY
        overallPred = overallPred + prediction
    rmse = rmseEval(overallPred, overallY)[1]
    return rmse
print("\ttrainStationList:" + str(trainStationList)) trainStationList = [s for s in all_stations if float(s) in trainStations] print("\ttrainStationList:" + str(trainStationList)) testStations = set(float(station) for station in testStationList) trainX, testX, trainY, testY = splitDataForXValidation( trainStations, testStations, "location", data, features_TW, "target") print("\tTW #train: " + str(len(trainY)) + ", #test:" + str(len(testY))) model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42) model.fit(trainX, trainY) prediction = model.predict(testX) rmse = rmseEval(testY, prediction)[1] print("\trmse: " + str(rmse)) # # trainX, testX, trainY, testY = splitDataForXValidation(trainStations, testStations, "location", data, features_TWA, "target") # print("\tTWA #train: " + str(len(trainY)) + ", #test:" + str(len(testY))) # model = RandomForestRegressor(min_samples_leaf = 9, n_estimators = 59, n_jobs = -1, random_state=42) # model.fit(trainX, trainY) # prediction = model.predict(testX) # rmse = rmseEval(testY, prediction)[1] # print("\trmse: " + str(rmse)) # # trainX, testX, trainY, testY = splitDataForXValidation(trainStations, testStations, "location", data, features_ALL, "target") # print("\tALL #train: " + str(len(trainY)) + ", #test:" + str(len(testY))) # model = RandomForestRegressor(min_samples_leaf = 9, n_estimators = 59, n_jobs = -1, random_state=42) # model.fit(trainX, trainY) # prediction = model.predict(testX)
def eval_one(step):
    eval_features = []
    for i in range(0, len(all_features)):
        if step[i]:
            eval_features.append(all_features[i])
    all_observations = []
    all_pred_combined = []
    all_label = []
    all_pred_label = []
    for group in range(0, len(groups)):
        train_stations, test_stations = generate_train_test_station_list(group, groups)
        train_station_set = set([float(s) for s in train_stations])
        test_station_set = set([float(s) for s in test_stations])

        trainX, testX, trainY, testY = splitDataForXValidation(
            train_station_set, test_station_set, "location", data, tw_features, "target")
        model = RandomForestRegressor(min_samples_leaf=29, n_estimators=64, n_jobs=-1, random_state=42)
        model.fit(trainX, trainY)
        prediction_TW = model.predict(testX)
        rmse = rmseEval(testY, prediction_TW)[1]
        all_observations.extend(testY)

        trainX, testX, trainY, testY = splitDataForXValidation(
            train_station_set, test_station_set, "location", data, twa_features, "target")
        model = RandomForestRegressor(min_samples_leaf=29, n_estimators=64, n_jobs=-1, random_state=42)
        model.fit(trainX, trainY)
        prediction_TWA = model.predict(testX)
        rmse = rmseEval(testY, prediction_TWA)[1]

        group2s = [groups[i] for i in range(0, len(groups)) if i != group]

        # combination
        classifier_X = []
        classifier_Y = []
        for group2 in range(0, len(group2s)):
            train_stations, test_stations = generate_train_test_station_list(group2, group2s)
            train_station_set = set([float(s) for s in train_stations])
            test_station_set = set([float(s) for s in test_stations])

            trainX, testX, trainY, testY = splitDataForXValidation(
                train_station_set, test_station_set, "location", data, tw_features, "target")
            model = RandomForestRegressor(min_samples_leaf=29, n_estimators=64, n_jobs=-1, random_state=42)
            model.fit(trainX, trainY)
            prediction_3groups_TW = model.predict(testX)

            trainX, testX, trainY, testY = splitDataForXValidation(
                train_station_set, test_station_set, "location", data, twa_features, "target")
            model = RandomForestRegressor(min_samples_leaf=29, n_estimators=64, n_jobs=-1, random_state=42)
            model.fit(trainX, trainY)
            prediction_3groups_TWA = model.predict(testX)

            trainX, testX, trainY, testY = splitDataForXValidation(
                train_station_set, test_station_set, "location", data, eval_features, "target")
            classifier_X.extend(testX)
            label = generate_label(testY, prediction_3groups_TW, prediction_3groups_TWA)
            classifier_Y.extend(label)

        train_stations, test_stations = generate_train_test_station_list(group, groups)
        train_station_set = set([float(s) for s in train_stations])
        test_station_set = set([float(s) for s in test_stations])
        model = create_classifier_model()
        model.fit(classifier_X, classifier_Y)
        _, testX, _, testY = splitDataForXValidation(
            train_station_set, test_station_set, "location", data, eval_features, "target")
        classifier_prediction = model.predict(testX)
        test_label = generate_label(testY, prediction_TW, prediction_TWA)
        all_label.extend(test_label)
        all_pred_label.extend(classifier_prediction)
        combined_prediction = generate_combined_prediction(classifier_prediction, prediction_TW, prediction_TWA)
        rmse = rmseEval(testY, combined_prediction)[1]
        all_pred_combined.extend(combined_prediction)

    rmse = rmseEval(all_observations, all_pred_combined)[1]
    accuracy = calculate_accuracy(all_label, all_pred_label)
    return rmse, accuracy
def evalColumns(columns):
    overallY = []
    overallPred = []
    for location in locations:
        location2s = [l for l in locations if l != location]
        print("Location: " + str(location) + ", location2: " + str(location2s))

        # generating testPreds
        testPreds = {}
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            trainX, testX, trainY, testY = splitDataForXValidation(
                location, "location", data, features, "target")
            model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
            model.fit(trainX, trainY)
            prediction = model.predict(testX)
            testPreds[tag] = prediction

        trainPreds = defaultdict(list)
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            print("\ttag: " + str(tag) + ", features: " + str(features))
            for location2 in location2s:
                trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(
                    location, location2, "location", data, features, "target")
                model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
                model.fit(trainX1, trainY1)
                train1Prediction = model.predict(trainX1)
                train2Prediction = model.predict(trainX2)
                testPrediction = model.predict(testX)
                train1Rmse = str(rmseEval(trainY1, train1Prediction)[1])
                train2Rmse = str(rmseEval(trainY2, train2Prediction)[1])
                testRmse = str(rmseEval(testY, testPrediction)[1])
                print("\t\ttrain1 rmse: " + train1Rmse)
                print("\t\ttrain2 rmse: " + train2Rmse)
                print("\t\ttest rmse: " + testRmse)
                for x in train2Prediction:
                    trainPreds[tag].append(x)

        # get combined train2y
        combinedTrain2Y = []
        for location2 in location2s:
            trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(
                location, location2, "location", data, all_features, "target")
            combinedTrain2Y = combinedTrain2Y + trainY2

        # calculate labels
        labelTrain2Y = []
        for i in range(0, len(combinedTrain2Y)):
            bestModel = 0
            bestAbs = abs(combinedTrain2Y[i] - trainPreds[topTags[0]][i])
            for j in range(0, len(topTags)):
                tag = topTags[j]
                modelAbs = abs(combinedTrain2Y[i] - trainPreds[tag][i])
                if modelAbs < bestAbs:
                    bestAbs = modelAbs
                    bestModel = j
            labelTrain2Y.append(bestModel)

        # generating testX
        _, testX, _, _ = splitDataForXValidation(location, "location", data, all_features, "target")

        # trainX2
        tX2 = []
        for location2 in location2s:
            _, trainX2, _, _, _, _ = splitDataForXValidationSampled2(
                location, location2, "location", data, all_features, "target")
            for row in trainX2:
                tX2.append(row)
        for tag in topTags:
            for i in range(0, len(trainPreds[tag])):
                tX2[i].append(trainPreds[tag][i])

        reducedTrainX2 = []
        for d in tX2:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTrainX2.append(reducedD)

        model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=15)
        model.fit(reducedTrainX2, labelTrain2Y)

        for tag in topTags:
            for i in range(0, len(testPreds[tag])):
                testX[i].append(testPreds[tag][i])

        reducedTestX = []
        for d in testX:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTestX.append(reducedD)

        pred = model.predict(reducedTestX)
        finalPrediction = []
        for i in range(0, len(testY)):
            p = testPreds[topTags[pred[i]]][i]
            finalPrediction.append(p)
        rmse = str(rmseEval(testY, finalPrediction)[1])
        print("\tRMSE: " + str(rmse))
        for x in testY:
            overallY.append(x)
        for x in finalPrediction:
            overallPred.append(x)

    rmse = rmseEval(overallPred, overallY)[1]
    return rmse
                testY[i] - predData[tag][str(location)][str(int(testTimestamp[i]))])
            if tagAbs < bestAbs:
                bestModel = tag
                bestAbs = tagAbs
        locationBestCounter[bestModel] = locationBestCounter[bestModel] + 1
        twPred = predData["TW"][str(location)][str(int(testTimestamp[i]))]
        twPredictions.append(twPred)
        bestPred = predData[bestModel][str(location)][str(int(testTimestamp[i]))]
        bestPredictions.append(bestPred)

    # print(str(locationBestCounter))
    rmse = rmseEval(testY, twPredictions)[1]
    print("\tTW rmse: " + str(rmse))
    rmse = rmseEval(testY, bestPredictions)[1]
    print("\tBest rmse: " + str(rmse))
    for tag in tags:
        bestCounter[tag] = bestCounter[tag] + locationBestCounter[tag]

print("BestCounter:")
orderedBestCounter = []
for tag in tags:
    orderedBestCounter.append((bestCounter[tag], tag))
orderedBestCounter.sort(reverse=True)
for t in orderedBestCounter:
    print("\t" + t[1] + ": " + str(t[0]))
def eval_one(step):
    eval_features = []
    for i in range(0, len(all_features)):
        if step[i]:
            eval_features.append(all_features[i])
    all_observations = []
    all_pred_combined = []
    Y = []
    P = []
    for group in range(0, len(groups)):
        train_stations, test_stations = generate_train_test_station_list(group, groups)
        train_station_set = set([float(s) for s in train_stations])
        test_station_set = set([float(s) for s in test_stations])
        train_lower = [float(train_stations[i]) for i in range(0, len(train_stations))
                       if i < (len(train_stations) / 2.0)]
        train_lower_set = set(train_lower)
        train_upper = [float(train_stations[i]) for i in range(0, len(train_stations))
                       if i >= (len(train_stations) / 2.0)]
        train_upper_set = set(train_upper)
        test_lower = [float(test_stations[i]) for i in range(0, len(test_stations))
                      if i < (len(test_stations) / 2.0)]

        # tw_lower
        trainX, testX, trainY, testY = splitDataForXValidation(
            train_lower_set, test_station_set, "location", data, tw_features, "target")
        model = create_model()
        model.fit(trainX, trainY)
        prediction_lower = model.predict(testX)

        # tw_upper
        trainX, testX, trainY, testY = splitDataForXValidation(
            train_upper_set, test_station_set, "location", data, tw_features, "target")
        model = create_model()
        model.fit(trainX, trainY)
        prediction_upper = model.predict(testX)

        trainX, testX, trainY, testY, train_location, test_location = splitDataForXValidationWithLocation(
            train_station_set, test_station_set, "location", data, eval_features, "target")
        train_label = generate_label(train_location, train_lower)
        test_label = generate_label(test_location, test_lower)
        model = create_classifier_model()
        model.fit(trainX, train_label)
        prediction_label = model.predict(testX)
        pred_combined = generate_combined_prediction(prediction_label, prediction_lower, prediction_upper)
        all_pred_combined.extend(pred_combined)
        all_observations.extend(testY)
        Y.extend(test_label)
        P.extend(prediction_label)

    rmse = rmseEval(all_observations, all_pred_combined)[1]
    accuracy = accuracy_score(Y, P)
    return rmse, accuracy
    timestampData2.append(str(int(v)))

# modelling
for location in locations:
    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, featureTW, "target", timestampData)
    print("\tT+W (on data without ATC) #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
    model.fit(trainX, trainY)
    prediction = model.predict(testX)
    rmse = rmseEval(testY, prediction)[1]
    print("\trmse: " + str(rmse))
    for i in range(0, len(testY)):
        timestamp = testTimestamp[i]
        value = prediction[i]
        TWpredictionData[str(location)][timestamp] = value

    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data2, featureTWAtc, "target", timestampData2)
    print("\tT+W+Atc #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
    model.fit(trainX, trainY)
    'winddirection', 'windspeed', 'temperature', 'rain', 'pressure'
]

for location in locations:
    print("location: " + str(location))

    # save down trainX, trainY, testX, testY
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, columns, "target")
    print("\t#train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
    model.fit(trainX, trainY)
    testPrediction = model.predict(testX)
    testRmse = str(rmseEval(testY, testPrediction)[1])
    print("\tRFR+All rmse: " + str(testRmse))

    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, columnsTW, "target")
    print("\t#train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
    model.fit(trainX, trainY)
    testPrediction = model.predict(testX)
    testRmse = str(rmseEval(testY, testPrediction)[1])
    print("\tRFR+TW rmse: " + str(testRmse))

    for sr in [0.95, 0.9, 0.85, 0.8, 0.75, 0.7]:
             all_features, trainX2)
writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testX.csv",
             all_features, testX)
writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_trainY.csv",
             ["target"], trainY2)
writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testY.csv",
             ["target"], testY)

for dataGroup in generateAllDataGroups():
    tag, features = getTagAndFeatures(dataGroup)
    trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled(
        location, "location", sampleRate, 42, data, features, "target")
    model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
    model.fit(trainX1, trainY1)
    trainPrediction = model.predict(trainX2)
    testPrediction = model.predict(testX)
    trainRmse = str(rmseEval(trainY2, trainPrediction)[1])
    testRmse = str(rmseEval(testY, testPrediction)[1])
    print("\t" + tag + ": #train: " + str(len(trainY2)) + ", #test:" + str(len(testY))
          + ", trainRMSE: " + trainRmse + ", testRMSE: " + testRmse)
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_trainPred_" + tag + ".csv",
                 ["trainPred_" + tag], trainPrediction)
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testPred_" + tag + ".csv",
                 ["testPred_" + tag], testPrediction)
output.write("location,timestamp,obs,pred_TW,pred_TWA,pred_combined,combined_uses_tw_twa\n")
output_log = open(OUTPUT_LOG_FILE, 'w')

for location in locations:
    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, tw_features, "target", timestampData)
    model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
    model.fit(trainX, trainY)
    testPredictionTW = model.predict(testX)
    rmse = str(rmseEval(testY, testPredictionTW)[1])
    log(output_log, "\tTW rmse: " + rmse)
    for x in testY:
        allObs.append(x)
    for x in testPredictionTW:
        allPredictionTW.append(x)

    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, twa_features, "target", timestampData)
    model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
    model.fit(trainX, trainY)
    testPredictionTWA = model.predict(testX)
    rmse = str(rmseEval(testY, testPredictionTWA)[1])
    log(output_log, "\tTWA rmse: " + rmse)