Пример #1
0
def eval_one(min_samples_leaf, n_estimators):
    """Cross-validate a random forest over the station groups.

    Every entry of the module-level ``groups`` is held out once as the test
    fold while the stations of all remaining groups form the training set.

    Args:
        min_samples_leaf: forwarded to ``RandomForestRegressor``.
        n_estimators: forwarded to ``RandomForestRegressor``.

    Returns:
        Element ``[1]`` of ``rmseEval`` computed over the observations and
        predictions of all folds pooled together.
    """
    log("min_samples_leaf: " + str(min_samples_leaf) + ", n_estimators: " +
        str(n_estimators))

    pooled_obs = []
    pooled_pred = []

    for held_out, held_out_stations in enumerate(groups):
        # Training stations: everything that is not in the held-out group.
        other_stations = []
        for idx, members in enumerate(groups):
            if idx != held_out:
                other_stations.extend(members)

        # Station identifiers are compared as floats by the split helper.
        train_ids = {float(s) for s in other_stations}
        test_ids = {float(s) for s in held_out_stations}

        trainX, testX, trainY, testY = splitDataForXValidation(
            train_ids, test_ids, "location", data, all_features, "target")

        forest = RandomForestRegressor(
            min_samples_leaf=min_samples_leaf,
            n_estimators=n_estimators,
            n_jobs=-1,
            random_state=42,
        )
        forest.fit(trainX, trainY)
        fold_pred = forest.predict(testX)

        fold_rmse = rmseEval(testY, fold_pred)[1]
        log("\tALL rmse: " + str(fold_rmse))

        pooled_obs.extend(testY)
        pooled_pred.extend(fold_pred)

    overall_rmse = rmseEval(pooled_obs, pooled_pred)[1]
    log("\tALL rmse:" + str(overall_rmse))
    return overall_rmse
Пример #2
0
def doPrediction(locations, data, columns, features, columns2, outputFileName):
    """Run leave-one-location-out prediction and write the results as CSV.

    For every entry of ``locations`` a random forest is trained on the
    training split returned by ``splitDataForXValidation`` and applied to the
    test split. The predictions, together with the pass-through columns taken
    from the helper's fifth return value, are collected and finally written
    to ``outputFileName``.

    Args:
        locations: iterable of location identifiers, each used once as the
            test location.
        data: data set handed to ``splitDataForXValidation``.
        columns: column list handed to ``splitDataForXValidation``.
        features: feature names handed to ``splitDataForXValidation``.
        columns2: names of the output columns, in output order. The name
            ``'prediction'`` is filled from the model output, every other
            name from the split helper's ``dataY``. ``'target'`` must be
            present because it drives the overall RMSE and the row count.
        outputFileName: path of the CSV file to (over)write.
    """
    predictionData = {c: [] for c in columns2}

    # modelling
    for location in locations:
        trainX, testX, trainY, testY, dataY = splitDataForXValidation(
            location, "location", data, features, columns, "target")
        print("\tT+W #train: " + str(len(trainY)) + ", #test:" +
              str(len(testY)))
        model = RandomForestRegressor(min_samples_leaf=2,
                                      n_estimators=650,
                                      n_jobs=-1,
                                      random_state=42)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        rmse = rmseEval(testY, prediction)[1]
        print("\trmse: " + str(rmse))

        for c in columns2:
            if c == 'prediction':
                predictionData[c].extend(prediction)
            else:
                predictionData[c].extend(dataY[c])

    for c in predictionData:
        print("\t" + c + " -> #" + str(len(predictionData[c])))

    rmse = rmseEval(predictionData['target'], predictionData['prediction'])[1]
    print("overall RMSE: " + str(rmse))

    print("Writing out results...")

    # The context manager closes the file even if a write fails part-way.
    with open(outputFileName, 'w') as output:
        output.write(','.join([str(x) for x in columns2]))
        output.write("\n")

        for i in range(len(predictionData['target'])):
            output.write(
                ','.join(str(predictionData[c][i]) for c in columns2))
            output.write("\n")

    print("Done...")
Пример #3
0
def eval_one(step):
    """Evaluate one feature-selection mask and return its pooled RMSE.

    Results are memoised in the module-level ``cached_results`` dict and
    appended to ``CACHE_FILE`` as ``<rmse>;<flag>,<flag>,...`` lines.

    Args:
        step: hashable sequence of flags, one per entry of ``all_features``;
            a truthy flag selects the feature at the same index.

    Returns:
        Element ``[1]`` of ``rmseEval`` over the pooled leave-one-location-out
        observations and predictions.
    """
    if step in cached_results:
        return cached_results[step]

    eval_features = [
        feature for i, feature in enumerate(all_features) if step[i]
    ]

    all_predictions = []
    all_observations = []

    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, eval_features, "target")
        model = RandomForestRegressor(min_samples_leaf=2,
                                      random_state=42,
                                      n_estimators=650,
                                      n_jobs=-1)
        model.fit(trainX, trainY)
        predictions = model.predict(testX)
        all_observations.extend(testY)
        all_predictions.extend(predictions)

    rmse = rmseEval(all_observations, all_predictions)[1]

    cached_results[step] = rmse

    # Persist the result; the context manager guarantees the handle is
    # closed even if the write raises.
    step_str = ",".join(str(s) for s in step)
    with open(CACHE_FILE, "a") as cache_output:
        cache_output.write(str(rmse) + ";" + step_str + "\n")

    return rmse
Пример #4
0
def evalOne(parameters):
    """Leave-one-location-out evaluation of a random forest configuration.

    Args:
        parameters: dict with ``"n_estimators"`` plus exactly one tree-size
            control. The first key found, in this order, is used:
            ``"depth"`` (-> ``max_depth``), ``"leaf"``
            (-> ``min_samples_leaf``), ``"max_leaf"`` (-> ``max_leaf_nodes``).

    Returns:
        Element ``[1]`` of ``rmseEval`` over the pooled observations and
        predictions of all locations.

    Raises:
        ValueError: if ``parameters`` holds none of the size-control keys.
            Previously this surfaced as an unbound ``model`` variable.
        KeyError: if ``"n_estimators"`` is missing.
    """
    # The configuration does not depend on the location, so resolve it once.
    if "depth" in parameters:
        size_control = {"max_depth": parameters["depth"]}
    elif "leaf" in parameters:
        size_control = {"min_samples_leaf": parameters["leaf"]}
    elif "max_leaf" in parameters:
        size_control = {"max_leaf_nodes": parameters["max_leaf"]}
    else:
        raise ValueError(
            "parameters must contain one of 'depth', 'leaf' or 'max_leaf'")
    n_estimators = parameters["n_estimators"]

    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        model = RandomForestRegressor(random_state=42,
                                      n_estimators=n_estimators,
                                      n_jobs=-1,
                                      **size_control)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        all_obs.extend(testY)
        all_pred.extend(prediction)
    return rmseEval(all_obs, all_pred)[1]
Пример #5
0
def evalTrainStationTestStation(trainStation, testStation, features):
    """Train on one station's data, score on another station's data.

    Args:
        trainStation: key into ``dataByStation`` used for fitting.
        testStation: key into ``dataByStation`` used for scoring.
        features: feature names handed to ``splitDataForXValidation``.

    Returns:
        Element ``[1]`` of ``rmseEval`` for the test station.
    """
    # Only the training half of the first split is needed ...
    trainX, _, trainY, _ = splitDataForXValidation(
        {trainStation}, set(), "location",
        dataByStation[trainStation], features, "target")
    # ... and only the test half of the second one.
    _, station_testX, _, station_testY = splitDataForXValidation(
        set(), {testStation}, "location",
        dataByStation[testStation], features, "target")

    forest = RandomForestRegressor(
        max_depth=10,
        n_estimators=60,
        n_jobs=-1,
        random_state=42,
    )
    forest.fit(trainX, trainY)

    rmse = rmseEval(station_testY, forest.predict(station_testX))[1]
    message = ("Training on station " + str(trainStation) +
               ", applying on station " + str(testStation) +
               ": rmse: " + str(rmse))
    print(message)
    return rmse
Пример #6
0
def evalOne(parameters):
    """Leave-one-location-out evaluation of a neural-network regressor.

    Inputs and targets are standardised with scalers fitted on the training
    split only; predictions and test targets are mapped back to the original
    scale before scoring.

    Args:
        parameters: dict with the keys ``"hidden_layers"`` (number of hidden
            layers), ``"hidden_type"`` and ``"hidden_neurons"`` (passed to
            ``Layer``), ``"learning_rate"`` and ``"iteration"`` (passed to
            ``Regressor`` as ``learning_rate`` / ``n_iter``).

    Returns:
        Element ``[1]`` of ``rmseEval`` over the pooled observations and
        predictions of all locations.
    """
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        normalizer_X = StandardScaler()
        trainX = normalizer_X.fit_transform(trainX)
        testX = normalizer_X.transform(testX)
        normalizer_Y = StandardScaler()
        trainY = normalizer_Y.fit_transform(trainY)
        testY = normalizer_Y.transform(testY)

        layers = [
            Layer(parameters["hidden_type"],
                  units=parameters["hidden_neurons"])
            for _ in range(parameters["hidden_layers"])
        ]
        layers.append(Layer("Linear"))
        model = Regressor(layers=layers,
                          learning_rate=parameters["learning_rate"],
                          n_iter=parameters["iteration"],
                          random_state=42)

        # Fit exactly once. The block used to call fit() twice in a row on
        # the same data, which trained the network a second time.
        model.fit(np.array(trainX), np.array(trainY))

        prediction = model.predict(testX)
        prediction = normalizer_Y.inverse_transform(prediction)
        testY = normalizer_Y.inverse_transform(testY)

        print("location: " + str(location) + " -> " +
              str(rmseEval(prediction, testY)[1]))

        all_obs.extend(testY)
        all_pred.extend(prediction)

    return rmseEval(all_obs, all_pred)[1]
Пример #7
0
def rf(week, timestampWeekCategory, stationNames, ospmData2013, ospmData2014,
       data2013, data2014):
    """Train a random forest on late-2013 rows and score it on all of 2014.

    Args:
        week: minimum week category; only 2013 rows whose category (looked up
            in ``timestampWeekCategory``) is ``>= week`` are used for fitting.
        timestampWeekCategory: mapping from ``str(int(timestamp))`` to a week
            category convertible with ``int``.
        stationNames: not used by this function.
        ospmData2013: not used by this function.
        ospmData2014: not used by this function.
        data2013: column-name -> list mapping used for training; must contain
            ``"location"``, ``"timestamp"`` and ``"target"``.
        data2014: column-name -> list mapping used for testing.

    Returns:
        The complete result of ``rmseEval`` (not just one element of it).
    """
    # Every column except the bookkeeping ones is a model input.
    feature_names = list(data2013)
    for excluded in ("location", "timestamp", "target"):
        feature_names.remove(excluded)

    # Row indices of the 2013 data that fall into the requested weeks.
    train_rows = [
        row for row in range(len(data2013["target"]))
        if int(timestampWeekCategory[str(int(data2013["timestamp"][row]))])
        >= week
    ]
    train_y = [data2013["target"][row] for row in train_rows]
    train_X = [[data2013[name][row] for name in feature_names]
               for row in train_rows]

    forest = RandomForestRegressor(
        min_samples_leaf=9,
        n_estimators=59,
        n_jobs=-1,
        random_state=42,
    )
    forest.fit(train_X, train_y)

    # The whole of 2014 is the test set.
    test_rows = range(len(data2014["target"]))
    test_y = [data2014["target"][row] for row in test_rows]
    test_X = [[data2014[name][row] for name in feature_names]
              for row in test_rows]

    return rmseEval(test_y, forest.predict(test_X))
Пример #8
0
def evalOne(parameters):
    """Leave-one-location-out evaluation of a k-nearest-neighbours regressor.

    Args:
        parameters: dict with the keys ``"weights"``, ``"neighbors"`` and
            ``"p"``, forwarded to ``KNeighborsRegressor`` as ``weights``,
            ``n_neighbors`` and ``p``.

    Returns:
        Element ``[1]`` of ``rmseEval`` over the pooled observations and
        predictions of all locations.
    """
    pooled_obs, pooled_pred = [], []

    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")

        knn = KNeighborsRegressor(
            weights=parameters["weights"],
            n_neighbors=parameters["neighbors"],
            p=parameters["p"],
        )
        knn.fit(trainX, trainY)

        pooled_obs.extend(testY)
        pooled_pred.extend(knn.predict(testX))

    return rmseEval(pooled_obs, pooled_pred)[1]
Пример #9
0
def eval_one(features):
    """Leave-one-location-out evaluation of a random forest on ``features``.

    Args:
        features: feature names handed to ``splitDataForXValidation``.

    Returns:
        Element ``[1]`` of ``rmseEval`` over the pooled observations and
        predictions of all locations. The value is also logged. Earlier the
        function only logged it and returned ``None``; returning it lets
        callers compare feature sets programmatically.
    """
    all_predictions = []
    all_observations = []

    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, features, "target")
        model = RandomForestRegressor(min_samples_leaf=2,
                                      random_state=42,
                                      n_estimators=650,
                                      n_jobs=-1)
        model.fit(trainX, trainY)
        predictions = model.predict(testX)
        all_observations.extend(testY)
        all_predictions.extend(predictions)

    rmse = rmseEval(all_observations, all_predictions)[1]
    log("\tRMSE: " + str(rmse))
    return rmse
Пример #10
0
def evalOne(parameters):
    """Leave-one-location-out evaluation of a bagged RBF-kernel SVR.

    Inputs and targets are standardised with scalers fitted on the training
    split; predictions and test targets are transformed back before scoring.

    Args:
        parameters: dict with the keys ``"C"`` (SVR penalty),
            ``"max_samples"`` and ``"n_estimators"`` (bagging settings).

    Returns:
        Element ``[1]`` of ``rmseEval`` over the pooled observations and
        predictions of all locations.
    """
    pooled_obs = []
    pooled_pred = []

    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")

        x_scaler = StandardScaler()
        trainX = x_scaler.fit_transform(trainX)
        testX = x_scaler.transform(testX)

        y_scaler = StandardScaler()
        trainY = y_scaler.fit_transform(trainY)
        testY = y_scaler.transform(testY)

        svr = SVR(kernel='rbf', C=parameters["C"], cache_size=5000)
        ensemble = BaggingRegressor(
            base_estimator=svr,
            max_samples=parameters["max_samples"],
            n_estimators=parameters["n_estimators"],
            verbose=0,
            n_jobs=-1,
        )
        ensemble.fit(trainX, trainY)

        # Score in the original target units, not in standardised units.
        fold_pred = y_scaler.inverse_transform(ensemble.predict(testX))
        fold_obs = y_scaler.inverse_transform(testY)

        pooled_obs.extend(fold_obs)
        pooled_pred.extend(fold_pred)

    return rmseEval(pooled_obs, pooled_pred)[1]
Пример #11
0
def evaluateFeatures(vector, features, data):
    """Score a feature subset by its mean per-location RMSE.

    Args:
        vector: sequence of flags; positions equal to ``1`` select the
            feature at the same index of ``features``.
        features: candidate feature names.
        data: data set handed to ``splitDataForXValidation2``.

    Returns:
        Arithmetic mean of element ``[1]`` of ``rmseEval`` across all entries
        of the module-level ``locationValues``.
    """
    selected = [features[i] for i, flag in enumerate(vector) if flag == 1]

    # One RMSE per held-out location.
    fold_rmses = []
    for location in locationValues:
        trainX, testX, trainY, testY = splitDataForXValidation2(
            location, "location", data, selected, "target")

        forest = RandomForestRegressor(
            min_samples_leaf=9,
            n_estimators=59,
            n_jobs=-1,
            random_state=42,
        )
        forest.fit(trainX, trainY)

        fold_rmses.append(rmseEval(testY, forest.predict(testX))[1])

    # Unweighted average over the folds.
    return sum(fold_rmses, 0.0) / len(fold_rmses)
Пример #12
0
# Pooled predictions of the two feature sets across all held-out locations.
allPredictionsTW = []
allPredictionsTWA = []

# Leave-one-location-out loop: each location is the test fold once.
for location in locations:
    # All other locations (only logged in the part of the loop visible here).
    location2s = [l for l in locations if l != location]

    log("Location: " + str(location) + ", location2: " + str(location2s))

    # Model on the TW feature set (tw_features).
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, tw_features, "target")
    # Observations are collected once per fold, from the TW split.
    allObservations.extend(testY)
    model = create_model()
    model.fit(trainX, trainY)
    predictionTW = model.predict(testX)
    rmse = rmseEval(testY, predictionTW)[1]
    log("\tTW:" + str(rmse))
    allPredictionsTW.extend(predictionTW)

    # Model on the TWA feature set (twa_features).
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, twa_features, "target")
    model = create_model()
    model.fit(trainX, trainY)
    predictionTWA = model.predict(testX)
    rmse = rmseEval(testY, predictionTWA)[1]
    log("\tTWA:" + str(rmse))
    allPredictionsTWA.extend(predictionTWA)

    # Combination of the two models (the rest of this section is not shown).
    classifier_X = []
Пример #13
0
# Leave-one-location-out evaluation of an ordinary least-squares model.
columns = []
loadData("/data/york_hour_2013.csv", ["timestamp", "atc"], data, columns)

print(str(columns))

# Every loaded column except the target and the location id is a feature.
all_features = deepcopy(columns)
all_features.remove("target")
all_features.remove("location")

# Dropping these two columns keeps the RMSE from exploding (it was on the
# order of 10000000.0 with them included).
all_features.remove('buildings_area')
all_features.remove('leisure_area')

all_obs = []
all_pred = []

for location in locations:
    print("Location: " + str(location))
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, all_features, "target")
    # Hyper-parameters are spelled out by keyword: the former positional call
    # LinearRegression(True, True, True, -1) was unreadable and is rejected by
    # scikit-learn releases that make these arguments keyword-only.
    model = linear_model.LinearRegression(fit_intercept=True,
                                          normalize=True,
                                          copy_X=True,
                                          n_jobs=-1)
    model.fit(trainX, trainY)
    prediction = model.predict(testX)
    rmse = str(rmseEval(testY, prediction)[1])
    print("\tRmse:" + rmse)
    all_obs.extend(testY)
    all_pred.extend(prediction)

print("Overall:")
rmse = str(rmseEval(all_obs, all_pred)[1])
print("Rmse:" + rmse)
Пример #14
0
def eval_one(step):
    """Evaluate one classifier feature mask for the lower/upper combined model.

    Per cross-validation group, two regressors are trained on ``tw_features``
    (one on the first half of the training stations, one on the second half).
    A classifier trained on the features selected by ``step`` then produces a
    label per test row, and ``generate_combined_prediction`` merges the two
    regressors' predictions using those labels.

    Results are memoised in ``cached_results`` and appended to ``CACHE_FILE``
    as ``<rmse>;<flag>,<flag>,...`` lines.

    Args:
        step: hashable sequence of flags, one per entry of ``all_features``;
            a truthy flag selects the feature at the same index.

    Returns:
        Element ``[1]`` of ``rmseEval`` over the pooled observations and
        combined predictions of all groups.
    """
    if step in cached_results:
        return cached_results[step]

    eval_features = [
        feature for i, feature in enumerate(all_features) if step[i]
    ]

    all_observations = []
    all_pred_combined = []

    for group in range(len(groups)):

        train_stations, test_stations = generate_train_test_station_list(
            group, groups)
        train_station_set = {float(s) for s in train_stations}
        test_station_set = {float(s) for s in test_stations}

        # "Lower" is the first half of the station list and receives the
        # middle element when the count is odd; "upper" is the remainder.
        train_mid = (len(train_stations) + 1) // 2
        train_lower = [float(s) for s in train_stations[:train_mid]]
        train_lower_set = set(train_lower)
        train_upper_set = {float(s) for s in train_stations[train_mid:]}
        test_mid = (len(test_stations) + 1) // 2
        test_lower = [float(s) for s in test_stations[:test_mid]]

        # tw_lower
        trainX, testX, trainY, testY = splitDataForXValidation(
            train_lower_set, test_station_set, "location", data, tw_features,
            "target")
        model = create_model()
        model.fit(trainX, trainY)
        prediction_lower = model.predict(testX)

        # tw_upper
        trainX, testX, trainY, testY = splitDataForXValidation(
            train_upper_set, test_station_set, "location", data, tw_features,
            "target")
        model = create_model()
        model.fit(trainX, trainY)
        prediction_upper = model.predict(testX)

        # Classifier that chooses between the lower and upper regressor.
        trainX, testX, trainY, testY, train_location, test_location = splitDataForXValidationWithLocation(
            train_station_set, test_station_set, "location", data,
            eval_features, "target")
        train_label = generate_label(train_location, train_lower)
        # NOTE(review): test_label is computed but never read below.
        test_label = generate_label(test_location, test_lower)

        model = create_classifier_model()
        model.fit(trainX, train_label)
        prediction_label = model.predict(testX)

        pred_combined = generate_combined_prediction(prediction_label,
                                                     prediction_lower,
                                                     prediction_upper)
        all_pred_combined.extend(pred_combined)
        all_observations.extend(testY)

    rmse = rmseEval(all_observations, all_pred_combined)[1]

    cached_results[step] = rmse

    # Persist the result; the context manager guarantees the handle is
    # closed even if the write raises.
    step_str = ",".join(str(s) for s in step)
    with open(CACHE_FILE, "a") as cache_output:
        cache_output.write(str(rmse) + ";" + step_str + "\n")

    return rmse
Пример #15
0
    # Fragment of a cross-validation fold: the enclosing loop/function header
    # and the definitions of trainStations/testStations are not shown here.
    log("\ttrainStations: " + str(trainStations))
    log("\ttestStations: " + str(testStations))

    # Station identifiers are converted to floats before being passed to the
    # split helper.
    train_station_set = set([float(s) for s in trainStations])
    test_station_set = set([float(s) for s in testStations])

    # Model 1: random forest on the TW feature set (tw_features).
    trainX, testX, trainY, testY = splitDataForXValidation(
        train_station_set, test_station_set, "location", data, tw_features,
        "target")
    model = RandomForestRegressor(min_samples_leaf=29,
                                  n_estimators=64,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    prediction_TW = model.predict(testX)
    rmse = rmseEval(testY, prediction_TW)[1]
    log("\tTW rmse: " + str(rmse))
    # Observations are pooled once per fold, from the TW split.
    all_observations.extend(testY)
    all_pred_TW.extend(prediction_TW)

    # Model 2: same forest settings on the TWA feature set (twa_features).
    trainX, testX, trainY, testY = splitDataForXValidation(
        train_station_set, test_station_set, "location", data, twa_features,
        "target")
    model = RandomForestRegressor(min_samples_leaf=29,
                                  n_estimators=64,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    prediction_TWA = model.predict(testX)
    rmse = rmseEval(testY, prediction_TWA)[1]
    log("\tTWA rmse: " + str(rmse))
Пример #16
0
# Per-model error analysis. `data[m]` holds the 'target' and 'prediction'
# columns for model m; this excerpt is cut off inside the filtering loop.
errorsTimestamps = {}
# Sorted absolute errors per model.
eData = {}

for m in models:

    # Per-location counter, initialised to zero for every location.
    errors = {}
    for l in locations:
        errors[l] = 0

    errorsTimestamps[m] = set()
    records = len(data[m]['target'])

    print("Overall:")

    print("\t" + "#records: " + str(records))
    rmse = rmseEval(data[m]['target'], data[m]['prediction'])[1]
    print("\t" + "rmse: " + str(rmse))

    # ae() is sorted in place, so eData[m] ends up in ascending order.
    absoluteError = ae(data[m]['target'], data[m]['prediction'])
    absoluteError.sort()

    eData[m] = absoluteError

    # Build a copy of the data that keeps only the records whose absolute
    # error is below 20.0.

    data2 = {}
    for c in columns[m]:
        data2[c] = []
    for i in range(0, records):
        if abs(data[m]['target'][i] - data[m]['prediction'][i]) < 20.0:
            for c in columns[m]:
Пример #17
0
    # Fragment of a cross-validation fold: trainStations, testStationList,
    # obs and all_pred_TW are defined outside the part shown here.
    # Training stations in the order in which they appear in all_stations.
    trainStationList = [s for s in all_stations if float(s) in trainStations]
    log(output_log, "\ttrainStationList:" + str(trainStationList))

    testStations = set(float(station) for station in testStationList)

    # Baseline: random forest on the TW feature set (features_TW).
    trainX, testX, trainY, testY = splitDataForXValidation(
        trainStations, testStations, "location", data, features_TW, "target")
    log(output_log,
        "\tTW #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    prediction_TW = model.predict(testX)
    rmse = rmseEval(testY, prediction_TW)[1]
    log(output_log, "\trmse: " + str(rmse))
    obs.extend(testY)
    all_pred_TW.extend(prediction_TW)

    # Same split again, this time over `columns` and with the location of
    # every row returned alongside the data.
    trainX, testX, trainY, testY, trainLocation, testLocation = splitDataForXValidationWithLocation(
        trainStations, testStations, "location", data, columns, "target")

    # Split the ordered training stations into two halves by position; with
    # an odd count the middle station goes to the lower half.
    train_lower = [
        float(trainStationList[i]) for i in range(0, len(trainStationList))
        if i < (len(trainStationList) / 2.0)
    ]
    train_upper = [
        float(trainStationList[i]) for i in range(0, len(trainStationList))
        if i >= (len(trainStationList) / 2.0)
    ]
Пример #18
0
        predictionsNormal[method].append(p)
        observationsNormal[method].append(o)

# Containers for per-level metrics; none of them is filled in the part of
# the script shown here.
rmseLevels = {}
maeLevels = {}
rLevels = {}
fac2Levels = {}
nmseLevels = {}
fbLevels = {}
rsLevels = {}
mgLevels = {}
vgLevels = {}

# Print a set of evaluation metrics for every method, computed from the
# observations[method] / predictions[method] pairs.
for method in methods:
    print("Method: " + method)
    rmse = rmseEval(observations[method], predictions[method])[1]
    print("\trmse: " + str(rmse))
    mae = maeEval(observations[method], predictions[method])[1]
    print("\tmae: " + str(mae))
    r = correlationEval(observations[method], predictions[method])[1]
    print("\tr: " + str(r))
    # r2 is printed twice under the same label: first from the project's
    # rsquaredEval helper, then from r2_score.
    print("\tr2: " +
          str(rsquaredEval(observations[method], predictions[method])[1]))
    print("\tr2: " + str(r2_score(observations[method], predictions[method])))
    # fac2Eval, mgEval and nmse_from_paper results are printed as returned,
    # without the [1] indexing used for the other helpers.
    fac2 = fac2Eval(observations[method], predictions[method])
    print("\tfac2: " + str(fac2))
    print("\tmg: " + str(mgEval(observations[method], predictions[method])))
    nmse = nmse_from_paper(observations[method], predictions[method])
    print("\tnmse: " + str(nmse))
    fb = fbEval(observations[method], predictions[method])[1]
    print("\tfb: " + str(fb))
Пример #19
0
def doEval(landuse, topo, traffic_static, traffic_dynamic, weather, time,
           output):
    """Evaluate one combination of feature groups and append it to ``output``.

    The six flags switch feature groups on or off. The combination is encoded
    into a name such as ``lu1to0ts1td0we1ti1`` and the pooled
    leave-one-location-out RMSE is written as ``<name>,<rmse>`` to ``output``.

    Args:
        landuse: include 'leisure_area' and 'landuse_area'.
        topo: include 'buildings_number' and 'buildings_area'.
        traffic_static: include 'lane_length' and 'length'.
        traffic_dynamic: include the three 'traffic_length_*' columns.
        weather: include the five weather columns.
        time: include the five calendar columns.
        output: writable, flushable text stream.

    Returns:
        None. Nothing is written when every flag equals ``False``.
    """
    switches = (
        ("lu", landuse),
        ("to", topo),
        ("ts", traffic_static),
        ("td", traffic_dynamic),
        ("we", weather),
        ("ti", time),
    )

    # Equality comparisons (rather than truthiness) are used deliberately so
    # that non-bool arguments are treated exactly as before.
    if all(flag == False for _, flag in switches):
        return

    groupName = "".join(
        tag + ("1" if flag == True else "0") for tag, flag in switches)

    print("Group: " + groupName)

    feature_groups = (
        (landuse, ['leisure_area', 'landuse_area']),
        (topo, ['buildings_number', 'buildings_area']),
        (traffic_static, ['lane_length', 'length']),
        (traffic_dynamic, ['traffic_length_car', 'traffic_length_lgv',
                           'traffic_length_hgv']),
        (weather, ['winddirection', 'windspeed', 'temperature', 'rain',
                   'pressure']),
        (time, ['hour', 'day_of_week', 'month', 'bank_holiday', 'race_day']),
    )
    features = []
    for enabled, names in feature_groups:
        if enabled:
            features.extend(names)

    pooled_obs = []
    pooled_pred = []

    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, features, "target")
        forest = RandomForestRegressor(
            min_samples_leaf=2,
            random_state=42,
            n_estimators=650,
            n_jobs=-1,
        )
        forest.fit(trainX, trainY)
        pooled_obs.extend(testY)
        pooled_pred.extend(forest.predict(testX))

    rmse = rmseEval(pooled_obs, pooled_pred)[1]
    output.write(str(groupName) + "," + str(rmse) + "\n")
    output.flush()
Пример #20
0
def doEval(landuse, topo, traffic_static, traffic_dynamic, weather, time,
           output):
    """Evaluate one combination of column groups, one output line per location.

    The six flags switch column groups on or off. Columns of disabled groups
    (plus 'timestamp') are skipped when ``DATA_FILE`` is loaded. For every
    location a random forest is trained and ``<name>,<rmse>`` is written to
    ``output``, where the name encodes the flags (e.g. ``lu1to0ts1td0we1ti1``).

    Args:
        landuse: keep 'leisure_area' and 'landuse_area'.
        topo: keep 'buildings_number' and 'buildings_area'.
        traffic_static: keep 'lane_length' and 'length'.
        traffic_dynamic: keep 'atc'.
        weather: keep the five weather columns.
        time: keep the five calendar columns.
        output: writable text stream.

    Returns:
        None. Nothing is loaded or written when every flag equals ``False``.
    """
    switches = (
        ("lu", landuse),
        ("to", topo),
        ("ts", traffic_static),
        ("td", traffic_dynamic),
        ("we", weather),
        ("ti", time),
    )

    # Equality comparisons (rather than truthiness) are used deliberately so
    # that non-bool arguments are treated exactly as before.
    if all(flag == False for _, flag in switches):
        return

    groupName = "".join(
        tag + ("1" if flag == True else "0") for tag, flag in switches)

    print("Group: " + groupName)

    # Columns that are dropped when the owning group is switched off.
    group_columns = (
        (landuse, ['leisure_area', 'landuse_area']),
        (topo, ['buildings_number', 'buildings_area']),
        (traffic_static, ['lane_length', 'length']),
        (traffic_dynamic, ['atc']),
        (weather, ['winddirection', 'windspeed', 'temperature', 'rain',
                   'pressure']),
        (time, ['hour', 'day_of_week', 'month', 'bank_holiday', 'race_day']),
    )
    columnsToSkip = ['timestamp']
    for flag, names in group_columns:
        if flag == False:
            columnsToSkip.extend(names)

    columns = []
    data = {}
    loadData(DATA_FILE, columnsToSkip, data, columns)

    # One model and one output line per held-out location.
    for location in locations:

        print("Location: " + str(location))

        trainX, testX, trainY, testY = splitDataForXValidation1(
            location, "location", data, columns, "target")
        print("\tRFR #train: " + str(len(trainY)) + ", #test:" +
              str(len(testY)))

        forest = RandomForestRegressor(
            min_samples_leaf=9,
            n_estimators=59,
            n_jobs=-1,
            random_state=42,
        )
        forest.fit(trainX, trainY)

        rmse = rmseEval(testY, forest.predict(testX))[1]
        print("\trmse: " + str(rmse))
        output.write(str(groupName) + "," + str(rmse) + "\n")
Пример #21
0
def evalColumns(columns):
    """Score a column mask for the model-selection classifier.

    For every location the pre-computed train/test matrices and the
    predictions of the ``top4tags`` models are loaded from
    ``INPUT_DIRECTORY``. Each row is extended with those predictions, reduced
    to the columns enabled in ``columns``, and a classifier is trained to
    name the model with the smallest absolute training error. On the test
    rows the prediction of the model chosen by the classifier is used.

    Args:
        columns: sequence of flags, one per entry of ``all_columns``; a truthy
            flag keeps the value at that position of every row.

    Returns:
        Element ``[1]`` of ``rmseEval`` over the pooled selected predictions
        and test targets of all locations.
    """

    def keep_selected(rows):
        # Project every row onto the positions enabled in `columns`.
        return [[row[i] for i in range(len(all_columns)) if columns[i]]
                for row in rows]

    overallY = []
    overallPred = []

    for location in locations:
        prefix = INPUT_DIRECTORY + "z_" + str(int(location))

        # --- training data: features plus one prediction column per tag ---
        trainX = loadX(prefix + "_trainX.csv", all_features)
        trainY = loadSingleColumnsFile(prefix + "_trainY.csv")
        trainPreds = []
        for tag in top4tags:
            tag_pred = loadSingleColumnsFile(
                prefix + "_trainPred_" + tag + ".csv")
            for row, value in enumerate(tag_pred):
                trainX[row].append(value)
            trainPreds.append(tag_pred)

        # Label of a row = index of the model with the smallest absolute
        # error; on ties the earlier model wins.
        labelY = []
        for row in range(len(trainY)):
            winner = 0
            smallest = abs(trainY[row] - trainPreds[0][row])
            for candidate in range(1, len(top4tags)):
                err = abs(trainY[row] - trainPreds[candidate][row])
                if err < smallest:
                    smallest = err
                    winner = candidate
            labelY.append(winner)

        selector = RandomForestClassifier(random_state=42, n_estimators=100,
                                          max_depth=15)
        selector.fit(keep_selected(trainX), labelY)

        # --- test data, prepared exactly like the training data ---
        testX = loadX(prefix + "_testX.csv", all_features)
        testY = loadSingleColumnsFile(prefix + "_testY.csv")
        testPreds = []
        for tag in top4tags:
            tag_pred = loadSingleColumnsFile(
                prefix + "_testPred_" + tag + ".csv")
            for row, value in enumerate(tag_pred):
                testX[row].append(value)
            testPreds.append(tag_pred)

        chosen = selector.predict(keep_selected(testX))

        # Take, per row, the prediction of the model the classifier picked.
        prediction = [testPreds[chosen[row]][row]
                      for row in range(len(chosen))]

        overallY = overallY + testY
        overallPred = overallPred + prediction

    return rmseEval(overallPred, overallY)[1]
Пример #22
0
    # Fragment of a cross-validation fold: trainStations, testStationList and
    # the initial trainStationList are defined outside the part shown here.
    # The list is printed before and after being rebuilt in the order of
    # all_stations.
    print("\ttrainStationList:" + str(trainStationList))
    trainStationList = [s for s in all_stations if float(s) in trainStations]
    print("\ttrainStationList:" + str(trainStationList))

    testStations = set(float(station) for station in testStationList)

    # Random forest on the TW feature set (features_TW).
    trainX, testX, trainY, testY = splitDataForXValidation(
        trainStations, testStations, "location", data, features_TW, "target")
    print("\tTW #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    prediction = model.predict(testX)
    rmse = rmseEval(testY, prediction)[1]
    print("\trmse: " + str(rmse))
    # Disabled code: the same evaluation for features_TWA and features_ALL.
    #
    #     trainX, testX, trainY, testY = splitDataForXValidation(trainStations, testStations, "location", data, features_TWA, "target")
    #     print("\tTWA #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    #     model = RandomForestRegressor(min_samples_leaf = 9, n_estimators = 59, n_jobs = -1, random_state=42)
    #     model.fit(trainX, trainY)
    #     prediction = model.predict(testX)
    #     rmse = rmseEval(testY, prediction)[1]
    #     print("\trmse: " + str(rmse))
    #
    #     trainX, testX, trainY, testY = splitDataForXValidation(trainStations, testStations, "location", data, features_ALL, "target")
    #     print("\tALL #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    #     model = RandomForestRegressor(min_samples_leaf = 9, n_estimators = 59, n_jobs = -1, random_state=42)
    #     model.fit(trainX, trainY)
    #     prediction = model.predict(testX)
def eval_one(step):
    """Score one feature mask for the classifier that picks between two regressors.

    ``step`` is read position by position alongside the module-level
    ``all_features``; every position holding a truthy value keeps the matching
    feature as classifier input.

    For each index of ``groups`` two RandomForestRegressor models are fitted
    (one on ``tw_features``, one on ``twa_features``).  A classifier from
    ``create_classifier_model()`` is then trained on labels produced by
    ``generate_label`` over an inner pass across the remaining groups, and its
    predictions are turned into a combined prediction via
    ``generate_combined_prediction``.

    Returns:
        ``(rmse, accuracy)`` where ``rmse`` is element ``[1]`` of
        ``rmseEval`` over the pooled observations and combined predictions,
        and ``accuracy`` is ``calculate_accuracy`` over the pooled labels.
    """

    def as_station_set(stations):
        # Station identifiers are compared as floats everywhere in this function.
        return {float(station) for station in stations}

    def fit_and_predict(train_set, test_set, features):
        # Fit a fresh regressor on the train split; hand back the test targets
        # together with the predictions made for the test rows.
        x_train, x_test, y_train, y_test = splitDataForXValidation(
            train_set, test_set, "location", data, features, "target")
        regressor = RandomForestRegressor(
            min_samples_leaf=29, n_estimators=64, n_jobs=-1, random_state=42)
        regressor.fit(x_train, y_train)
        return y_test, regressor.predict(x_test)

    eval_features = [
        feature for position, feature in enumerate(all_features)
        if step[position]
    ]

    observed = []
    combined_all = []
    true_labels = []
    predicted_labels = []

    for fold in range(len(groups)):
        train_ids, test_ids = generate_train_test_station_list(fold, groups)
        train_set = as_station_set(train_ids)
        test_set = as_station_set(test_ids)

        # Outer-fold regressors; the per-fold score is computed but not used
        # any further.
        fold_y, prediction_TW = fit_and_predict(
            train_set, test_set, tw_features)
        fold_rmse = rmseEval(fold_y, prediction_TW)[1]
        observed.extend(fold_y)

        fold_y, prediction_TWA = fit_and_predict(
            train_set, test_set, twa_features)
        fold_rmse = rmseEval(fold_y, prediction_TWA)[1]

        # Inner pass over the groups that are not the current fold: collect
        # classifier inputs and the labels derived from both regressors.
        remaining = [groups[k] for k in range(len(groups)) if k != fold]
        selector_X = []
        selector_Y = []
        for inner in range(len(remaining)):
            train_ids, test_ids = generate_train_test_station_list(
                inner, remaining)
            train_set = as_station_set(train_ids)
            test_set = as_station_set(test_ids)

            _, inner_TW = fit_and_predict(train_set, test_set, tw_features)
            _, inner_TWA = fit_and_predict(train_set, test_set, twa_features)

            _, inner_X, _, inner_y = splitDataForXValidation(
                train_set, test_set, "location", data, eval_features,
                "target")
            selector_X.extend(inner_X)
            selector_Y.extend(generate_label(inner_y, inner_TW, inner_TWA))

        # The inner pass rebound the station sets, so derive the outer split
        # for this fold again before scoring the classifier.
        train_ids, test_ids = generate_train_test_station_list(fold, groups)
        train_set = as_station_set(train_ids)
        test_set = as_station_set(test_ids)

        selector = create_classifier_model()
        selector.fit(selector_X, selector_Y)
        _, fold_X, _, fold_y = splitDataForXValidation(
            train_set, test_set, "location", data, eval_features, "target")

        selector_choice = selector.predict(fold_X)
        true_labels.extend(
            generate_label(fold_y, prediction_TW, prediction_TWA))
        predicted_labels.extend(selector_choice)

        combined = generate_combined_prediction(
            selector_choice, prediction_TW, prediction_TWA)
        fold_rmse = rmseEval(fold_y, combined)[1]
        combined_all.extend(combined)

    rmse = rmseEval(observed, combined_all)[1]
    accuracy = calculate_accuracy(true_labels, predicted_labels)

    return rmse, accuracy
Пример #24
0
def evalColumns(columns):
    """Score a column mask for the classifier that selects a regressor per sample.

    For every entry of the module-level ``locations`` the function fits one
    RandomForestRegressor per data group in ``topDatagroups``, labels each
    training sample with the index (into ``topTags``) of the tag whose
    prediction had the smallest absolute error, trains a
    RandomForestClassifier on those labels, and uses the classifier's choice
    to pick the final prediction for each test row.

    Args:
        columns: indexable mask read at positions ``0 .. len(all_columns) - 1``;
            a truthy entry keeps the column at that position in the
            classifier input.

    Returns:
        Element ``[1]`` of ``rmseEval(overallPred, overallY)`` computed over
        the final predictions and targets pooled across all locations.
    """

    # Targets and final predictions pooled over every location.
    overallY = []
    overallPred = []

    for location in locations:
        # Every other location; each is passed to the sampled split helper below.
        location2s = [l for l in locations if l != location]
        
        print("Location: " + str(location) + ", location2: " + str(location2s))
        
        # Test-set predictions of one regressor per data group, keyed by tag.
        # presumably the split helper holds out `location` as the test set — TODO confirm
        testPreds = {}
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            trainX, testX, trainY, testY = splitDataForXValidation(location, "location", data, features, "target")
                
            model = RandomForestRegressor(min_samples_leaf = 9, n_estimators = 59, n_jobs = -1, random_state=42)                    
            model.fit(trainX, trainY)
            prediction = model.predict(testX)
            testPreds[tag] = prediction
          
        # Per tag: predictions on the second training part (trainX2) of every
        # (location, location2) split, concatenated in location2s order.
        trainPreds = defaultdict(list)
          
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            print("\ttag: " + str(tag) + ", features: " + str(features))
            for location2 in location2s:
                trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(location, location2, "location", data, features, "target")
                model = RandomForestRegressor(min_samples_leaf = 9, n_estimators = 59, n_jobs = -1, random_state=42)                    
                # The regressor is fitted on part 1 only; part 2 and the test
                # rows are just predicted.
                model.fit(trainX1, trainY1)
                train1Prediction = model.predict(trainX1)
                train2Prediction = model.predict(trainX2)
                testPrediction = model.predict(testX)
                train1Rmse = str(rmseEval(trainY1, train1Prediction)[1])
                train2Rmse = str(rmseEval(trainY2, train2Prediction)[1])
                testRmse = str(rmseEval(testY, testPrediction)[1])
                print("\t\ttrain1 rmse: " + train1Rmse)
                print("\t\ttrain2 rmse: " + train2Rmse)
                print("\t\ttest rmse: " + testRmse)
                for x in train2Prediction:
                    trainPreds[tag].append(x)

        # Targets of the second training part, concatenated in the same
        # location2s order as trainPreds.
        # NOTE(review): alignment with trainPreds relies on the sampled split
        # returning the same rows for every feature list — confirm.
        combinedTrain2Y = []        
        for location2 in location2s:
            trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(location, location2, "location", data, all_features, "target")
            combinedTrain2Y = combinedTrain2Y + trainY2
          
        # Label each sample with the index into topTags of the tag whose
        # prediction has the smallest absolute error (ties keep the earliest tag).
        # assumes topTags holds the tags produced for topDatagroups — TODO confirm
        labelTrain2Y = []
        for i in range(0, len(combinedTrain2Y)):
            bestModel = 0
            bestAbs = abs(combinedTrain2Y[i] - trainPreds[topTags[0]][i])
            for j in range(0, len(topTags)):
                tag = topTags[j]
                modelAbs = abs(combinedTrain2Y[i] - trainPreds[tag][i])
                if modelAbs < bestAbs:
                    bestAbs = modelAbs
                    bestModel = j
            labelTrain2Y.append(bestModel)
            
        # Test rows over all_features; they become the classifier's test input.
        _, testX, _, _ = splitDataForXValidation(location, "location", data, all_features, "target")

        # Second-part training rows over all_features, in location2s order.
        tX2 = []
        for location2 in location2s:
            _, trainX2, _, _, _, _ = splitDataForXValidationSampled2(location, location2, "location", data, all_features, "target")
            for row in trainX2:
                tX2.append(row)
        
        # Append one prediction column per tag to every training row.
        # NOTE(review): this mutates the rows returned by the split helper in
        # place — confirm they are not shared with `data`.
        for tag in topTags:
            for i in range(0, len(trainPreds[tag])):
                tX2[i].append(trainPreds[tag][i]) 
        
        # Keep only the columns whose mask entry is truthy.
        # presumably all_columns is all_features followed by one entry per tag — verify
        reducedTrainX2 = []
        for d in tX2:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTrainX2.append(reducedD)
              
        # Classifier that predicts which tag's regressor to trust per sample.
        model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=15)
        model.fit(reducedTrainX2, labelTrain2Y)
        
        # Same column extension and masking for the test rows.
        for tag in topTags:
            for i in range(0, len(testPreds[tag])):
                testX[i].append(testPreds[tag][i]) 
        
        reducedTestX = []
        for d in testX:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTestX.append(reducedD)
         
        pred = model.predict(reducedTestX)
         
        # For each test row take the prediction of the tag the classifier chose.
        # NOTE(review): testY is whatever the most recent split above bound
        # (the combinedTrain2Y loop, or the testPreds loop when location2s is
        # empty); it is assumed to be the test targets for `location` — confirm.
        finalPrediction = []
        for i in range(0, len(testY)):
            p = testPreds[topTags[pred[i]]][i]
            finalPrediction.append(p)      
        rmse = str(rmseEval(testY, finalPrediction)[1])
        print("\tRMSE: " + str(rmse))
        
        for x in testY:
            overallY.append(x)
        for x in finalPrediction:
            overallPred.append(x)
    
    # Pooled score over every location; predictions are passed first here.
    rmse = rmseEval(overallPred, overallY)[1]
    return rmse
Пример #25
0
                testY[i] -
                predData[tag][str(location)][str(int(testTimestamp[i]))])
            if tagAbs < bestAbs:
                bestModel = tag
                bestAbs = tagAbs

        locationBestCounter[bestModel] = locationBestCounter[bestModel] + 1

        twPred = predData["TW"][str(location)][str(int(testTimestamp[i]))]
        twPredictions.append(twPred)

        bestPred = predData[bestModel][str(location)][str(int(
            testTimestamp[i]))]
        bestPredictions.append(bestPred)

    # print(str(locationBestCounter))
    rmse = rmseEval(testY, twPredictions)[1]
    print("\tTW rmse: " + str(rmse))
    rmse = rmseEval(testY, bestPredictions)[1]
    print("\tBest rmse: " + str(rmse))
    for tag in tags:
        bestCounter[tag] = bestCounter[tag] + locationBestCounter[tag]

# Report the accumulated counter per tag, highest count first; entries are
# (count, tag) pairs so equal counts are ordered by tag.
print("BestCounter:")
orderedBestCounter = sorted(
    [(bestCounter[tag], tag) for tag in tags], reverse=True)
for count, name in orderedBestCounter:
    print("\t" + name + ": " + str(count))
Пример #26
0
def eval_one(step):
    """Score one feature mask for a classifier choosing between two half-trained models.

    ``step`` is read position by position alongside the module-level
    ``all_features``; every position holding a truthy value keeps the matching
    feature as classifier input.

    For each index of ``groups`` the training stations are split by position
    into a first ("lower") and second ("upper") half, one model from
    ``create_model()`` is fitted per half on ``tw_features``, and a classifier
    from ``create_classifier_model()`` is trained on labels produced by
    ``generate_label`` from the row locations and the lower-half station list.
    Its predicted labels drive ``generate_combined_prediction``.

    Returns:
        ``(rmse, accuracy)`` where ``rmse`` is element ``[1]`` of ``rmseEval``
        over the pooled observations and combined predictions, and
        ``accuracy`` is ``accuracy_score`` over the pooled test labels and
        predicted labels.
    """

    def fit_and_predict(train_set, test_set):
        # Fit a fresh model on tw_features for the given training stations and
        # predict the test rows.
        x_train, x_test, y_train, _ = splitDataForXValidation(
            train_set, test_set, "location", data, tw_features, "target")
        regressor = create_model()
        regressor.fit(x_train, y_train)
        return regressor.predict(x_test)

    eval_features = [
        feature for position, feature in enumerate(all_features)
        if step[position]
    ]

    observed = []
    combined_all = []
    true_labels = []
    predicted_labels = []

    for fold in range(len(groups)):
        train_stations, test_stations = generate_train_test_station_list(
            fold, groups)
        train_station_set = {float(s) for s in train_stations}
        test_station_set = {float(s) for s in test_stations}

        # Positional split: indices below len/2 go to the lower half, the
        # rest to the upper half.
        train_mid = len(train_stations) / 2.0
        train_lower = [
            float(s) for pos, s in enumerate(train_stations)
            if pos < train_mid
        ]
        train_upper = [
            float(s) for pos, s in enumerate(train_stations)
            if pos >= train_mid
        ]
        test_mid = len(test_stations) / 2.0
        test_lower = [
            float(s) for pos, s in enumerate(test_stations) if pos < test_mid
        ]

        prediction_lower = fit_and_predict(set(train_lower), test_station_set)
        prediction_upper = fit_and_predict(set(train_upper), test_station_set)

        (clf_train_X, clf_test_X, _, fold_y, train_location,
         test_location) = splitDataForXValidationWithLocation(
             train_station_set, test_station_set, "location", data,
             eval_features, "target")
        train_label = generate_label(train_location, train_lower)
        test_label = generate_label(test_location, test_lower)

        classifier = create_classifier_model()
        classifier.fit(clf_train_X, train_label)
        prediction_label = classifier.predict(clf_test_X)

        combined_all.extend(
            generate_combined_prediction(prediction_label, prediction_lower,
                                         prediction_upper))
        observed.extend(fold_y)
        true_labels.extend(test_label)
        predicted_labels.extend(prediction_label)

    rmse = rmseEval(observed, combined_all)[1]
    accuracy = accuracy_score(true_labels, predicted_labels)

    return rmse, accuracy
Пример #27
0
    timestampData2.append(str(int(v)))

# modelling
# For every location: fit one regressor on featureTW (from `data`) and store
# its test predictions per timestamp, then fit a second regressor on
# featureTWAtc (from `data2`).
for location in locations:

    # This split variant also returns the timestamps of the train and test rows.
    # presumably `location` is held out as the test set — TODO confirm
    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, featureTW, "target", timestampData)
    print("\tT+W (on data without ATC) #train: " + str(len(trainY)) +
          ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    prediction = model.predict(testX)
    rmse = rmseEval(testY, prediction)[1]
    print("\trmse: " + str(rmse))
    # Store each test prediction under its location (as a string) and timestamp.
    # assumes TWpredictionData[str(location)] already exists or is created on
    # access — verify where TWpredictionData is built.
    for i in range(0, len(testY)):
        timestamp = testTimestamp[i]
        value = prediction[i]
        TWpredictionData[str(location)][timestamp] = value

    # Second model: same hyper-parameters, different data set, feature list
    # and timestamps.
    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data2, featureTWAtc, "target", timestampData2)
    print("\tT+W+Atc #train: " + str(len(trainY)) + ", #test:" +
          str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    # The fitted model is not used for prediction within the visible code.
    model.fit(trainX, trainY)
Пример #28
0
    'winddirection', 'windspeed', 'temperature', 'rain', 'pressure'
]

for location in locations:
    print("location: " + str(location))
    # save down trainX, trainY, testX, testY
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, columns, "target")
    print("\t#train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    testPrediction = model.predict(testX)
    testRmse = str(rmseEval(testY, testPrediction)[1])
    print("\tRFR+All rmse: " + str(testRmse))

    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, columnsTW, "target")
    print("\t#train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    testPrediction = model.predict(testX)
    testRmse = str(rmseEval(testY, testPrediction)[1])
    print("\tRFR+TW rmse: " + str(testRmse))

    for sr in [0.95, 0.9, 0.85, 0.8, 0.75, 0.7]:
Пример #29
0
                 all_features, trainX2)
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testX.csv",
                 all_features, testX)
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_trainY.csv",
                 ["target"], trainY2)
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testY.csv",
                 ["target"], testY)

    for dataGroup in generateAllDataGroups():
        tag, features = getTagAndFeatures(dataGroup)
        trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled(
            location, "location", sampleRate, 42, data, features, "target")
        model = RandomForestRegressor(min_samples_leaf=9,
                                      n_estimators=59,
                                      n_jobs=-1,
                                      random_state=42)
        model.fit(trainX1, trainY1)
        trainPrediction = model.predict(trainX2)
        testPrediction = model.predict(testX)
        trainRmse = str(rmseEval(trainY2, trainPrediction)[1])
        testRmse = str(rmseEval(testY, testPrediction)[1])
        print("\t" + tag + ": #train: " + str(len(trainY2)) + ", #test:" +
              str(len(testY)) + ", trainRMSE: " + trainRmse + ", testRMSE: " +
              testRmse)
        writeOutData(
            OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_trainPred_" +
            tag + ".csv", ["trainPred_" + tag], trainPrediction)
        writeOutData(
            OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testPred_" + tag +
            ".csv", ["testPred_" + tag], testPrediction)
Пример #30
0
output.write(
    "location,timestamp,obs,pred_TW,pred_TWA,pred_combined,combined_uses_tw_twa\n"
)

output_log = open(OUTPUT_LOG_FILE, 'w')

for location in locations:
    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, tw_features, "target", timestampData)
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    testPredictionTW = model.predict(testX)
    rmse = str(rmseEval(testY, testPredictionTW)[1])
    log(output_log, "\tTW rmse: " + rmse)
    for x in testY:
        allObs.append(x)
    for x in testPredictionTW:
        allPredictionTW.append(x)
    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, twa_features, "target", timestampData)
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    testPredictionTWA = model.predict(testX)
    rmse = str(rmseEval(testY, testPredictionTWA)[1])
    log(output_log, "\tTWA rmse: " + rmse)