Пример #1
0
def plot_tuning_results(mses_train, ml_name, input_path, scaler, factor):
    """
    Draw a box plot of the scaled train MSEs and save it as a PNG file.

    The image is written to ``<input_path>/<ml_name>/<scaler>/`` under the
    name ``<ml_name>_<scaler>_<current time>.png``. Both directories are
    passed to ``create_directories`` first. The current figure is cleared
    afterwards; nothing is shown on screen.
    :param mses_train: numpy array of train MSEs
    :param ml_name: machine learning model name (used in title and path)
    :param input_path: tuning directory under which the plot is stored
    :param scaler: scaler string (used in path and file name)
    :param factor: constant every MSE is multiplied by before plotting
    :return: None
    """

    outline_color = "black"
    fill_color = "lightblue"
    outlier_color = "blue"

    # patch_artist=True is required so that the boxes accept a face colour
    boxes = plt.boxplot(np.transpose(mses_train) * factor, patch_artist=True)
    for part in ('boxes', 'whiskers', 'fliers', 'medians', 'caps'):
        plt.setp(boxes[part], color=outline_color)
    plt.setp(boxes["boxes"], facecolor=fill_color)
    plt.setp(boxes["fliers"], markeredgecolor=outlier_color)

    title_text = "Anomaly prediction over the simulator data set - {0} Model".format(
        ml_name)
    y_label_text = "Testing log(MSE)s of the records = Actual MSE * {0}".format(
        factor)
    plt.title(title_text)
    plt.ylabel(y_label_text)
    plt.xlabel("Setting of n_prev")

    # Leave 30% head-room below the smallest and above the largest value
    lower_limit = 0.7 * np.amin(mses_train * factor)
    upper_limit = 1.3 * np.amax(mses_train * factor)

    figure = plt.gcf()
    figure.set_size_inches(12, 9)
    axes = plt.gca()
    axes.set_ylim([lower_limit, upper_limit])

    # Directory layout: <input_path>/<ml_name>/<scaler>/
    model_dir = os.path.join(input_path, ml_name)
    create_directories(model_dir)
    scaler_dir = os.path.join(model_dir, scaler)
    create_directories(scaler_dir)

    timestamp = get_current_time()
    image_name = str(ml_name) + '_' + str(scaler) + '_' + str(timestamp) + '.png'
    image_path = os.path.join(str(scaler_dir), image_name)

    plt.savefig(image_path)
    # Reset the current figure so the next plot starts from a blank canvas
    plt.clf()
Пример #2
0
def model_tuning(file_path, input_features, target_features, window_size,
                 scaler, results_path, model_name):
    """
    Tune a time series regression model with GridSearchCV and save the result.

    The train CSV is read, the input and target columns are cleaned and
    normalized, the data is split with ``train_test_split`` and a
    ``TimeSeriesRegressor`` wrapping the requested model is grid-searched
    on the train part. The best parameters and score are dumped to
    ``<results_path>/<current time>-<model_name>-model_data.json``.
    :param file_path: data file path
    :param input_features: the list of features which the user chose for the train
    :param target_features: the list of features which the user chose for the test
    :param window_size: window size variable (passed as ``n_prev``)
    :param scaler: scaler name
    :param results_path: directory the JSON summary is written to
    :param model_name: model name
    :return: tuple ``(best params, best score)`` taken from the grid search
    """

    df_train = pd.read_csv(f'{file_path}')

    # Step 1 : Select and clean the train data set
    input_df_train = clean_data(df_train[input_features])
    target_df_train = clean_data(df_train[target_features])

    # Step 2: Normalize the data
    # Only the first element returned by normalize_data is used here
    X = normalize_data(data=input_df_train, scaler=scaler)[0]
    Y = normalize_data(data=target_df_train, scaler=scaler)[0]

    # NOTE(review): train_test_split is called with its default settings;
    # confirm that its default splitting strategy is intended for windowed
    # time series data.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

    # Step 3: Grid search over the model's parameter grid
    model = get_model(model_name)
    model_grid_params = get_model_params(model_name)
    tsr = TimeSeriesRegressor(model, n_prev=window_size)
    grid_search = GridSearchCV(tsr, model_grid_params)
    grid_search.fit(X_train, Y_train)

    # The prediction is only consumed by the disabled plotting code at the
    # end of this function; it is kept so that code can be re-enabled as is.
    prediction = grid_search.predict(X_test)

    print(str(model_name) + " " + str(grid_search.best_params_))

    # Step 4: Persist the tuning summary as JSON
    current_time = get_current_time()
    file_name = str(current_time) + "-" + str(model_name) + "-model_data.json"
    data = {
        'model': model_name,
        "input_features": input_features,
        "target_features": target_features,
        'params': grid_search.best_params_,
        'score': grid_search.best_score_,
    }

    # A separate name is used so the ``file_path`` argument is not shadowed
    output_path = os.path.join(str(results_path), str(file_name))
    with open(output_path, 'w') as outfile:
        json.dump(data, outfile)

    # Y_test_preprocessed = tsr._preprocess(X_test, Y_test)[1]
    #
    # for i, target_feature in enumerate(target_features):
    #     title = "Grid search test performance of " + model_name + " for window size: " + \
    #             str(window_size) + " and " + target_feature + " feature"
    #     plot_prediction_performance(Y_train=Y_test_preprocessed[:, i],
    #                                 X_pred=prediction[:, i],
    #                                 results_path=results_path,
    #                                 title=title)

    return data['params'], data['score']
def run_model(training_data_path, test_data_path, results_path,
              similarity_score, save_model, new_model_running, algorithm_path,
              threshold, features_list, target_features_list,
              train_scaler_path, target_scaler_path, event):
    """
    Run SVR model process

    For every flight route found under ``test_data_path`` the model is
    (optionally) trained and then evaluated once per similarity function.
    The TPR, FPR, accuracy and delay scores of each evaluation are written
    as CSV files to
    ``<results_path>/svr/<current time>/<similarity>/<flight route>/``, and
    ``report_results`` is finally called once per similarity function.
    :param training_data_path: train data set directory path
    :param test_data_path: test data set directory path
    :param results_path: results directory path
    :param similarity_score: chosen similarity functions
    :param save_model: indicator whether the user want to save the model or not
    :param new_model_running: indicator whether we are in new model creation flow or not
    :param algorithm_path: path of existing algorithm (load model flow only)
    :param threshold: saved threshold for load model flow; in the new model
        flow it is replaced by the value from get_svr_new_model_parameters
    :param features_list:  saved chosen features for load model flow
    :param target_features_list: all the features in the test data set for the target
    :param train_scaler_path: path of existing input train scaler directory
    :param target_scaler_path: path of existing input target scaler directory
    :param event: running state flag
    :return: None, the results are written to disk and reported via report_results
    """

    # Choose between new model creation flow and load existing model flow
    if new_model_running:
        (kernel, gamma, epsilon, threshold,
         window_size) = get_svr_new_model_parameters()
    else:
        svr_model = _load_pickled_object(algorithm_path)
        X_train_scaler = _load_pickled_object(train_scaler_path)
        Y_train_scaler = _load_pickled_object(target_scaler_path)
        window_size = svr_model.n_prev
        # A loaded model is not re-trained, so there is no training data
        X_train = None
        Y_train = None

    flight_routes = get_subdirectories(test_data_path)

    current_time = get_current_time()
    current_time_path = os.path.join(str(results_path), 'svr',
                                     str(current_time))
    create_directories(current_time_path)

    # Create sub directories for each similarity function
    for similarity in similarity_score:
        create_directories(os.path.join(current_time_path, str(similarity)))

    # File name suffix of every score table, in the order
    # (tpr, fpr, acc, delay) in which the tables are written
    score_suffixes = ('_tpr.csv', '_fpr.csv', '_acc.csv', '_delay.csv')

    # Train the model for each flight route
    for flight_route in flight_routes:

        # Execute training for new model flow
        if new_model_running:
            (svr_model, X_train_scaler, Y_train_scaler, X_train,
             Y_train) = execute_train(
                 flight_route,
                 training_data_path=training_data_path,
                 kernel=kernel,
                 gamma=gamma,
                 epsilon=epsilon,
                 features_list=features_list,
                 window_size=window_size,
                 target_features_list=target_features_list,
                 event=event)

        # Get results for each similarity function
        for similarity in similarity_score:
            current_results_path = os.path.join(current_time_path,
                                                str(similarity),
                                                str(flight_route))
            create_directories(current_results_path)
            (tpr_scores, fpr_scores, acc_scores, delay_scores,
             routes_duration, attacks_duration) = execute_predict(
                 flight_route,
                 test_data_path=test_data_path,
                 similarity_score=similarity,
                 threshold=threshold,
                 svr_model=svr_model,
                 X_train_scaler=X_train_scaler,
                 results_path=current_results_path,
                 add_plots=True,
                 run_new_model=new_model_running,
                 X_train=X_train,
                 features_list=features_list,
                 target_features_list=target_features_list,
                 save_model=save_model,
                 Y_train_scaler=Y_train_scaler,
                 Y_train=Y_train,
                 window_size=window_size,
                 event=event)

            score_tables = (tpr_scores, fpr_scores, acc_scores, delay_scores)
            for scores, suffix in zip(score_tables, score_suffixes):
                _write_scores_csv(scores, current_results_path, flight_route,
                                  suffix)

    algorithm_name = "SVR"

    # Report the collected results to csv files, once per similarity function.
    # NOTE(review): routes_duration / attacks_duration hold the values of the
    # LAST execute_predict call only, and are unbound (NameError) when no
    # flight route was processed - confirm this is what report_results expects.
    for similarity in similarity_score:
        report_similarity_path = os.path.join(current_time_path,
                                              str(similarity))
        report_results(report_similarity_path, test_data_path,
                       flight_routes, algorithm_name, similarity,
                       routes_duration, attacks_duration)


def _load_pickled_object(path):
    """Load one pickled object from *path*, always closing the file."""
    # NOTE(security): unpickling can execute arbitrary code; only load
    # model / scaler files that come from a trusted source.
    with open(path, 'rb') as pickled_file:
        return pickle.load(pickled_file)


def _write_scores_csv(scores, directory, flight_route, suffix):
    """Write *scores* to ``<directory>/<flight_route><suffix>`` without index."""
    csv_path = os.path.join(str(directory), str(flight_route) + suffix)
    pd.DataFrame(scores).to_csv(csv_path, index=False)
def model_tuning(file_path, input_features, target_features, window_size,
                 scaler, results_path):
    """
    Tune the LSTM autoencoder over a list of parameter configurations.

    Each configuration from ``get_lstm_params_configurations`` is used to
    build and fit a model on the train split; the model is then scored on
    the test split with the mean of the per-sample 'MSE' anomaly scores.
    The configuration with the lowest mean score is printed and stored in
    ``<results_path>/<current time>-LSTM-model_data.json``.
    :param file_path: data file  path
    :param input_features: the list of features which the user chose for the train
    :param target_features: the list of features which the user chose for the test
    :param window_size: window size variable (used as the number of timesteps)
    :param scaler: scaler name
    :param results_path: results path
    :return: tuple ``(best configuration as string, its mean score)``
    """

    train_frame = pd.read_csv(f'{file_path}')

    input_frame = train_frame[input_features]
    target_frame = train_frame[target_features]

    # Only the first element returned by normalize_data is used here
    X = normalize_data(data=input_frame, scaler=scaler)[0]
    Y = normalize_data(data=target_frame, scaler=scaler)[0]

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

    assert len(X_train) == len(Y_train)
    assert len(X_test) == len(Y_test)

    # Reshape every split with the project's LSTM windowing helper
    train_inputs = get_training_data_lstm(X_train, window_size)
    test_inputs = get_training_data_lstm(X_test, window_size)
    train_targets = get_training_data_lstm(Y_train, window_size)
    test_targets = get_training_data_lstm(Y_test, window_size)

    # Maps str(configuration) -> mean anomaly score on the test split
    config_scores = dict()

    for config in get_lstm_params_configurations():
        encoding_dimension, activation, loss, optimizer, epochs = config

        candidate = get_lstm_autoencoder_model(
            timesteps=window_size,
            input_features=input_frame.shape[1],
            target_features=target_frame.shape[1],
            encoding_dimension=encoding_dimension,
            activation=activation,
            loss=loss,
            optimizer=optimizer)
        candidate.fit(train_inputs, train_targets, epochs=epochs, verbose=0)

        predictions = candidate.predict(test_inputs)

        sample_scores = [
            anomaly_score_multi(test_targets[index], predicted, 'MSE')
            for index, predicted in enumerate(predictions)
        ]
        config_scores[str(config)] = mean(sample_scores)

    # Ascending by score: the first entry is the best configuration
    ranked = sorted(config_scores.items(), key=lambda entry: entry[1])
    best_config, best_score = ranked[0]
    print(best_config)
    print(best_score)

    timestamp = get_current_time()
    json_name = str(timestamp) + "-LSTM-model_data.json"
    data = {
        'model': 'LSTM',
        "input_features": input_features,
        "target_features": target_features,
        "window_size": window_size,
        'params': best_config,
        'score': best_score,
    }

    json_path = os.path.join(str(results_path), str(json_name))
    with open(json_path, 'w') as outfile:
        json.dump(data, outfile)

    return data['params'], data['score']