Example #1
def support_vector(x_train, x_test, y_train, y_test):
    '''
    compare SVR kernels on the scaled data and report the r-squared per kernel
    '''
    x_train, x_test, y_train, y_test, scalar_x, scalar_y = scale_data(
        x_train, x_test, y_train, y_test)

    kernels = ['linear', 'poly', 'rbf', 'sigmoid']
    results_r2 = []
    results_r2_train = []
    for kernel in kernels:
        regressor_temp = SVR(kernel=kernel)
        regressor_temp.fit(x_train, y_train.flatten())
        preds = regressor_temp.predict(x_test)
        results_r2.append(r2_score(y_test, preds))
        # also score the training set to gauge over/underfitting
        preds_train = regressor_temp.predict(x_train)
        results_r2_train.append(r2_score(y_train, preds_train))
    kernels = np.array(kernels)
    results_r2 = np.array(results_r2)
    results_r2_train = np.array(results_r2_train)
    table_of_results = np.concatenate(
        (kernels.reshape(-1, 1), results_r2.reshape(-1, 1),
         results_r2_train.reshape(-1, 1)),
        axis=1)
    print(table_of_results)
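A minimal usage sketch (hypothetical call site; generate_data and its 8-tuple return value are assumed from the other examples in this module, and support_vector rescales its inputs internally via scale_data):

x_train, x_test, y_train, y_test, _, _, _, _ = generate_data(
    "dbs/data_2010s.csv")
support_vector(x_train, x_test, y_train, y_test)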
Example #2
def train(layers, loss_function, show_preds=False, scale_input=True):
    '''
    method for training FuriosaNet

    Parameters
    ==========
    `layers`:
        list of specified number of neurons per hidden layer

    `loss_function`:
        name of the Keras loss used for training and checkpoint monitoring

    Keyword Args
    ==========
    `show_preds`:
        default False; plot the rescaled predictions with their r-squared

    `scale_input`:
        default True; whether generate_data scales the input features
    '''

    # generate data with scaled input
    x_train, x_test, y_train, y_test, _, scalar_y, dataset, _ = generate_data(
        "dbs/data_2010s.csv", scale_input=scale_input)
    # x_train, x_test, y_train, y_test, _, scalar_y, dataset, _ = generate_data(
    #         "dbs/data_2010s.csv", drop_features=['title', 'tmdb_id', 'year', 'view_count', 'like_count', 'dislike_count', 'comment_count'])
    # define normalization string used in the saved weights filename
    normalization = ""
    if scale_input:
        normalization = "-norm"
    # define the layers of the model, excluding the output layer
    layers = [x_train.shape[1]] + layers
    # call the function that will build the model
    model = build_model(layers, loss_function=loss_function)
    # define what weights we want to save and how we want to save them
    callback = ModelCheckpoint(
        filepath=
        f"weights/nn{normalization}-{loss_function}-{stringify_model(layers)}-1-weights.h5",
        verbose=1,
        save_best_only=True,
        monitor="val_" + loss_function,
        save_weights_only=True)

    # train the network
    results = model.fit(x_train,
                        y_train,
                        batch_size=50,
                        epochs=100,
                        validation_data=(x_test, y_test),
                        callbacks=[callback])

    # plot the history of the model based on the loss_function
    plot_history(results, layers, loss_function, norm=normalization)

    if show_preds:
        # get the rescaled predictions
        predictions, actual_values = inverse_transform(model.predict(x_test),
                                                       y_test, scalar_y)
        # plot the predictions and get the r-squared value of the model
        r_squared = r2_score(actual_values, predictions)
        plot_predictions(predictions,
                         actual_values,
                         r_squared,
                         layers=layers,
                         norm=normalization)
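A minimal usage sketch, assuming the "mean_squared_error" loss name used elsewhere in this repo; the input layer size is prepended inside train, so only hidden layers are listed:

train([64, 32], "mean_squared_error", show_preds=True)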
Example #3
def test(weights_file,
         layers,
         data_file="dbs/data_2010s.csv",
         create_fig=True,
         show_fig=True,
         scale_input=True):
    '''
    test a model with specified `layers` architecture using the `weights_file`

    Parameters
    ==========
    `weights_file`:
        path to the .h5 file containing the pretrained weights

    `layers`:
        list of integer values specifying the number of nodes at each hidden layer
        except the final layer

    Keyword Args
    ==========
    `show_fig`:
        default True; display graph of actual values vs predictions

    Returns
    ==========
    (predictions, actual_values, dataset, test_indices)
    '''
    _, x_test, _, y_test, _, scalar_y, dataset, test_indices = generate_data(
        data_file, scale_input=scale_input)
    # _, x_test, _, y_test, _, scalar_y, dataset, _ = generate_data(
    #         "dbs/data_2010s.csv", drop_features=['title', 'tmdb_id', 'year', 'view_count', 'like_count', 'dislike_count', 'comment_count'])
    model = build_model(layers)
    model.load_weights(weights_file)
    predictions, actual_values = inverse_transform(model.predict(x_test),
                                                   y_test, scalar_y)
    r_squared = r2_score(actual_values, predictions)

    if create_fig:
        plot_predictions(predictions,
                         actual_values,
                         r_squared,
                         layers=layers,
                         best="best-",
                         save_fig=True,
                         show_fig=show_fig)
        df = create_df(predictions, dataset, test_indices)
        create_interactive_plot(df, model=layers)

    return predictions, actual_values, dataset, test_indices
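A minimal usage sketch; the weights path and the 26-feature input size are hypothetical, assuming stringify_model joins the layer sizes with hyphens as implied by the filename parsing in Example #4:

preds, actual, dataset, idx = test(
    "weights/nn-norm-mean_squared_error-26-64-32-1-weights.h5",
    [26, 64, 32],
    show_fig=False)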
Example #4
def overview_test(weights_file):
    _, x_test, _, y_test, _, scalar_y, dataset, test_indices = get_trailer_data(
        "dbs/data_2010s.csv", scale_input=True)
    # recover the hidden-layer sizes from the digits in the weights filename,
    # dropping the trailing run index
    layers = [
        int(val) for val in weights_file.split("/")[-1].split("-")
        if val.isdigit()
    ][:-1]
    model = build_model(layers)
    model.load_weights(weights_file)
    predictions, actual_values = inverse_transform(model.predict(x_test),
                                                   y_test, scalar_y)
    r_squared = r2_score(actual_values, predictions)
    print(r_squared)
    # plot_predictions(
    #     predictions, actual_values, r_squared, layers=layers, best="best-", save_fig=False, show_fig=show_fig)
    return predictions, actual_values, dataset, test_indices
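An illustration of the filename parsing above, assuming a name following the checkpoint pattern in Example #2; the trailing "1" is the run index that [:-1] drops:

name = "nn-norm-mean_squared_error-26-64-32-1-weights.h5"
print([int(val) for val in name.split("-") if val.isdigit()][:-1])  # [26, 64, 32]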
Example #5
def evaluate(metric, weights_folder, save_table=True, create_fig=False):
    '''
    method that will compare all models from a specified `weights_folder`
    based on `metric`

    Parameters
    ==========
    `metric`:
        name of the metric used to evaluate the models (currently "r-squared")

    `weights_folder`:
        path to the folder containing the weights of the models to be evaluated

    Keyword Args
    ==========
    `save_table`:
        default True; write the results to model-evaluation.csv instead of
        printing them

    `create_fig`:
        default False; placeholder for plotting per-model predictions (TODO)
    '''
    metric_functions = {
        "r-squared": r2_score,
    }
    predictions = None
    _, x_test, _, y_test, _, scalar_y, dataset, _ = generate_data(
        "dbs/data_2010s.csv", scale_input=True)

    models = dict()
    weights_files = os.listdir(weights_folder)
    for weights_file in weights_files:
        layers = get_layers_from_file(weights_file)
        model = build_model(layers)
        model.load_weights(os.path.join(weights_folder, weights_file))
        predictions, actual_values = inverse_transform(model.predict(x_test),
                                                       y_test, scalar_y)
        r2 = metric_functions[metric](actual_values, predictions)
        if create_fig:
            # TODO: plot get some plottable points to be layers on one graph
            print()
        models[weights_file] = r2

    # TODO: show points plotted by each model maybe?
    if create_fig:
        print()
    if save_table:
        pd.DataFrame.from_dict(
            models, orient='index',
            columns=["r-squared"]).to_csv("model-evaluation.csv")
    else:
        print(f"{'Model':^40s}| {metric}")
        for k in models:
            print(f"{k:^40s}| {models[k]:0.3f}")
    return models
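A minimal usage sketch (the folder path is hypothetical): print the per-model table instead of writing model-evaluation.csv:

models = evaluate("r-squared", "weights", save_table=False)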
Example #6
def overview_mlp(layers):
    '''
    train an MLP on trailer features joined with the overview Bayes features
    '''
    # generate data with scaled input
    x_train, x_test, y_train, y_test, _, scalar_y, dataset, test_indices = get_trailer_data(
        "dbs/data_2010s.csv", scale_input=True)
    x_train, x_test = join_with_overview_bayes(x_train, x_test, dataset,
                                               test_indices)
    loss_function = "mean_squared_error"
    # define the layers of the model, excluding the output layer
    layers = [x_train.shape[1]] + layers
    # call the function that will build the model
    model = build_model(layers, loss_function=loss_function)
    # define what weights we want to save and how we want to save them
    callback = ModelCheckpoint(
        filepath=
        f"weights/prereleased-overview-{loss_function}-nn-{stringify_model(layers)}-1-weights.h5",
        verbose=1,
        save_best_only=True,
        monitor="val_" + loss_function,
        save_weights_only=True)

    # train the network
    results = model.fit(x_train,
                        y_train,
                        batch_size=50,
                        epochs=50,
                        validation_data=(x_test, y_test),
                        callbacks=[callback])

    # plot the history of the model based on the loss_function
    plot_history(results,
                 layers,
                 loss_function,
                 norm="-prerelease-overview",
                 show_fig=True)

    # get the rescaled predictions
    predictions, actual_values = inverse_transform(model.predict(x_test),
                                                   y_test, scalar_y)
    r_squared = r2_score(actual_values, predictions)
    print(r_squared)
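A minimal usage sketch; as in train, the input layer size is prepended inside overview_mlp, so only hidden layers are listed:

overview_mlp([64, 32])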
Example #7
def svr(save_fig=False, compare=False):
    '''
    main function for testing Support Vector Regression models

    Keyword Args
    ==========
    `save_fig`:
        if True, save a matplotlib figure and a Plotly interactive figure
        of the predictions

    `compare`:
        if True, compare the trailer-data models against the baseline
        feature set
    '''
    x_train, x_test, y_train, y_test, _, scalar_y, dataset, test_indices = generate_data(
        "dbs/data_2010s.csv")

    # get parameters for comparing to the "baseline"
    if compare:
        # regenerate the data without the trailer statistics to form the baseline
        baseline_data = generate_data(
            "dbs/data_2010s.csv",
            drop_features=[
                'title', 'tmdb_id', 'year', 'view_count',
                'like_count', 'dislike_count', 'comment_count'])
        x_train_b, x_test_b, y_train_b, y_test_b, _, _, _, _ = baseline_data
    # define kernels that will be compared
    kernels = ['linear', 'poly', 'rbf', 'sigmoid']

    # initialize lists that will be used to print results
    results_r2 = []
    baseline_results = []

    for kernel in kernels:
        # get preds and r_squared value per model
        preds, r_squared = test(kernel, x_train, x_test, y_train, y_test)
        # add it to the table
        results_r2.append(r_squared)
        if compare:
            # get the r_squared value per model for the baseline data,
            # discarding its preds so the trailer preds aren't overwritten
            _, baseline_r2 = test(kernel, x_train_b, x_test_b, y_train_b, y_test_b)
            # add it to the table
            baseline_results.append(baseline_r2)
        if save_fig:
            # get the actual values of the revenue
            # using the inverse transform of the data with scalar_y
            preds, actual = inverse_transform(preds, y_test, scalar_y)
            plot_predictions(preds, actual, r_squared, model=f"SVR-{kernel}")
            test_df = create_df(preds, dataset, test_indices)
            create_interactive_plot(test_df, model=f"SVR-{kernel}")
        # to compare without the outlier, uncomment the following line:
        # remove_ghostbusters(test_df)

    # create a table of results
    kernels = np.array(kernels)
    results_r2 = np.array(results_r2)
    table_of_results = np.concatenate(
        (kernels.reshape(len(kernels), 1), results_r2.reshape(len(results_r2), 1)),
        axis=1)
    print(table_of_results)

    if compare:
        baseline_results = np.array(baseline_results)
        table_of_results = np.concatenate(
            (kernels.reshape(len(kernels), 1), baseline_results.reshape(len(baseline_results), 1)),
            axis=1)
        print(table_of_results)
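A minimal usage sketch: compare all four kernels against the baseline feature set and save a figure per kernel:

svr(save_fig=True, compare=True)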