def support_vector(x_train, x_test, y_train, y_test):
    '''
    fit one SVR per kernel on the scaled data and print its R-squared scores

    Parameters
    ==========
    `x_train`, `x_test`: feature arrays; scaled here through `scale_data`

    `y_train`, `y_test`: target arrays; scaled here through `scale_data`

    Prints a (kernel, test R-squared) table followed by a
    (kernel, train R-squared) table. Nothing is returned.
    '''
    # the two scaler objects returned by scale_data are not needed here
    x_train, x_test, y_train, y_test, _, _ = scale_data(
        x_train, x_test, y_train, y_test)

    kernels = ['linear', 'poly', 'rbf', 'sigmoid']
    results_r2 = []
    results_r2_train = []

    for kernel in kernels:
        regressor_temp = SVR(kernel=kernel)
        # fit is given the targets flattened to one dimension
        regressor_temp.fit(x_train, y_train.flatten())

        # score on the held-out split
        results_r2.append(r2_score(y_test, regressor_temp.predict(x_test)))
        # score on the training split, to compare against the test score
        results_r2_train.append(
            r2_score(y_train, regressor_temp.predict(x_train)))

    kernel_column = np.array(kernels).reshape(-1, 1)

    # test scores: same table the function has always printed
    table_of_results = np.concatenate(
        (kernel_column, np.array(results_r2).reshape(-1, 1)), axis=1)
    print(table_of_results)

    # train scores were collected but never reported before
    table_of_train_results = np.concatenate(
        (kernel_column, np.array(results_r2_train).reshape(-1, 1)), axis=1)
    print(table_of_train_results)
def train(layers, loss_function, show_preds=False, scale_input=True):
    '''
    method for training FuriosaNet

    Parameters
    ==========
    `layers`: list of specified number of neurons per layer; the number of
        input features is prepended here, the output layer is excluded

    `loss_function`: name of the loss passed to `build_model`; it is also
        used in the weights file name and as the monitored quantity
        (`"val_" + loss_function`) of the checkpoint

    Keyword Args:
    ==========
    `show_preds`: default False; if True, plot the rescaled test predictions
        together with their R-squared value

    `scale_input`: default True; forwarded to `generate_data` and, when
        True, adds the "-norm" tag to the saved file names
    '''
    # generate data with scaled input
    x_train, x_test, y_train, y_test, _, scalar_y, dataset, _ = generate_data(
        "dbs/data_2010s.csv", scale_input=scale_input)

    # x_train, x_test, y_train, y_test, _, scalar_y, dataset, _ = generate_data(
    #     "dbs/data_2010s.csv", drop_features=['title', 'tmdb_id', 'year', 'view_count', 'like_count', 'dislike_count', 'comment_count'])

    # define normalization string for specifying saved files
    normalization = "-norm" if scale_input else ""

    # define the layers of the model, excluding the output layer
    # (builds a new list, so the caller's `layers` is left untouched)
    layers = [x_train.shape[1]] + layers

    # call the function that will build the model
    model = build_model(layers, loss_function=loss_function)

    # define what weights we want to save and how we want to save them
    # NOTE(review): the monitored key assumes the model logs a metric named
    # after `loss_function` -- confirm against `build_model`
    callback = ModelCheckpoint(
        filepath=
        f"weights/nn{normalization}-{loss_function}-{stringify_model(layers)}-1-weights.h5",
        verbose=1,
        save_best_only=True,
        monitor="val_" + loss_function,
        save_weights_only=True)

    # train the network; the test split doubles as the validation data
    results = model.fit(x_train,
                        y_train,
                        batch_size=50,
                        epochs=100,
                        validation_data=(x_test, y_test),
                        callbacks=[callback])

    # plot the history of the model based on the loss_function
    plot_history(results, layers, loss_function, norm=normalization)

    if show_preds:
        # get the rescaled predictions
        predictions, actual_values = inverse_transform(model.predict(x_test),
                                                       y_test, scalar_y)

        # R-squared is not symmetric: ground truth goes first and the
        # predictions second (sklearn's `r2_score(y_true, y_pred)` order)
        r_squared = r2_score(actual_values, predictions)

        # plot the predictions along with the r-squared value of the model
        plot_predictions(predictions,
                         actual_values,
                         r_squared,
                         layers=layers,
                         norm=normalization)
def test(weights_file,
         layers,
         data_file="dbs/data_2010s.csv",
         create_fig=True,
         show_fig=True,
         scale_input=True):
    '''
    test a model with specified `layers` architecture using the `weights_file`

    Parameters
    ==========
    `weights_file`: path to the .h5 file containing the pretrained weights

    `layers`: list of integer values specifying the number of nodes at
        each layer except the final layer; passed to `build_model` as is

    Keyword Args:
    ==========
    `data_file`: default "dbs/data_2010s.csv"; file handed to `generate_data`

    `create_fig`: default True; create the predictions plot and the
        interactive plot

    `show_fig`: default True; display graph of actual values vs predictions

    `scale_input`: default True; forwarded to `generate_data`

    Returns
    ==========
    (predictions, actual_values, dataset, test_indices)
    '''
    _, x_test, _, y_test, _, scalar_y, dataset, test_indices = generate_data(
        data_file, scale_input=scale_input)

    # _, x_test, _, y_test, _, scalar_y, dataset, _ = generate_data(
    #     "dbs/data_2010s.csv", drop_features=['title', 'tmdb_id', 'year', 'view_count', 'like_count', 'dislike_count', 'comment_count'])

    # rebuild the architecture, then load the pretrained weights into it
    model = build_model(layers)
    model.load_weights(weights_file)

    # bring predictions and targets back to their original scale
    predictions, actual_values = inverse_transform(model.predict(x_test),
                                                   y_test, scalar_y)

    # R-squared is not symmetric: ground truth goes first and the
    # predictions second (sklearn's `r2_score(y_true, y_pred)` order)
    r_squared = r2_score(actual_values, predictions)

    if create_fig:
        plot_predictions(predictions,
                         actual_values,
                         r_squared,
                         layers=layers,
                         best="best-",
                         save_fig=True,
                         show_fig=show_fig)

        df = create_df(predictions, dataset, test_indices)
        create_interactive_plot(df, model=layers)

    return predictions, actual_values, dataset, test_indices
def overview_test(weights_file):
    '''
    test a pretrained model on the data returned by `get_trailer_data`
    and print its R-squared value

    Parameters
    ==========
    `weights_file`: path to the weights file; the architecture is read from
        the file name: every "-"-separated token made only of digits is
        taken as a layer size and the last such token is dropped
        (presumably the single output node -- confirm against the naming
        scheme of the saved weights)

    Returns
    ==========
    (predictions, actual_values, dataset, test_indices)
    '''
    _, x_test, _, y_test, _, scalar_y, dataset, test_indices = get_trailer_data(
        "dbs/data_2010s.csv", scale_input=True)

    # recover the layer sizes from the file name, e.g. "...-10-5-1-weights.h5"
    file_name = weights_file.split("/")[-1]
    layers = [int(val) for val in file_name.split("-") if val.isdigit()][:-1]

    model = build_model(layers)
    model.load_weights(weights_file)

    # bring predictions and targets back to their original scale
    predictions, actual_values = inverse_transform(model.predict(x_test),
                                                   y_test, scalar_y)

    # R-squared is not symmetric: ground truth goes first and the
    # predictions second (sklearn's `r2_score(y_true, y_pred)` order)
    r_squared = r2_score(actual_values, predictions)
    print(r_squared)

    # plot_predictions(
    #     predictions, actual_values, r_squared, layers=layers, best="best-", save_fig=False, show_fig=show_fig)

    return predictions, actual_values, dataset, test_indices
def evaluate(metric, weights_folder, save_table=True, create_fig=False):
    '''
    score every saved model found in `weights_folder` with `metric`

    Parameters
    ==========
    `metric`: key selecting the scoring function; "r-squared" is the only
        one available

    `weights_folder`: path to the folder holding the weights files of the
        models that are to be compared

    Keyword Args:
    ==========
    `save_table`: default True; write the scores to "model-evaluation.csv",
        otherwise print them as a table

    `create_fig`: default False; plotting is still a TODO, so for now this
        only prints blank lines

    Returns
    ==========
    dict mapping each weights file name to its score
    '''
    scorers = {"r-squared": r2_score}

    _, x_test, _, y_test, _, scalar_y, _, _ = generate_data(
        "dbs/data_2010s.csv", scale_input=True)

    models = {}
    for weights_file in os.listdir(weights_folder):
        # rebuild the architecture encoded in the file name, then load it
        model = build_model(get_layers_from_file(weights_file))
        model.load_weights(os.path.join(weights_folder, weights_file))

        # compare on the original (un-scaled) target values
        predictions, actual_values = inverse_transform(
            model.predict(x_test), y_test, scalar_y)
        models[weights_file] = scorers[metric](actual_values, predictions)

        if create_fig:
            # TODO: plot get some plottable points to be layers on one graph
            print()

    # TODO: show points plotted by each model maybe?
    if create_fig:
        print()

    if not save_table:
        print(f"{'Model':^40s}| {metric}")
        for name, score in models.items():
            print(f"{name:^40s}| {score:0.3f}")
    else:
        score_table = pd.DataFrame.from_dict(models,
                                             orient='index',
                                             columns=["r-squared"])
        score_table.to_csv("model-evaluation.csv")

    return models
def overview_mlp(layers):
    '''
    train a network on the `get_trailer_data` features joined with the
    output of `join_with_overview_bayes`, then print the test R-squared

    Parameters
    ==========
    `layers`: list of specified number of neurons per layer; the number of
        input features is prepended here, the output layer is excluded
    '''
    # generate data with scaled input
    x_train, x_test, y_train, y_test, _, scalar_y, dataset, test_indices = get_trailer_data(
        "dbs/data_2010s.csv", scale_input=True)

    # NOTE(review): presumably adds overview-based features to both splits
    # -- confirm against `join_with_overview_bayes`
    x_train, x_test = join_with_overview_bayes(x_train, x_test, dataset,
                                               test_indices)

    loss_function = "mean_squared_error"

    # define the layers of the model, excluding the output layer
    # (builds a new list, so the caller's `layers` is left untouched)
    layers = [x_train.shape[1]] + layers

    # call the function that will build the model
    model = build_model(layers, loss_function=loss_function)

    # define what weights we want to save and how we want to save them
    callback = ModelCheckpoint(
        filepath=
        f"weights/prereleased-overview-{loss_function}-nn-{stringify_model(layers)}-1-weights.h5",
        verbose=1,
        save_best_only=True,
        monitor="val_" + loss_function,
        save_weights_only=True)

    # train the network; the test split doubles as the validation data
    results = model.fit(x_train,
                        y_train,
                        batch_size=50,
                        epochs=50,
                        validation_data=(x_test, y_test),
                        callbacks=[callback])

    # plot the history of the model based on the loss_function
    plot_history(results,
                 layers,
                 loss_function,
                 norm="-prerelease-overview",
                 show_fig=True)

    # get the rescaled predictions
    predictions, actual_values = inverse_transform(model.predict(x_test),
                                                   y_test, scalar_y)

    # R-squared is not symmetric: ground truth goes first and the
    # predictions second (sklearn's `r2_score(y_true, y_pred)` order)
    r_squared = r2_score(actual_values, predictions)
    print(r_squared)
def svr(save_fig=False, compare=False):
    '''
    main function for testing Support Vector Regression models

    Keyword Args
    ===========
    `save_fig`: if True, save a matplotlib figure AND plotly interactive figure of predictions

    `compare`: if true, compare values with trailer data to the baseline

    Prints a (kernel, R-squared) table and, when `compare` is set, a second
    table holding the baseline scores. Nothing is returned.
    '''
    x_train, x_test, y_train, y_test, _, scalar_y, dataset, test_indices = generate_data(
        "dbs/data_2010s.csv")

    # get parameters for comparing to the "baseline": same data file with
    # the listed features dropped
    if compare:
        baseline_data = generate_data("dbs/data_2010s.csv",
                                      drop_features=[
                                          'title', 'tmdb_id', 'year',
                                          'view_count', 'like_count',
                                          'dislike_count', 'comment_count'
                                      ])
        x_train_b, x_test_b, y_train_b, y_test_b, _, _, _, _ = baseline_data

    # define kernels that will be compared
    kernels = ['linear', 'poly', 'rbf', 'sigmoid']

    # initialize lists that will be used to print results
    results_r2 = []
    baseline_results = []

    for kernel in kernels:
        # get preds and r_squared value per model
        # NOTE(review): `test` is called here as
        # (kernel, x_train, x_test, y_train, y_test) and unpacked into two
        # values -- confirm which `test` helper is in scope at this point
        preds, r_squared = test(kernel, x_train, x_test, y_train, y_test)

        # add it to the table
        results_r2.append(r_squared)

        if compare:
            # get the r_squared value per model for the baseline data; the
            # baseline predictions get their own name so that they do not
            # replace `preds`, which is what gets plotted below
            _baseline_preds, baseline_r2 = test(kernel, x_train_b, x_test_b,
                                                y_train_b, y_test_b)

            # add it to the table
            baseline_results.append(baseline_r2)

        if save_fig:
            # get the actual values of the revenue
            # using the inverse transform of the data using scalar_y
            preds, actual = inverse_transform(preds, y_test, scalar_y)
            plot_predictions(preds, actual, r_squared, model=f"SVR-{kernel}")
            test_df = create_df(preds, dataset, test_indices)
            create_interactive_plot(test_df, model=f"SVR-{kernel}")

        # to compare without the outlier, uncomment the following line:
        # remove_ghostbusters(df)

    # create a table of results
    kernels = np.array(kernels)
    results_r2 = np.array(results_r2)
    table_of_results = np.concatenate(
        (kernels.reshape(len(kernels), 1),
         results_r2.reshape(len(results_r2), 1)),
        axis=1)
    print(table_of_results)

    if compare:
        # same layout as above, this time with the baseline scores
        baseline_results = np.array(baseline_results)
        table_of_results = np.concatenate(
            (kernels.reshape(len(kernels), 1),
             baseline_results.reshape(len(baseline_results), 1)),
            axis=1)
        print(table_of_results)