def main():
    preprocess1 = PreprocessData(PreprocessType.STANDARDISATION_OVER_TENORS, short_end=True)
    preprocess2 = PreprocessData(PreprocessType.LOG_RETURNS_OVER_TENORS, short_end=True)

    # 1: get data and apply scaling
    (sets_training, sets_test, sets_training_scaled, sets_test_scaled,
     training_dataset_names, test_dataset_names, maturities) = preprocess1.get_data()
    print("sets_test_scaled, sets_training_scaled:", sets_test_scaled[0].shape, sets_training_scaled[0].shape)

    # 2: take log returns of the scaled data
    sets_encoded_log_training = preprocess2.scale_data(sets_training_scaled, training_dataset_names, should_fit=True)
    sets_encoded_log_test = preprocess2.scale_data(sets_test_scaled, test_dataset_names, should_fit=True)

    layers = [35, 35]  # number of hidden neurons in each layer of the encoder and decoder
    learning_rate = 0.01
    decay = 0  # learning rate decay
    num_input_features = 1  # dimensionality of the input at each time step; here a 1D signal
    num_output_features = 1  # dimensionality of the output at each time step; here a 1D signal
    # The input sequence does not have to be of the same dimension as the output sequence.
    loss = "mse"  # other loss functions are possible, see the Keras documentation
    # Regularisation isn't really needed for this application.
    lambda_regulariser = 0.000001  # unused if regulariser is None
    regulariser = None  # possible regulariser: keras.regularizers.l2(lambda_regulariser)
    batch_size = 512
    steps_per_epoch = 200  # batch_size * steps_per_epoch = total number of training examples
    epochs = 10
    input_sequence_length = 42  # length of the sequence fed to the encoder
    target_sequence_length = 42  # length of the sequence predicted by the decoder
    num_steps_to_predict = 42  # length to use when testing the model

    model = Model(layers, learning_rate, decay, num_input_features, num_output_features,
                  loss, lambda_regulariser, regulariser, batch_size, steps_per_epoch, epochs,
                  input_sequence_length, target_sequence_length, num_steps_to_predict)
    model.build()
    # model.load()
    model.train(sets_encoded_log_training)
    # model.predict_sequences_simple(np.vstack(sets_training_first_last_tenors))
    model.predict_sequences(sets_encoded_log_training)
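# A minimal sketch, not the project's actual batch generator, of how a 1D series could be
# cut into (encoder input, decoder target) windows matching input_sequence_length and
# target_sequence_length above. `make_seq2seq_windows` is an illustrative name; it assumes
# numpy is imported as np, as elsewhere in this module.
def make_seq2seq_windows(series, input_len=42, target_len=42):
    inputs, targets = [], []
    for start in range(len(series) - input_len - target_len + 1):
        inputs.append(series[start:start + input_len])
        targets.append(series[start + input_len:start + input_len + target_len])
    # the trailing axis of size 1 matches num_input_features = num_output_features = 1
    return np.array(inputs)[..., np.newaxis], np.array(targets)[..., np.newaxis]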
def test_two_preprocessing_methods(self):
    preprocess = PreprocessData(PreprocessType.STANDARDISATION_OVER_TENORS, short_end=True)
    preprocess2 = PreprocessData(PreprocessType.LOG_RETURNS_OVER_TENORS, short_end=True)

    (sets_training, sets_test, sets_training_scaled, sets_test_scaled,
     training_dataset_names, test_dataset_names, maturities) = preprocess.get_data()

    sets_encoded_log_test = preprocess2.scale_data(sets_test_scaled, test_dataset_names, should_fit=True)

    # start_value is required here; otherwise rescale_data would use the start value of the
    # original (unstandardised) data instead
    standardised_test_prediction = preprocess2.rescale_data(sets_encoded_log_test[0],
                                                            test_dataset_names[0],
                                                            start_value=sets_test_scaled[0][0],
                                                            index=sets_test_scaled[0].index.values)
    rescaled_test_prediction = preprocess.rescale_data(standardised_test_prediction, test_dataset_names[0])

    # plotting.plot_2d(sets_test[0], "gain_test_prediction_rescaled", curve2=rescaled_test_prediction, title=True)

    # undoing both scalings should reproduce the original test data
    np.testing.assert_allclose(rescaled_test_prediction, sets_test[0])
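# Why start_value matters, as a self-contained sketch (illustrative only; the real
# scale_data/rescale_data also handle DataFrames, fitting and multiple tenors): log
# returns drop the level of a series, so inverting them needs the first value back.
def to_log_returns(x):
    return np.log(x[1:] / x[:-1])

def from_log_returns(r, start_value):
    return start_value * np.exp(np.cumsum(np.insert(r, 0, 0.0)))

def test_log_return_roundtrip_sketch():
    curve = np.array([1.0, 1.1, 1.05, 1.2])
    np.testing.assert_allclose(from_log_returns(to_log_returns(curve), curve[0]), curve)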
def simulate(latent_dim=2, preprocess_type1=None, preprocess_type2=None,
             ae_model=None, gan_model=None, force_training=True, plot=False):
    preprocess1 = PreprocessData(preprocess_type1)
    preprocess2 = PreprocessData(preprocess_type2)

    # 1: get data and apply scaling
    (sets_training, sets_test, sets_training_scaled, sets_test_scaled,
     training_dataset_names, test_dataset_names, maturities) = preprocess1.get_data()

    if ae_model is AEModel.AAE:
        ae_params = {'preprocess_type': preprocess_type1.value,  # only to make preprocess_type part of the hash
                     'input_dim': sets_training_scaled[0].shape[1],  # 56
                     'latent_dim': latent_dim,
                     'hidden_layers': (56, 40, 28, 12, 4,),
                     'hidden_layers_discriminator': (2, 2,),
                     'leaky_relu': 0.1,
                     'last_activation': 'linear',
                     'last_activation_discriminator': 'sigmoid',
                     'loss_generator': 'mean_squared_error',
                     'loss_discriminator': 'binary_crossentropy',
                     'batch_size': 20,
                     'epochs': 20000}
        autoencoder = AdversarialAutoencoder(ae_params, plot=False)
    elif ae_model is AEModel.VAE:
        ae_params = {'preprocess_type': preprocess_type1.value,  # only to make preprocess_type part of the hash
                     'input_dim': sets_training_scaled[0].shape[1],  # 56
                     'latent_dim': latent_dim,
                     'hidden_layers': (56, 40, 28, 12, 4,),
                     'leaky_relu': 0.1,
                     'last_activation': 'linear',  # sigmoid or linear
                     'loss': 'mean_square_error',  # binary_crossentropy or mean_square_error
                     'epsilon_std': 1.0,
                     'batch_size': 20,
                     'epochs': 100,
                     'steps_per_epoch': 500}
        autoencoder = VariationalAutoencoder(ae_params, plot=False)
    elif ae_model is AEModel.AE:
        ae_params = {'preprocess_type': preprocess_type1.value,  # only to make preprocess_type part of the hash
                     'input_dim': sets_training_scaled[0].shape[1],  # 56
                     'latent_dim': latent_dim,
                     'hidden_layers': (56, 40, 28, 12, 4,),
                     'leaky_relu': 0.1,
                     'loss': 'mse',
                     'last_activation': 'linear',
                     'batch_size': 20,
                     'epochs': 100,
                     'steps_per_epoch': 500}
        autoencoder = Autoencoder(ae_params, plot=False)
    else:  # ae_model is AEModel.PCA
        ae_params = {'preprocess_type': preprocess_type1.value,  # only to make preprocess_type part of the hash
                     'latent_dim': latent_dim}
        autoencoder = PCAModel(ae_params, plot=False)
    ae_params_hash = hashlib.md5(json.dumps(ae_params, sort_keys=True).encode('utf-8')).hexdigest()

    # 2: train or load the autoencoder, then encode the data with it
    autoencoder.load_else_train(np.vstack(sets_training_scaled), sets_test_scaled, "ae_" + ae_params_hash)
    sets_encoded_training = autoencoder.encode(sets_training_scaled)
    sets_encoded_test = autoencoder.encode(sets_test_scaled)

    # 3: log returns of the encoded data
    sets_encoded_log_training = preprocess2.scale_data(sets_encoded_training, training_dataset_names, should_fit=True)
    sets_encoded_log_test = preprocess2.scale_data(sets_encoded_test, test_dataset_names, should_fit=True)

    num_z = 6 * 7
    num_c = 6 * 7
    num_o = 6 * 7
    if gan_model is GANModel.WGAN:
        gan_params = {'ae_params_hash': ae_params_hash,
                      'num_tenors': sets_encoded_log_training[0].shape[1],
                      'num_c': num_c,
                      'num_z': num_z,
                      'num_o': num_o,
                      'gen_model_type': 'standard',  # standard or conv
                      'dis_model_type': 'standard',  # standard or conv
                      'gen_layers': (4 * (6 * 7 * 2),),  # 4 * num_o * num_tenors
                      'dis_layers': (4 * (6 * 7),),  # 4 * num_o
                      'gen_last_activation': 'tanh',
                      'dis_last_activation': 'sigmoid',
                      'loss': 'binary_crossentropy',
                      'batch_size': 32,
                      'epochs': 10000,
                      'sample_interval': 1000}
        gan = CWGANGP(gan_params, plot=False)
    else:
        model_type = 'conv' if gan_model is GANModel.GAN_CONV else 'standard'
        gan_params = {'ae_params_hash': ae_params_hash,
                      'num_tenors': sets_encoded_log_training[0].shape[1],
                      'num_c': num_c,
                      'num_z': num_z,
                      'num_o': num_o,
                      'gen_model_type': model_type,
                      'dis_model_type': model_type,
                      'gen_layers': (4 * (6 * 7 * 2),),  # 4 * num_o * num_tenors
                      'dis_layers': (4 * (6 * 7),),  # 4 * num_o
                      'gen_last_activation': 'tanh',
                      'dis_last_activation': 'sigmoid',
                      'loss': 'binary_crossentropy',
                      'batch_size': 128,
                      'epochs': 20000}
        gan = GAN(gan_params, plot=False)  # try training on larger input and output
    gan_params_hash = hashlib.md5(json.dumps(gan_params, sort_keys=True).encode('utf-8')).hexdigest()

    if force_training:
        gan.train(sets_encoded_log_training, "gan_" + gan_params_hash)
    else:
        gan.load_else_train(sets_encoded_log_training, "gan_" + gan_params_hash)

    # 4: simulate on encoded log returns, conditioned on the test dataset
    num_simulations = 100
    num_repeats = 1
    generated, _ = gan.generate(condition=sets_encoded_log_test[-1], condition_on_end=False,
                                num_simulations=num_simulations, repeat=num_repeats)

    # insert the last real curve so that rescaling has a start value
    if preprocess_type2 is PreprocessType.LOG_RETURNS_OVER_TENORS:
        generated = np.insert(generated, 0, sets_encoded_log_test[-1].iloc[num_c], axis=1)

    # 5: undo the log-return scaling
    encoded_generated = preprocess2.rescale_data(generated,
                                                 start_value=sets_encoded_test[-1][num_c],
                                                 dataset_name=test_dataset_names[-1])
    if preprocess_type2 is PreprocessType.LOG_RETURNS_OVER_TENORS:
        encoded_generated = encoded_generated[:, 1:]  # remove the inserted curve again

    # 6: decode using the autoencoder
    decoded_generated_segments = autoencoder.decode(encoded_generated)

    # 7: undo the first scaling (which can itself be log returns)
    simulated = preprocess1.rescale_data(decoded_generated_segments,
                                         start_value=sets_test[-1].iloc[num_c],
                                         dataset_name=test_dataset_names[-1])
    preprocess1.enable_curve_smoothing = True
    simulated_smooth = preprocess1.rescale_data(decoded_generated_segments,
                                                start_value=sets_test[-1].iloc[num_c],
                                                dataset_name=test_dataset_names[-1])

    # `+1` because the log-returns rescaling adds one curve
    real = sets_test[-1].iloc[num_c:num_c + num_o * num_repeats + 1]

    print("simulated, real", simulated.shape, real.shape)
    smape_result = smape(simulated, real)
    smape_result_smooth = smape(simulated_smooth, real)
    print("smape_result_smooth mean and std:", np.mean(smape_result_smooth), np.std(smape_result_smooth))

    if plot:
        plotting = Plotting()
        plotting.plot_3d("real", real, show_title=False)
        cov_log_returns = cov_log_returns_over_tenors(real)
        plotting.plot_3d_cov("gan_real_cov", cov_log_returns, show_title=False)

        for i in np.arange(1, 11):
            # name = '_' + preprocess_type1.name + '_' + preprocess_type2.name + '_' + str(latent_dim) + '_' + ae_model.name + '_' + gan_model.name
            plotting.plot_3d("gan_simulated_" + str(i), simulated_smooth[i],
                             maturities=maturities, time=real.index.values, show_title=False)

            smape_result = smape(simulated_smooth[i], real)
            print("simulated_smooth[i], real", simulated_smooth[i].shape, real.shape)
            print("simulate rates", i)
            print("smape:", smape_result)
            print("=============\n")

            cov_log_returns = cov_log_returns_over_tenors(simulated_smooth[i])
            plotting.plot_3d_cov("gan_simulated_" + str(i) + "_cov", cov_log_returns,
                                 maturities=maturities, show_title=False)

    return smape_result_smooth
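# cov_log_returns_over_tenors is assumed to compute something like the covariance of the
# daily log returns across tenors; a minimal sketch under that assumption (the project's
# helper may differ in details such as annualisation or NaN handling):
def cov_log_returns_over_tenors_sketch(curves):
    curves = np.asarray(curves, dtype=float)  # (num_days, num_tenors)
    log_returns = np.log(curves[1:] / curves[:-1])  # (num_days - 1, num_tenors)
    return np.cov(log_returns, rowvar=False)  # (num_tenors, num_tenors)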
def simulate():
    plotting = Plotting()
    preprocess_minmax = PreprocessData()
    preprocess_logreturns = PreprocessData()
    preprocess_minmax.enable_min_max_scaler = True
    preprocess_logreturns.enable_log_returns = True

    # 1: get data and apply min-max scaling
    (sets_training, sets_test, sets_training_scaled, sets_test_scaled,
     training_dataset_names, test_dataset_names, maturities) = preprocess_minmax.get_data()
    print("sets_training_scaled.shape", sets_training_scaled[0].shape)

    autoencoder = DeepAutoencoder(input_shape=(sets_training_scaled[0].shape[1],), latent_dim=2)
    # autoencoder.train(np.vstack(sets_training_scaled), sets_test_scaled, epochs=100, batch_size=5)
    # autoencoder.save_model("deep_general_minimax")
    autoencoder.load_model("deep_general_minimax")

    # 2: encode data using the autoencoder
    sets_encoded_training = []
    for set_training_scaled in sets_training_scaled:
        sets_encoded_training.append(autoencoder.encode(set_training_scaled))
    sets_encoded_test = []
    for set_test_scaled in sets_test_scaled:
        sets_encoded_test.append(autoencoder.encode(set_test_scaled))
    plotting.plot_2d(sets_encoded_test[0], "encoded test data with deep autoencoder", save=False)

    # 3: log returns of the encoded data
    sets_encoded_log_training = []
    for set_encoded_training in sets_encoded_training:
        sets_encoded_log_training.append(preprocess_logreturns.scale_data(set_encoded_training))
    sets_encoded_log_test = []
    for set_encoded_test in sets_encoded_test:
        sets_encoded_log_test.append(preprocess_logreturns.scale_data(set_encoded_test))
    plotting.plot_2d(sets_encoded_log_test[0],
                     "encoded test data with deep autoencoder, then log returns", save=False)

    num_tenors = sets_encoded_log_training[0].shape[1]
    gan = GAN(num_c=6 * 7, num_z=6 * 7, num_o=6 * 7, num_tenors=num_tenors)  # try training on larger input and output
    # gan.train(sets_encoded_log_training, epochs=20000, batch_size=100, sample_interval=200)
    # gan.save_model("general_ae")
    gan.load_model("general_ae")

    print("sets_encoded_log_test[0].shape", sets_encoded_log_test[0].shape)

    # sanity check: a constant block should look fake to the discriminator
    test_arr = np.full([1, 6 * 7 + 6 * 7, num_tenors], 10)
    validity = gan.discriminator.predict(test_arr)
    print(validity)

    # score rolling windows of the real (encoded, log-returned) test data
    rolled_encoded_log_test = rolling_windows(sets_encoded_log_test[0], 6 * 7 + 6 * 7)
    validity = gan.discriminator.predict(rolled_encoded_log_test)
    print(validity)
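# rolling_windows is assumed to slice a (time, tenors) array into all overlapping windows
# of the given length, so each window can be scored by the discriminator; a sketch:
def rolling_windows_sketch(data, window_length):
    data = np.asarray(data)
    num_windows = data.shape[0] - window_length + 1
    return np.stack([data[i:i + window_length] for i in range(num_windows)])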
def simulate(latent_dim=2, preprocess_type1=None, preprocess_type2=None, ae_model=None, plot=False):
    preprocess1 = PreprocessData(preprocess_type1)
    preprocess2 = PreprocessData(preprocess_type2)

    # 1: get data and apply scaling
    (sets_training, sets_test, sets_training_scaled, sets_test_scaled,
     training_dataset_names, test_dataset_names, maturities) = preprocess1.get_data()

    if ae_model is AEModel.AAE:
        ae_params = {'preprocess_type': preprocess_type1.value,  # only to make preprocess_type part of the hash
                     'input_dim': sets_training_scaled[0].shape[1],  # 56
                     'latent_dim': latent_dim,
                     'hidden_layers': (56, 40, 28, 12, 4,),
                     'hidden_layers_discriminator': (2, 2,),
                     'leaky_relu': 0.1,
                     'last_activation': 'linear',
                     'last_activation_discriminator': 'sigmoid',
                     'loss_generator': 'mean_squared_error',
                     'loss_discriminator': 'binary_crossentropy',
                     'batch_size': 20,
                     'epochs': 20000}
        autoencoder = AdversarialAutoencoder(ae_params, plot=False)
    elif ae_model is AEModel.VAE:
        ae_params = {'preprocess_type': preprocess_type1.value,  # only to make preprocess_type part of the hash
                     'input_dim': sets_training_scaled[0].shape[1],  # 56
                     'latent_dim': latent_dim,
                     'hidden_layers': (56, 40, 28, 12, 4,),
                     'leaky_relu': 0.1,
                     'last_activation': 'linear',  # sigmoid or linear
                     'loss': 'mean_square_error',  # binary_crossentropy or mean_square_error
                     'epsilon_std': 1.0,
                     'batch_size': 20,
                     'epochs': 100,
                     'steps_per_epoch': 500}
        autoencoder = VariationalAutoencoder(ae_params, plot=False)
    elif ae_model is AEModel.AE:
        ae_params = {'preprocess_type': preprocess_type1.value,  # only to make preprocess_type part of the hash
                     'input_dim': sets_training_scaled[0].shape[1],  # 56
                     'latent_dim': latent_dim,
                     'hidden_layers': (56, 40, 28, 12, 4,),
                     'leaky_relu': 0.1,
                     'loss': 'mse',
                     'last_activation': 'linear',
                     'batch_size': 20,
                     'epochs': 100,
                     'steps_per_epoch': 500}
        autoencoder = Autoencoder(ae_params, plot=False)
    else:  # ae_model is AEModel.PCA
        ae_params = {'preprocess_type': preprocess_type1.value,  # only to make preprocess_type part of the hash
                     'latent_dim': latent_dim}
        autoencoder = PCAModel(ae_params, plot=False)
    ae_params_hash = hashlib.md5(json.dumps(ae_params, sort_keys=True).encode('utf-8')).hexdigest()

    # 2: train or load the autoencoder, then encode the data with it
    autoencoder.load_else_train(np.vstack(sets_training_scaled), sets_test_scaled, "ae_" + ae_params_hash)
    sets_encoded_training = autoencoder.encode(sets_training_scaled)
    sets_encoded_test = autoencoder.encode(sets_test_scaled)

    # 3: log returns of the encoded data
    sets_encoded_log_training = preprocess2.scale_data(sets_encoded_training, training_dataset_names, should_fit=True)
    sets_encoded_log_test = preprocess2.scale_data(sets_encoded_test, test_dataset_names, should_fit=True)

    # report the min/max of the encoded log returns for each dataset
    print("=" * 20)
    print(ae_model.name)
    print("\n")
    for set_encoded_log_training, training_dataset_name in zip(sets_encoded_log_training, training_dataset_names):
        print(training_dataset_name)
        print("min:", np.min(set_encoded_log_training.min()), "max:", np.max(set_encoded_log_training.max()))
    print("\n")
    for set_encoded_log_test, test_dataset_name in zip(sets_encoded_log_test, test_dataset_names):
        print(test_dataset_name)
        print("min:", np.min(set_encoded_log_test.min()), "max:", np.max(set_encoded_log_test.max()))
    print("\n")
    print("=" * 20)
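# Example driver (a sketch; the enum members and PreprocessType values are the ones
# referenced elsewhere in this codebase): compare the range of the encoded log returns
# across the autoencoder variants.
if __name__ == '__main__':
    for a_model in (AEModel.PCA, AEModel.AE, AEModel.VAE, AEModel.AAE):
        simulate(latent_dim=2,
                 preprocess_type1=PreprocessType.NORMALISATION_OVER_TENORS,
                 preprocess_type2=PreprocessType.LOG_RETURNS_OVER_TENORS,
                 ae_model=a_model)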
def simulate(latent_dim=2, preprocess_type1=None, preprocess_type2=None,
             ae_model=None, gan_model=None, force_training=True, plot=False):
    preprocess1 = PreprocessData(preprocess_type1, short_end=True)
    preprocess2 = PreprocessData(preprocess_type2, short_end=True)

    # 1: get data and apply scaling
    (sets_training, sets_test, sets_training_scaled, sets_test_scaled,
     training_dataset_names, test_dataset_names, maturities) = preprocess1.get_data()
    print("sets_test_scaled, sets_training_scaled:", sets_test_scaled[0].shape, sets_training_scaled[0].shape)

    # 2: take log returns of the scaled data
    sets_encoded_log_training = preprocess2.scale_data(sets_training_scaled, training_dataset_names, should_fit=True)
    sets_encoded_log_test = preprocess2.scale_data(sets_test_scaled, test_dataset_names, should_fit=True)

    num_c = 6 * 7
    num_o = 6 * 7
    if gan_model is GANModel.WGAN:
        gan_params = {'short_end_encoding': preprocess_type1.name + "_" + preprocess_type2.name,
                      'num_tenors': sets_encoded_log_training[0].shape[1],
                      'num_c': num_c,
                      'num_z': 6 * 7,
                      'num_o': num_o,
                      'gen_model_type': 'standard',  # standard or conv
                      'dis_model_type': 'standard',  # standard or conv
                      'gen_layers': (4 * (6 * 7 * 2),),  # 4 * num_o * num_tenors
                      'dis_layers': (4 * (6 * 7),),  # 4 * num_o
                      'gen_last_activation': 'tanh',
                      'dis_last_activation': 'sigmoid',
                      'loss': 'binary_crossentropy',
                      'batch_size': 32,
                      'epochs': 10000,
                      'sample_interval': 1000}
        gan = CWGANGP(gan_params, plot=False)
    else:
        model_type = 'conv' if gan_model is GANModel.GAN_CONV else 'standard'
        print("num tenors:", sets_encoded_log_training[0].shape[1])
        gan_params = {'short_end_encoding': preprocess_type1.name + "_" + preprocess_type2.name,
                      'num_tenors': sets_encoded_log_training[0].shape[1],
                      'num_c': num_c,
                      'num_z': 6 * 7,
                      'num_o': num_o,
                      'gen_model_type': model_type,
                      'dis_model_type': model_type,
                      'gen_layers': (4 * (6 * 7 * 2),),  # 4 * num_o * num_tenors
                      'dis_layers': (4 * (6 * 7),),  # 4 * num_o
                      'gen_last_activation': 'tanh',
                      'dis_last_activation': 'sigmoid',
                      'loss': 'binary_crossentropy',
                      'batch_size': 128,
                      'epochs': 20000}
        gan = GAN(gan_params, plot=False)  # try training on larger input and output
    gan_params_hash = hashlib.md5(json.dumps(gan_params, sort_keys=True).encode('utf-8')).hexdigest()

    if force_training:
        gan.train(sets_encoded_log_training, "gan_" + gan_params_hash)
    else:
        gan.load_else_train(sets_encoded_log_training, "gan_" + gan_params_hash)

    # 3: simulate on encoded log returns, conditioned on the test dataset
    num_simulations = 100
    num_repeats = 0
    print("sets_encoded_log_test[-1]", sets_encoded_log_test[-1].shape)
    generated, _ = gan.generate(condition=sets_encoded_log_test[-1], condition_on_end=False,
                                num_simulations=num_simulations, repeat=num_repeats)

    # insert the last real curve so that rescaling has a start value
    if preprocess_type2 is PreprocessType.LOG_RETURNS_OVER_TENORS:
        generated = np.insert(generated, 0, sets_encoded_log_test[-1].iloc[num_c], axis=1)

    print("sets_test_scaled[-1]", sets_test_scaled[-1].shape)
    print("sets_test_scaled[-1][num_c]", sets_test_scaled[-1].iloc[num_c])

    # 4: undo the log-return scaling
    encoded_generated = preprocess2.rescale_data(generated,
                                                 start_value=sets_test_scaled[-1].iloc[num_c],
                                                 dataset_name=test_dataset_names[-1])
    if preprocess_type2 is PreprocessType.LOG_RETURNS_OVER_TENORS:
        encoded_generated = encoded_generated[:, 1:]  # remove the inserted curve again

    # 5: undo the first scaling (which can itself be log returns)
    simulated = preprocess1.rescale_data(encoded_generated,
                                         start_value=sets_test[-1].iloc[num_c],
                                         dataset_name=test_dataset_names[-1])

    # `+1` because the log-returns rescaling adds one curve
    real = np.array(sets_test[-1])[num_c:num_c + num_o + 1]

    sim = simulated.reshape(num_simulations, num_o + 1)  # (100, 43)
    print("sets_test[-1].iloc[num_c], sim[0][0]", sets_test[-1].iloc[num_c], sim[0][0], sim[1][0], sim[2][0])
    print("real, simulated", real.shape, sim.shape)

    smape_result = smape(sim, real, over_curves=True)

    if plot:
        condition_and_real = sets_test[-1].iloc[0:num_c + num_o + 1]
        plotting = Plotting()
        plotting.plot_training_sample("simulated_simple", sim, condition_and_real, num_c, after_real_data=True)
        # print("smape test:", smape(simulated[0], real), smape_result)

    return smape_result
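# smape is assumed to implement the symmetric mean absolute percentage error, optionally
# reduced to one score per simulated curve; a minimal sketch under that assumption
# (the project's helper may scale or reduce differently):
def smape_sketch(forecast, actual, over_curves=False):
    forecast = np.asarray(forecast, dtype=float)
    actual = np.asarray(actual, dtype=float)
    ratio = 2.0 * np.abs(forecast - actual) / (np.abs(forecast) + np.abs(actual))
    if over_curves:
        return ratio.reshape(ratio.shape[0], -1).mean(axis=1)  # one score per curve
    return ratio.mean()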
def simulate():
    plotting = Plotting()
    preprocess_normalisation = PreprocessData()
    preprocess_logreturns = PreprocessData()
    preprocess_normalisation.enable_normalisation_scaler = True
    preprocess_logreturns.enable_log_returns = True

    # 1: get data and apply pre-processing
    (sets_training, sets_test, sets_training_scaled, sets_test_scaled,
     training_dataset_names, test_dataset_names, maturities) = preprocess_normalisation.get_data()

    ae_params = {'preprocess_type': PreprocessType.NORMALISATION_OVER_TENORS.value,
                 'input_dim': (10, sets_training_scaled[0].shape[1],),  # 56
                 'latent_dim': 2 * 56,
                 'hidden_layers': (12 * 56, 4 * 56,),
                 'leaky_relu': 0.1,
                 'loss': 'mse',
                 'last_activation': 'linear',
                 'batch_size': 5,
                 'epochs': 5,
                 'steps_per_epoch': 500}
    ae_params_hash = hashlib.md5(json.dumps(ae_params, sort_keys=True).encode('utf-8')).hexdigest()

    autoencoder = Autoencoder(ae_params)
    # autoencoder.train(np.vstack(sets_training_scaled), sets_test_scaled)
    # autoencoder.save_model("ae_" + ae_params_hash)
    autoencoder.load_else_train(sets_training_scaled, sets_test_scaled, "ae_" + ae_params_hash)

    # 2: encode data using the autoencoder
    sets_encoded_training = autoencoder.encode(sets_training_scaled)
    sets_encoded_test = autoencoder.encode(sets_test_scaled)
    print("sets_encoded_test", sets_encoded_test[0].shape)
    plotting.plot_2d(sets_encoded_test[0], "encoded test data with deep autoencoder", save=False)

    # 3: log returns of the encoded data
    sets_encoded_log_training = preprocess_logreturns.scale_data(sets_encoded_training)
    sets_encoded_log_test = preprocess_logreturns.scale_data(sets_encoded_test)
    plotting.plot_2d(sets_encoded_log_test[0],
                     "encoded test data with deep autoencoder, then log returns", save=False)

    num_c = 6 * 7
    num_o = 6 * 7
    gan_params = {'ae_params_hash': ae_params_hash,
                  'num_tenors': sets_encoded_log_training[0].shape[1],
                  'num_c': num_c,
                  'num_z': 6 * 7,
                  'num_o': num_o,
                  'gen_model_type': 'standard',  # standard or conv
                  'dis_model_type': 'standard',  # standard or conv
                  'gen_layers': (4 * (6 * 7 * 2),),  # 4 * num_o * num_tenors
                  'dis_layers': (4 * (6 * 7),),  # 4 * num_o
                  'gen_last_activation': 'tanh',
                  'dis_last_activation': 'sigmoid',
                  'loss': 'binary_crossentropy',
                  'batch_size': 128,
                  'epochs': 20000}
    gan_params_hash = hashlib.md5(json.dumps(gan_params, sort_keys=True).encode('utf-8')).hexdigest()

    gan = GAN(gan_params)  # try training on larger input and output
    # gan.train(sets_encoded_log_training, sample_interval=200)
    # gan.save_model("gan_" + gan_params_hash)
    gan.load_model("gan_" + gan_params_hash)

    # COV TEST, TEMPORARY
    # for name, set in zip(training_dataset_names, sets_training):
    #     print("name:", name)
    #     set_cov_log_returns_over_features = cov_log_returns_over_features(set)
    #     plotting.plot_3d_cov("covariance_time_series_" + name, set_cov_log_returns_over_features, show_title=False)
    #     plotting.plot_3d("time_series_" + name, set, maturities)
    # END COV TEST

    # 4: simulate on encoded log returns, conditioned on the test dataset
    num_simulations = 10
    num_repeats = 0
    generated, _ = gan.generate(condition=sets_encoded_log_test[-1], condition_on_end=False,
                                num_simulations=num_simulations, repeat=num_repeats)

    # insert the last real curve so that rescaling has a start value
    print("sets_encoded_log_test[-1][num_c] shape", sets_encoded_log_test[-1].iloc[num_c].shape)
    print("generated_segments.shape", generated.shape)
    generated = np.insert(generated, 0, sets_encoded_log_test[-1].iloc[num_c], axis=0)

    # 5: undo the log returns
    # todo: this start_value is actually one off! The error still persists... is the autoencoder causing the difference?
    encoded_generated = preprocess_logreturns.rescale_data(generated, start_value=sets_encoded_test[-1][num_c])
    encoded_generated = encoded_generated[:, 1:]  # remove the inserted curve again

    # 6: decode using the autoencoder
    decoded_generated_segments = autoencoder.decode(encoded_generated)

    # 7: undo the normalisation; for now only for the first simulation
    simulated = preprocess_normalisation.rescale_data(decoded_generated_segments,
                                                      dataset_name=test_dataset_names[-1])
    preprocess_normalisation.enable_curve_smoothing = True
    simulated_smooth = preprocess_normalisation.rescale_data(decoded_generated_segments,
                                                             dataset_name=test_dataset_names[-1])

    real = np.array(sets_test[-1])[num_c:num_c + num_o]

    print("simulated, real", simulated.shape, real.shape)
    smape_result = smape(simulated, real)
    smape_result_smooth = smape(simulated_smooth, real)
    print("smape_result and smooth:", smape_result, smape_result_smooth)
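# The hashing idiom used for the cache keys above, factored out as a sketch: sort_keys
# makes the JSON serialisation, and hence the hash, independent of dict insertion order,
# so identical parameter sets always map to the same saved model.
def params_hash(params):
    return hashlib.md5(json.dumps(params, sort_keys=True).encode('utf-8')).hexdigest()

# params_hash({'a': 1, 'b': 2}) == params_hash({'b': 2, 'a': 1})  # -> True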
from helpers.plotting import Plotting
from imputance.gain_model import gain
import numpy as np
import matplotlib.pyplot as plt
# note: the PreprocessData/PreprocessType import was missing in the original snippet;
# the module path below is assumed
from helpers.preprocess_data import PreprocessData, PreprocessType

if __name__ == '__main__':
    plotting = Plotting()
    preprocess = PreprocessData(PreprocessType.STANDARDISATION_OVER_TENORS, short_end=True)
    preprocess2 = PreprocessData(PreprocessType.LOG_RETURNS_OVER_TENORS, short_end=True)

    (sets_training, sets_test, sets_training_scaled, sets_test_scaled,
     training_dataset_names, test_dataset_names, maturities) = preprocess.get_data()

    sets_encoded_log_training = preprocess2.scale_data(sets_training_scaled, training_dataset_names, should_fit=True)
    sets_encoded_log_test = preprocess2.scale_data(sets_test_scaled, test_dataset_names, should_fit=True)

    train = sets_encoded_log_training[0].copy()
    test = sets_encoded_log_test[0].copy()

    # print("train.shape[1]", train.shape[1])
    # print("sets_test_scaled[0]", sets_test_scaled[0].shape)
    # print("sets_encoded_log_test[0]", sets_encoded_log_test[0].shape)

    params = {'mb_size': 128,  # minibatch size
              'p_miss': 0.5,  # doesn't do anything
def simulate():
    plotting = Plotting()
    preprocess_type = PreprocessType.STANDARDISATION_OVER_TENORS
    preprocess = PreprocessData(preprocess_type)

    # 1: get data and apply scaling
    (sets_training, sets_test, sets_training_scaled, sets_test_scaled,
     training_dataset_names, test_dataset_names, maturities) = preprocess.get_data()
    all_training_scaled = np.vstack(sets_training_scaled)

    ae_params = {'preprocess_type': preprocess_type.value,  # only to make preprocess_type part of the hash
                 'input_dim': sets_training_scaled[0].shape[1],  # 56
                 'latent_dim': 2,
                 'hidden_layers': (56, 40, 28, 12, 4,),
                 'leaky_relu': 0.1,
                 'loss': 'mse',
                 'last_activation': 'linear',
                 'batch_size': 20,
                 'epochs': 100,
                 'steps_per_epoch': 500}
    ae_params_hash = hashlib.md5(json.dumps(ae_params, sort_keys=True).encode('utf-8')).hexdigest()

    autoencoder = Autoencoder(ae_params)
    autoencoder.load_else_train(all_training_scaled, sets_test_scaled, "ae_" + ae_params_hash)

    # 2: encode and decode the test data, then evaluate the reconstruction
    encoded = autoencoder.encode(sets_test_scaled[0])
    decoded = autoencoder.decode(encoded)
    rescaled = preprocess.rescale_data(decoded, dataset_name=test_dataset_names[0])
    smape_result = smape(rescaled, np.array(sets_test[0]), over_curves=True)
    print("smape_result test set", np.mean(smape_result), np.std(smape_result),
          np.min(smape_result), np.max(smape_result))
    plotting.plot_2d(sets_test[0], "evaluation of test curves", timeseries=True,
                     evaluation=smape_result, title=False)

    # for i in np.arange(len(test_eval)):
    #     if test_eval[i] > 4:
    #         plotting.plot_2d(sets_test_scaled[0][i], "Possible unrealistic curve" + str(i), save=False, title=True)

    # 3: see how well the autoencoder reconstructs curves it was never trained on
    # todo: generate random curves, THEN apply min-max feature scaling, THEN evaluate
    curve_shape = 56
    unrealistic_curves = []
    # flat curves at various levels
    for level in (5, 10, 20, 50, 70, 100, 150, 200, 250, 300):
        unrealistic_curves.append(np.full(curve_shape, level))
    # step curves
    for left, right in ((50, 150), (100, 150), (100, 200)):
        unrealistic_curves.append(np.hstack((np.full(curve_shape // 2, left),
                                             np.full(curve_shape // 2, right))))
    # uniform noise over various ranges
    for low, high in ((0, 10), (10, 70), (0, 100), (100, 200), (200, 300),
                      (0, 200), (0, 250), (0, 300)):
        unrealistic_curves.append(np.random.uniform(low, high, curve_shape))
    # upward- and downward-sloping lines
    for start, end in ((0, 100), (50, 150), (100, 200), (150, 250), (200, 300),
                       (0, 200), (0, 300), (100, 0), (150, 50), (200, 100),
                       (250, 150), (300, 200), (200, 0), (300, 0)):
        unrealistic_curves.append(np.linspace(start, end, num=curve_shape))
    unrealistic_curves = np.array(unrealistic_curves)
    print("unrealistic_curves.shape", unrealistic_curves.shape)

    unrealistic_curves_scaled = preprocess.scale_data(unrealistic_curves,
                                                      dataset_name=training_dataset_names[0],
                                                      should_fit=True)
    encoded = autoencoder.encode(unrealistic_curves_scaled)
    decoded = autoencoder.decode(encoded)
    rescaled = preprocess.rescale_data(decoded, dataset_name=training_dataset_names[0])
    smape_result = smape(rescaled, unrealistic_curves, over_curves=True)

    # round x to n significant figures (assumes x > 0)
    round_to_n = lambda x, n: round(x, -int(np.floor(np.log10(x))) + (n - 1))
    print("smape results", smape_result)
    for a_smape_result in smape_result:
        print(round_to_n(a_smape_result, 2))

    plotting.plot_2d(smape_result, "loss of unrealistic curves from autoencoder SMAPE", save=False, title=True)
    # plotting.plot_2d(unrealistic_eval_mse, "loss of unrealistic curves from autoencoder MSE", save=False, title=True)
    plotting.plot_unrealisticness(unrealistic_curves, "loss of unrealistic curves from autoencoder",
                                  timeseries=True, evaluation=smape_result, title=False, eval_label="SMAPE")
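# Example usage (a sketch): fix the RNG seed so the random unrealistic curves are
# reproducible across runs, then run the evaluation.
if __name__ == '__main__':
    np.random.seed(42)
    simulate()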