def plot_value_dist(self, w0, n):
    """Plot the distribution of simulated values against the AAV benchmark.

    Draws the distribution of whatever ``self.value_acc(w0, n)`` returns
    (presumably ``n`` Monte Carlo value samples -- confirm against
    ``value_acc``), a red dashed line at the sample mean and a green dashed
    line at ``-AAV(self.params).u(self.params.lambda0, w0)``.

    Args:
        w0: Starting balance forwarded to ``value_acc`` and ``AAV.u``.
        n: Forwarded unchanged to ``value_acc``.
    """
    benchmark = -AAV(self.params).u(self.params.lambda0, w0)
    samples = self.value_acc(w0, n)

    figure, axes = plt.subplots()
    sns.distplot(samples, ax=axes)
    reference_lines = ((np.mean(samples), 'red'), (benchmark, 'green'))
    for level, colour in reference_lines:
        axes.axvline(level, color=colour, linestyle='--')
    figure.show()
def plot_value_dist(self, l0, w0, n):
    """Plot simulated values started at ``(l0, w0)`` against the AAV benchmark.

    Draws the distribution of ``self.value_acc(l0, w0, n)``, a red dashed
    line at its mean and a green dashed line at ``-AAV(self.params).u(l0, w0)``.
    The axes title is built from ``self.name`` and ``self.dt``.

    Args:
        l0: Starting intensity forwarded to ``value_acc`` and ``AAV.u``.
        w0: Starting balance forwarded to ``value_acc`` and ``AAV.u``.
        n: Forwarded unchanged to ``value_acc``.

    Returns:
        The object returned by ``self.value_acc(l0, w0, n)``.
    """
    benchmark = -AAV(self.params).u(l0, w0)
    samples = self.value_acc(l0, w0, n)

    figure, axes = plt.subplots()
    sns.distplot(samples, ax=axes)
    reference_lines = ((np.mean(samples), 'red'), (benchmark, 'green'))
    for level, colour in reference_lines:
        axes.axvline(level, color=colour, linestyle='--')
    axes.set_title(' dt:'.join((self.name, str(self.dt))))
    figure.show()
    return samples
def sim_value(self, l, w, n=5000):
    """Plot simulated value mean +/- one std per entry of ``self.dts``.

    For every entry of ``self.dts`` the simulator's ``value_acc(l, w, n)`` is
    called and the mean and standard deviation of its result are recorded.
    The means are plotted against ``self.dts`` with a one-standard-deviation
    band, together with a horizontal line at the analytic benchmark
    ``-AAV(...).u(l, w)``.

    Args:
        l: Starting intensity; used for both the simulation and the benchmark.
        w: Starting balance; used for both the simulation and the benchmark.
        n: Forwarded unchanged to ``value_acc`` (default 5000).
    """
    # Force a float dtype: zeros_like would otherwise inherit an integer
    # dtype from ``self.dts`` and truncate the statistics.
    means = np.zeros_like(self.dts, dtype=float)
    stds = np.zeros_like(self.dts, dtype=float)
    for i, _dt in enumerate(self.dts):
        # NOTE(review): the step size ``_dt`` is never handed to the
        # simulator, so every iteration runs with the same settings. How the
        # simulator's step size should be set is not visible here -- confirm
        # and wire it in.
        vals = self.simulator.value_acc(l, w, n)
        means[i] = np.mean(vals)
        stds[i] = np.std(vals)

    aav = AAV(self.simulator.env.params)
    # Benchmark at the simulated starting state (previously evaluated at
    # params.lambda0, which disagreed with the simulation whenever
    # l != lambda0).
    u = -aav.u(l, w)

    fig, ax = plt.subplots()
    ax.plot(self.dts, means, marker='x')
    ax.fill_between(self.dts, means - stds, means + stds, alpha=0.5,
                    color='salmon')
    ax.plot(self.dts, np.full_like(means, u))
    fig.show()
def test3():
    """Compare simulated episode rewards with the analytic account value.

    Runs 1000 episodes of a default ``CollectionsEnv`` always taking action 0,
    sums the rewards of each episode, and plots the per-episode totals with a
    dashed line at ``-AAV(Parameters()).u(1, 100)``. The analytic value and
    the mean episode total are printed.

    NOTE(review): the benchmark is evaluated at lambda0=1, w0=100; this
    presumably matches the environment's default starting state -- confirm.
    """
    lambda0 = 1
    w0 = 100
    p = Parameters()
    env = CollectionsEnv()
    n_epochs = 1000

    reward_per_episode = []
    for _ in range(n_epochs):
        env.reset()
        tot_reward_per_episode = 0
        terminal = False
        while not terminal:
            # Action 0 on every step.
            _, reward, terminal, _ = env.step(0)
            tot_reward_per_episode += reward
        reward_per_episode.append(tot_reward_per_episode)

    exact_v = -AAV(p).u(lambda0, w0)
    plt.plot(reward_per_episode, marker='x')
    plt.axhline(exact_v, color='red', linestyle='--')
    plt.show()
    print(exact_v)
    print(np.mean(reward_per_episode))
def construct_lattice(env, config, initialize=False):
    """Build and compile a two-tower deep lattice model over (lambda, w).

    Two structurally identical towers ("left" and "right") are built from the
    same 2-feature input; each stacks calibrators and lattices down to one
    calibrated output, and the two outputs are concatenated. The model is
    compiled with mean squared error and the Adam optimizer.

    When ``initialize`` is true the model is additionally fitted to targets
    derived from ``AAV(env.params).u`` on a 100 x 100 grid, and diagnostic
    plots (fitted surfaces and the arg-max policy map) are shown.

    NOTE: the original docstring quoted 832 parameters; that figure predates
    the right-tower wiring fix below and has not been re-verified.

    Args:
        env: Environment providing ``w0``, ``MAX_LAMBDA``,
            ``MIN_ACCOUNT_BALANCE``, ``params``, ``observation`` and
            ``action``.
        config: Configuration object; only ``normalize_states`` is read.
        initialize: Whether to pre-fit the model and show diagnostic plots.

    Returns:
        The compiled (and optionally pre-fitted) Keras model.
    """
    max_w, max_l = env.w0, env.MAX_LAMBDA
    lattice_units_layer = [3, 3, 1]
    n_lattice_points = [3, 3, 3]

    def make_input_calibrator(upper_bound, convexity):
        """Create one increasing input calibrator on [0, upper_bound]."""
        return tfl.layers.PWLCalibration(
            input_keypoints=np.linspace(0, upper_bound, num=100),
            dtype=tf.float32,
            output_min=0.0,
            output_max=n_lattice_points[0] - 1.0,
            monotonicity='increasing',
            convexity=convexity)

    # Calibrators block: one (lambda, w) calibrator pair per first-layer
    # lattice unit and per tower.
    combined_calibratorsl = []
    combined_calibratorsr = []
    for _ in range(lattice_units_layer[0]):
        lambda_left = make_input_calibrator(max_l, 'concave')
        balance_left = make_input_calibrator(max_w, 'convex')
        lambda_right = make_input_calibrator(max_l, 'concave')
        balance_right = make_input_calibrator(max_w, 'convex')
        combined_calibratorsl += [lambda_left, balance_left]
        combined_calibratorsr += [lambda_right, balance_right]
    input_calibrators_left = tfl.layers.ParallelCombination(
        combined_calibratorsl, single_output=True)
    input_calibrators_right = tfl.layers.ParallelCombination(
        combined_calibratorsr, single_output=True)

    # Functional-API model: the input is repeated once per first-layer unit.
    inputs = keras.Input(shape=(2,))
    repeated_input = keras.layers.RepeatVector(lattice_units_layer[0])(inputs)
    repeated_input = keras.layers.Flatten()(repeated_input)

    def build_tower(input_calibrators):
        """Stack calibrators and lattices on the shared repeated input."""
        x = input_calibrators(repeated_input)
        x = keras.layers.Reshape((lattice_units_layer[0], 2))(x)
        x = tfl.layers.Lattice(
            units=lattice_units_layer[0],
            lattice_sizes=[n_lattice_points[0]] * 2,
            monotonicities=2 * ['increasing'],
            output_min=0,
            output_max=1)(x)
        x = keras.layers.RepeatVector(lattice_units_layer[1])(x)
        # Each tower flattens its OWN first-layer output. The original right
        # tower flattened the left tower's tensor here, which cut the right
        # tower's first calibrators and lattice out of the model.
        x = keras.layers.Flatten()(x)
        x = tfl.layers.PWLCalibration(
            input_keypoints=np.linspace(0, 1, num=100),
            units=lattice_units_layer[0] * lattice_units_layer[1],
            dtype=tf.float32,
            output_min=0.0,
            output_max=n_lattice_points[1] - 1.0,
            monotonicity='increasing')(x)
        x = keras.layers.Reshape(
            (lattice_units_layer[1], lattice_units_layer[0]))(x)
        x = tfl.layers.Lattice(
            units=lattice_units_layer[1],
            lattice_sizes=[n_lattice_points[1]] * lattice_units_layer[0],
            monotonicities=['increasing'] * lattice_units_layer[0],
            output_min=0,
            output_max=1)(x)
        x = tfl.layers.PWLCalibration(
            units=lattice_units_layer[1] * lattice_units_layer[2],
            input_keypoints=np.linspace(0, 1, 100),
            dtype=tf.float32,
            monotonicity='increasing',
            output_min=0,
            output_max=n_lattice_points[2] - 1.0)(x)
        x = tfl.layers.Lattice(
            units=lattice_units_layer[2],
            lattice_sizes=[n_lattice_points[2]] * lattice_units_layer[1],
            # Sized like lattice_sizes (was lattice_units_layer[0]; both are
            # 3 today, so the built layer is unchanged).
            monotonicities=['increasing'] * lattice_units_layer[1],
            output_min=0,
            output_max=1)(x)
        return tfl.layers.PWLCalibration(
            units=lattice_units_layer[2],
            input_keypoints=np.linspace(0, 1, 100),
            dtype=tf.float32,
            monotonicity='increasing')(x)

    left_output = build_tower(input_calibrators_left)
    right_output = build_tower(input_calibrators_right)

    final = keras.layers.Concatenate()([left_output, right_output])
    model = keras.Model(inputs=inputs, outputs=final, name="combined")
    model.compile(loss=tf.keras.losses.mean_squared_error,
                  optimizer='adam',
                  metrics=['mean_absolute_percentage_error'])

    if initialize:
        # Works only for one discrete action (two outputs: act 0 / act 1).
        aav = AAV(env.params)
        raw_w = np.linspace(0, env.w0, 100)
        raw_l = np.linspace(0, env.MAX_LAMBDA, 100)
        if config.normalize_states:
            # NOTE(review): the normalized lambda axis starts at the
            # observation of params.lambda0 while the raw axis used for the
            # targets starts at 0 -- confirm this pairing is intended.
            lowbounds = env.observation(
                np.array([env.params.lambda0, env.MIN_ACCOUNT_BALANCE]))
            highbounds = env.observation(
                np.array([env.MAX_LAMBDA, env.w0]))
            ws = np.linspace(lowbounds[1], highbounds[1], 100)
            ls = np.linspace(lowbounds[0], highbounds[0], 100)
        else:
            ws, ls = raw_w, raw_l

        # Targets: column 'target' for action 0, 'target2' for action 1.
        jump = env.action(1)
        features = []
        for i, w_raw in enumerate(raw_w):
            for j, l_raw in enumerate(raw_l):
                value_hold = -aav.u(l_raw, w_raw)
                value_act = -aav.u(l_raw + jump, w_raw) - jump * env.params.c
                features.append([ls[j], ws[i], value_hold, value_act])
        dataset = pd.DataFrame(features,
                               columns=['l', 'w', 'target', 'target2'])

        train_rows = dataset.sample(frac=1.0, random_state=0)
        train_inputs = train_rows[['l', 'w']].to_numpy()
        train_labels = train_rows[['target', 'target2']].to_numpy()
        callback = tf.keras.callbacks.EarlyStopping(monitor='loss',
                                                    patience=5)
        model.fit(train_inputs, train_labels, epochs=50,
                  validation_split=0.1, shuffle=False, verbose=True,
                  callbacks=[callback])

        # Fitted surfaces on the training grid, predicted in one batch.
        # np.asarray accepts both tensors and ndarrays, whichever this Keras
        # version returns from predict_on_batch.
        ww, ll = np.meshgrid(ws, ls)
        grid_obs = np.column_stack([ll.ravel(), ww.ravel()])
        grid_pred = np.asarray(model.predict_on_batch(grid_obs))
        lattice_pred = grid_pred[:, 0].reshape(ww.shape)
        lattice_pred2 = grid_pred[:, 1].reshape(ww.shape)

        fig, ax = plt.subplots(ncols=2, nrows=1)
        CS = ax[0].contour(ww, ll, lattice_pred)
        ax[0].clabel(CS, inline=1, fontsize=10)
        ax[0].set_title('vf')
        CS = ax[1].contour(ww, ll, lattice_pred2)
        ax[1].clabel(CS, inline=1, fontsize=10)
        ax[1].set_title('vf')
        plt.show()

        # Arg-max policy map on a raw (w, lambda) grid. The plotted states
        # are the ones evaluated (the original indexed the training grid with
        # this grid's indices, so the axes did not match the inputs).
        w_points = 60
        l_points = 60
        l = np.linspace(0, 5, l_points)
        w = np.linspace(0, 200, w_points)
        ww, ll = np.meshgrid(w, l)
        policy_states = np.column_stack([ll.ravel(), ww.ravel()])
        if config.normalize_states:
            policy_obs = np.array(
                [env.observation(state) for state in policy_states])
        else:
            policy_obs = policy_states
        policy_q = np.asarray(model.predict_on_batch(policy_obs))
        z = np.argmax(policy_q, axis=1).reshape(ww.shape).astype(float)

        fig, ax = plt.subplots()
        cdict = {
            'red': ((0.0, 0.25, .25), (0.02, .59, .59), (1., 1., 1.)),
            'green': ((0.0, 0.0, 0.0), (0.02, .45, .45), (1., .97, .97)),
            'blue': ((0.0, 1.0, 1.0), (0.02, .75, .75), (1., 0.45, 0.45))
        }
        cm = m.colors.LinearSegmentedColormap('my_colormap', cdict, 1024)
        im = ax.pcolor(ww, ll, z, cmap=cm)
        fig.colorbar(im, ax=ax)
        fig.show()

    return model
return (0.001 * (w - shift)).clip(min=0) elif np.isscalar(w): return np.maximum(0.001 * (w - shift), 0) else: raise ValueError('Stupid error in w.') l0 = 1 w0 = 100 params = Parameters() chp = CHP(starting_balance=w0, starting_intensity=l0, params=params, collection_horizon=100, value_precision_thershold=1e-3, control_function=None) print(chp.calculate_value_single()) chp.plot_statespace() chp.plot_intensity() chp.plot_policy() chp.plot_balance() chp.test_increments() mc_vals = chp.calculate_value(10000, returntype='array') print(np.mean(mc_vals)) aav = AAV(params) print(aav.u(l0, w0)) sns.distplot(mc_vals) plt.show()
def __init__(self, env, name, config=None, training=True):
    """Initialise the parent agent, then attach an ``AAV`` instance.

    Args:
        env: Forwarded to the parent constructor.
        name: Forwarded to the parent constructor.
        config: Forwarded to the parent constructor (default ``None``).
        training: Forwarded to the parent constructor (default ``True``).
    """
    super().__init__(
        env=env,
        name=name,
        config=config,
        training=training,
    )
    # Built from the environment parameters the parent stored on self.env.
    self.aav = AAV(self.env.params)
class DQNAgentBaselined(DQNAgent):
    """DQN agent whose TD target is shifted by ``AAV.u`` of the current state."""

    def __init__(self, env, name, config=None, training=True):
        """Initialise the parent agent and build the ``AAV`` helper."""
        super().__init__(
            env=env,
            name=name,
            config=config,
            training=training,
        )
        self.aav = AAV(self.env.params)

    def train(self, *args, **kwargs):
        """Run one double-DQN gradient step on a sampled replay batch.

        The greedy next action is chosen with ``main_net`` and evaluated with
        ``target_net``. The target is the discounted bootstrap value plus the
        reward plus ``self.aav.u(*self.env.convert_back(s))`` for each sampled
        state ``s``. With prioritized replay enabled the per-sample loss is
        weighted by the sampled weights and the priorities are refreshed.

        Positional and keyword arguments are accepted but not used.

        Returns:
            The sum of the element-wise losses (``0.5 * td_error ** 2``).
        """
        sample = self.memory.sample(self.config.batch_size)
        states, actions, rewards = sample['s'], sample['a'], sample['r']
        next_states, dones = sample['s_next'], sample['done']
        use_priorities = self.config.prioritized_memory_replay
        if use_priorities:
            idx = sample['indices']
            weights = sample['weights']

        dqn_variable = self.main_net.trainable_variables
        network_input_to_watch = tf.Variable(
            tf.convert_to_tensor(states, dtype='float32'))

        with tf.GradientTape() as tape:
            tape.watch(dqn_variable)
            # The inner tape watches the network input; it was used by an
            # input-gradient penalization that is currently disabled.
            with tf.GradientTape() as tape_inner:
                tape_inner.watch(network_input_to_watch)

                # Double DQN (intentional): pick the action with the main
                # network, score it with the target network.
                selector_q = self.main_net(next_states)
                next_action = np.argmax(selector_q.numpy(), axis=1)
                target_q = self.target_net(next_states)
                next_mask = tf.one_hot(next_action, self.act_size)
                bootstrap = tf.reduce_sum(next_mask * target_q, axis=1)

                discounted = (1 - dones) * self.config.gamma * bootstrap
                baseline = [
                    self.aav.u(*self.env.convert_back(s)) for s in states
                ]
                target_value = discounted + rewards + baseline

                main_q = self.main_net(network_input_to_watch)
                taken_mask = tf.one_hot(actions, self.act_size)
                main_value = tf.reduce_sum(taken_mask * main_q, axis=1)

            td_error = target_value - main_value
            element_wise_loss = tf.square(td_error) * 0.5
            if use_priorities:
                error = tf.reduce_mean(element_wise_loss * weights)
            else:
                error = tf.reduce_mean(element_wise_loss)

        dqn_grads = tape.gradient(error, dqn_variable)

        # Update priorities
        if use_priorities:
            new_priorities = np.abs(td_error.numpy()) + self.config.prior_eps
            self.memory.update_priorities(idx, new_priorities)

        self.optimizer.apply_gradients(zip(dqn_grads, dqn_variable))

        # Logging
        self.global_step += 1
        self._tb_log_holder = {
            'Gradients': dqn_grads[0],
            'Weights': tf.convert_to_tensor(self.main_net.weights[0]),
            'Prediction': main_value,
            'Target': target_value,
            'TD error': td_error,
            'Elementwise Loss': element_wise_loss,
            'Loss': tf.reduce_mean(element_wise_loss),
        }
        return tf.reduce_sum(element_wise_loss)
def construct_nn(env, config, initialize=False):
    """Build and compile a fully connected Q-network for ``env``.

    The network has one ReLU dense layer per entry of ``config.layers``
    (optionally kernel-regularized and followed by batch normalization) and a
    linear output layer with ``env.action_space.n`` units. It is compiled with
    the Adam optimizer and mean squared error.

    When ``initialize`` is true the network is additionally fitted to targets
    derived from ``AAV(env.params).u`` on a 100 x 100 grid (80% of the grid
    points, sampled with a fixed seed), and diagnostic plots (fitted surfaces
    and the arg-max policy map) are shown.

    Args:
        env: Environment providing ``observation_space``, ``action_space``,
            ``w0``, ``MAX_LAMBDA``, ``MIN_ACCOUNT_BALANCE``, ``params``,
            ``observation`` and ``action``.
        config: Configuration object; reads ``layers``, ``regularizer``,
            ``regularizer_parameter``, ``batch_normalization`` and
            ``normalize_states``.
        initialize: Whether to pre-fit the network and show diagnostic plots.

    Returns:
        The compiled (and optionally pre-fitted) ``tf.keras.Sequential``.

    Raises:
        ValueError: If a hidden layer is requested and ``config.regularizer``
            is not one of ``None``, ``"l1"``, ``"l2"`` or ``"l1_l2"``
            (previously such a value silently dropped every hidden layer).
    """

    def make_kernel_regularizer():
        """Return a fresh kernel regularizer (or None) for one dense layer."""
        kind = config.regularizer
        if kind is None:
            return None
        if kind == "l1":
            return tf.keras.regularizers.l1(config.regularizer_parameter)
        if kind == "l2":
            return tf.keras.regularizers.l2(config.regularizer_parameter)
        if kind == "l1_l2":
            return tf.keras.regularizers.l1_l2(
                l1=config.regularizer_parameter,
                l2=config.regularizer_parameter)
        raise ValueError(
            f"Unknown regularizer {kind!r}; "
            "expected None, 'l1', 'l2' or 'l1_l2'.")

    target_net = tf.keras.Sequential()
    target_net.add(tf.keras.layers.Input(shape=env.observation_space.shape))
    for layer_size in config.layers:
        target_net.add(tf.keras.layers.Dense(
            layer_size,
            activation='relu',
            kernel_regularizer=make_kernel_regularizer()))
        if config.batch_normalization:
            target_net.add(tf.keras.layers.BatchNormalization())
    target_net.add(
        tf.keras.layers.Dense(env.action_space.n, activation='linear'))
    target_net.build()
    target_net.compile(optimizer='adam',
                       loss=tf.keras.losses.MeanSquaredError())

    if initialize:
        # Works only for one discrete action (two outputs: act 0 / act 1).
        aav = AAV(env.params)
        raw_w = np.linspace(0, env.w0, 100)
        raw_l = np.linspace(0, env.MAX_LAMBDA, 100)
        if config.normalize_states:
            # NOTE(review): the normalized lambda axis starts at the
            # observation of params.lambda0 while the raw axis used for the
            # targets starts at 0 -- confirm this pairing is intended.
            lowbounds = env.observation(
                np.array([env.params.lambda0, env.MIN_ACCOUNT_BALANCE]))
            highbounds = env.observation(
                np.array([env.MAX_LAMBDA, env.w0]))
            ws = np.linspace(lowbounds[1], highbounds[1], 100)
            ls = np.linspace(lowbounds[0], highbounds[0], 100)
        else:
            ws, ls = raw_w, raw_l

        # Targets: column 'target' for action 0, 'target2' for action 1.
        jump = env.action(1)
        features = []
        for i, w_raw in enumerate(raw_w):
            for j, l_raw in enumerate(raw_l):
                value_hold = -aav.u(l_raw, w_raw)
                value_act = -aav.u(l_raw + jump, w_raw) - jump * env.params.c
                features.append([ls[j], ws[i], value_hold, value_act])
        dataset = pd.DataFrame(features,
                               columns=['l', 'w', 'target', 'target2'])

        # Fit on a seeded 80% sample of the grid; the remaining rows are not
        # used (the original built a held-out split but never evaluated it).
        train_rows = dataset.sample(frac=0.8, random_state=0)
        train_inputs = train_rows[['l', 'w']].to_numpy()
        train_labels = train_rows[['target', 'target2']].to_numpy()
        callback = tf.keras.callbacks.EarlyStopping(monitor='loss',
                                                    patience=5)
        target_net.fit(train_inputs, train_labels, epochs=1000,
                       validation_split=0.1, shuffle=False, verbose=True,
                       callbacks=[callback])

        # Fitted surfaces on the training grid, predicted in one batch.
        # np.asarray accepts both tensors and ndarrays, whichever this Keras
        # version returns from predict_on_batch.
        ww, ll = np.meshgrid(ws, ls)
        grid_obs = np.column_stack([ll.ravel(), ww.ravel()])
        grid_pred = np.asarray(target_net.predict_on_batch(grid_obs))
        lattice_pred = grid_pred[:, 0].reshape(ww.shape)
        lattice_pred2 = grid_pred[:, 1].reshape(ww.shape)

        fig, ax = plt.subplots(ncols=2, nrows=1)
        CS = ax[0].contour(ww, ll, lattice_pred)
        ax[0].clabel(CS, inline=1, fontsize=10)
        ax[0].set_title('vf')
        CS = ax[1].contour(ww, ll, lattice_pred2)
        ax[1].clabel(CS, inline=1, fontsize=10)
        ax[1].set_title('vf')
        plt.show()

        # Arg-max policy map on a raw (w, lambda) grid. The plotted states
        # are the ones evaluated (the original indexed the training grid with
        # this grid's indices, so the axes did not match the inputs).
        w_points = 60
        l_points = 60
        l = np.linspace(0, 5, l_points)
        w = np.linspace(0, 200, w_points)
        ww, ll = np.meshgrid(w, l)
        policy_states = np.column_stack([ll.ravel(), ww.ravel()])
        if config.normalize_states:
            policy_obs = np.array(
                [env.observation(state) for state in policy_states])
        else:
            policy_obs = policy_states
        policy_q = np.asarray(target_net.predict_on_batch(policy_obs))
        z = np.argmax(policy_q, axis=1).reshape(ww.shape).astype(float)

        fig, ax = plt.subplots()
        cdict = {
            'red': ((0.0, 0.25, .25), (0.02, .59, .59), (1., 1., 1.)),
            'green': ((0.0, 0.0, 0.0), (0.02, .45, .45), (1., .97, .97)),
            'blue': ((0.0, 1.0, 1.0), (0.02, .75, .75), (1., 0.45, 0.45))
        }
        cm = m.colors.LinearSegmentedColormap('my_colormap', cdict, 1024)
        im = ax.pcolor(ww, ll, z, cmap=cm)
        fig.colorbar(im, ax=ax)
        fig.show()

    return target_net