def run(self):
    # Discretize the raw collections environment into a small set of
    # action intensities.
    actions_bins = np.array([0, 0.1, 0.5, 1])
    n_actions = len(actions_bins)
    c_env = CollectionsEnv()
    env = DiscretizedActionWrapper(c_env, actions_bins)

    # Linearly anneal epsilon from 1.0 down to 0.05 over the warmup episodes.
    n_episodes = 1000
    warmup_episodes = 300
    epsilon = 1.0
    epsilon_final = 0.05
    eps_drop = (epsilon - epsilon_final) / warmup_episodes

    total_rewards = np.empty(n_episodes)

    # TensorBoard logging.
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/dqn/' + current_time
    summary_writer = tf.summary.create_file_writer(log_dir)

    for i in range(n_episodes):
        state = env.reset()
        done = False
        score = 0
        while not done:
            action, q_value = self.get_action(state, epsilon)
            next_state, reward, done, info = env.step(action)
            self.append_sample(state, action, reward, next_state, done)
            score += reward
            state = next_state

            # Train on a minibatch once the replay buffer has enough samples.
            if len(self.memory) > self.batch_size:
                self.update()

        # Sync the target network every 20 episodes.
        if i % 20 == 0:
            self.update_target()

        if epsilon > epsilon_final:
            epsilon = max(epsilon_final, epsilon - eps_drop)

        total_rewards[i] = score
        avg_rewards = total_rewards[max(0, i - 100):(i + 1)].mean()

        with summary_writer.as_default():
            tf.summary.scalar('episode reward', score, step=i)
            tf.summary.scalar('running avg reward(100)', avg_rewards, step=i)

        if i % 10 == 0:
            print("episode:", i, "episode reward:", score, "eps:", epsilon,
                  "avg reward (last 100):", avg_rewards)
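# Note: run() relies on the agent exposing get_action(state, epsilon), which
# returns an (action, q_value) pair. The repo's actual implementation is not
# shown in this section; below is a minimal, hypothetical epsilon-greedy
# sketch of what such a method could look like, assuming self.main_net maps a
# batch of states to per-action Q-values.
def get_action(self, state, epsilon):
    q_values = self.main_net(np.atleast_2d(state).astype(np.float32)).numpy()[0]
    if np.random.rand() < epsilon:
        # Explore: uniform random discrete action.
        action = np.random.randint(len(q_values))
    else:
        # Exploit: greedy action under the current Q estimates.
        action = int(np.argmax(q_values))
    return action, q_values[action]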
def setup_experiment(conf, name, extype='dqn', use_portfolio=False, experiment_name=None, seed=1):
    if use_portfolio:
        n_acc = 50
        portfolio = generate_portfolio(n_acc, seed=seed)
    else:
        portfolio = None

    params = Parameters()
    params.rho = 0.15

    actions_bins = np.array([0., 0.2, 0.5, 0.7, 1.0, 1.5])
    n_actions = len(actions_bins)

    # rep_dist = BetaRepayment(params, 0.9, 0.5, 10, MAX_ACCOUNT_BALANCE)
    rep_dist = UniformRepayment(params)

    c_env = CollectionsEnv(params=params, repayment_dist=rep_dist,
                           reward_shaping='continuous', randomize_start=True,
                           max_lambda=None,
                           starting_state=np.array([3, 200], dtype=np.float32))
    environment = DiscretizedActionWrapper(c_env, actions_bins)

    if extype == 'dqn':
        dqn = DQNAgent(environment, name, training=True, config=conf, initialize=False,
                       portfolio=portfolio, experiment_name=experiment_name)
    elif extype == 'bspline':
        dqn = DQNAgentPoly(environment, name, training=True, config=conf,
                           portfolio=portfolio, experiment_name=experiment_name)
    elif extype == 'dqnpenal':
        dqn = DQNAgentPenalized(environment, name, config=conf, training=True,
                                portfolio=portfolio, experiment_name=experiment_name)
    else:
        raise ValueError('Unsupported experiment type.')
    return dqn
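# Hypothetical usage sketch of setup_experiment. The names 'dqn_uniform' and
# 'baseline' are illustrative, and DefaultConfig is the config class used by
# the training script further down.
conf = DefaultConfig()
agent = setup_experiment(conf, name='dqn_uniform', extype='dqn',
                         use_portfolio=False, experiment_name='baseline', seed=1)
agent.run()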
        tf.summary.scalar('stddev', stddev)
        tf.summary.histogram('histogram', var)

    def save(self):
        # current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        path_model = os.path.join(self.model_dir, 'main_net.h5')
        path_actions = os.path.join(self.model_dir, 'action_bins.npy')
        path_memory_buffer = os.path.join(self.model_dir, 'buffer.pkl')
        self.main_net.save(path_model)
        self.memory.save(path_memory_buffer)
        np.save(path_actions, self.env.action_bins)

    def load(self, model_path):
        # self.main_net = tf.keras.models.load_model(os.path.join(model_path, 'main_net.h5'))
        # self.target_net = tf.keras.models.load_model(os.path.join(model_path, 'main_net.h5'))
        self.main_net.load_weights(os.path.join(model_path, 'main_net.h5'))
        self.target_net.load_weights(os.path.join(model_path, 'main_net.h5'))
        self.action_bins = np.load(os.path.join(model_path, 'action_bins.npy'))


if __name__ == '__main__':
    actions_bins = np.array([0, 1.0])
    n_actions = len(actions_bins)
    c_env = CollectionsEnv(reward_shaping='continuous', randomize_start=True, max_lambda=None)
    environment = DiscretizedActionWrapper(c_env, actions_bins)
    # environment = StateNormalization(environment)

    dqn = DQNAgentLattice(environment, 'DDQNLattice', training=True,
                          config=DefaultConfig(), initialize=False)
    dqn.run()
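# Hypothetical save/load round trip for the persistence helpers above:
# save() writes the main network, replay buffer, and action bins under
# self.model_dir; load() restores the weights into an agent whose networks
# have a matching architecture (load_weights requires this).
dqn.save()
restored = DQNAgentLattice(environment, 'DDQNLattice', training=False,
                           config=DefaultConfig(), initialize=False)
restored.load(dqn.model_dir)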
from learning.collections_env import CollectionsEnv
from learning.utils.wrappers import StateNormalization, DiscretizedActionWrapper

import numpy as np
from copy import deepcopy

action_bins = np.array([0, 1.0])
environment = CollectionsEnv(reward_shaping=False)
environment = DiscretizedActionWrapper(environment, action_bins)
# environment = StateNormalization(environment)

print(f"Resetting: {environment.reset()}")
print(f"Step: {environment.step(0)}")
print(f"Resetting: {environment.reset()}")

environment.env.starting_state = 2  # np.array([10, 100])
print(f"Setting start: {environment.env.starting_state}")
print(f"Resetting: {environment.reset()}")

# class ImuT:
#     def __init__(self, a):
#         self.a = np.array([0])
#         self.starting_a = self.a.copy()
#
#     def reset(self):
#         self.a = self.starting_a.copy()
#         return self.a
#
#     def step(self):