def load_agent(dim_state, num_action, step):
    # Load configuration.
    Cfg_filename = CFG_PATH + 'Cfg_' + TRAINING_DATE + '.json'
    with open(Cfg_filename, 'r') as cfg:
        agent_cfg = json.load(cfg)[2]
    # Disable epsilon-greedy exploration for validation.
    agent_cfg['epsilon'] = 0
    # Create agent.
    agent = DQNAgent(dim_state, num_action, agent_cfg)
    weights_agent = agent.critic.get_weights()
    num_weights = len(weights_agent)
    # Load weights.
    Train_folder = CFG_PATH + TRAINING_DATE + '/'
    list_w = []
    for k in range(num_weights):
        W = np.loadtxt(Train_folder + 'Weight_' + str(k) + '_Step_' + str(step) + '.dat')
        list_w.append(W)
    agent.critic.set_weights(list_w)
    return agent
def load_agent(filename, path_training, dim_state, num_action, step):
    # Load configuration.
    with open(filename, 'r') as cfg:
        agent_cfg = json.load(cfg)[2]
    # Disable epsilon-greedy exploration for validation.
    agent_cfg['epsilon'] = 0
    # Create agent.
    agent = DQNAgent(dim_state, num_action, agent_cfg)
    weights_agent = agent.critic.get_weights()
    num_weights = len(weights_agent)
    # Load weights.
    list_w = []
    for k in range(num_weights):
        W = np.loadtxt(path_training + 'Weight_' + str(k) + '_Step_' + str(step) + '.dat')
        list_w.append(W)
    agent.critic.set_weights(list_w)
    return agent
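# A minimal usage sketch for the parameterized loader above (not part of the
# original code): the config filename, weights folder, and step number are
# hypothetical placeholders, and dim_state / num_action are assumed to be
# defined by the surrounding validation script.
validation_agent = load_agent(filename='Cfg_example.json',        # hypothetical config file
                              path_training='./training_run/',    # hypothetical weights folder
                              dim_state=dim_state,
                              num_action=num_action,
                              step=1000)                          # hypothetical training step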
# Instantiating the agent
memory_size = 3000
state_size = len(state)
gamma = 0.96
epsilon_min = 0.01
batch_size = 64
action_size = len(SpreadTrading._actions)
train_interval = 10
learning_rate = 0.001

if not os.path.isfile("./model." + market + ".h5"):
    agent = DQNAgent(state_size=state_size,
                     action_size=action_size,
                     memory_size=memory_size,
                     episodes=episodes,
                     episode_length=episode_length,
                     train_interval=train_interval,
                     gamma=gamma,
                     learning_rate=learning_rate,
                     batch_size=batch_size,
                     epsilon_min=epsilon_min)

    # Warming up the agent
    for _ in range(memory_size):
        action = agent.act(state)
        next_state, reward, done, _ = environment.step(action)
        agent.observe(state, action, reward, next_state, done, warming_up=True)

    # Training the agent
    for ep in range(episodes):
        state = environment.reset()
        rew = np.float64(0)
        for _ in range(episode_length):
# Instantiating the agent
memory_size = 3000
state_size = len(state)
gamma = 0.96
epsilon_min = 0.01
batch_size = 64
action_size = len(environment._actions)
train_interval = 10
learning_rate = 0.001

agent = DQNAgent(state_size=state_size,
                 action_size=action_size,
                 memory_size=memory_size,
                 episodes=episodes,
                 episode_length=episode_length,
                 train_interval=train_interval,
                 gamma=gamma,
                 learning_rate=learning_rate,
                 batch_size=batch_size,
                 epsilon_min=epsilon_min)
agent.brain = model

done = False
actions = ['buy', 'hold', 'sell']
while not done:
    action = agent.act(state)
    state, _, done, info = environment.step(action)
    if 'status' in info and info['status'] == 'Closed plot':
        done = True

balance_X = api.balance()[market[3:]]
# 4. Configuration of agent in DQL. ------------------------------------
max_memory = 500
gamma = 0
per_step_eps = 0.5
epsilon = 1
epsilon_min = 0.1
epsilon_decay = epsilon_min ** (1 / (per_step_eps * num_episodes * num_steps))
learning_rate = 0.0001
agent_cfg = {'max_memory': max_memory, 'gamma': gamma, 'epsilon': epsilon,
             'epsilon_min': epsilon_min, 'epsilon_decay': epsilon_decay,
             'learning_rate': learning_rate}
# Initialization of agent.
agent = DQNAgent(dim_state, num_action, agent_cfg)
# ----------------------------------------------------------------------

# 5. Reward function parameters -----------------------------------------
psd_bands = [4, 10, 20, 100, 200]  # [Hz]. Frequency ranges for power calculation (check that switching to np.array does not change the result).
plv_bands = [[1, 19], [20, 200]]   # [Hz]. Frequency bands of interest for PLV computation.
method = 'LfPlvAmpExp'
constants = [1, -50, -3, -0.25]
reward_cfg = {'fs': fs, 'psd_bands': psd_bands, 'plv_bands': plv_bands,
              'constants': constants, 'method': method}
# Save all configurations.
save_configurations(CFG_FILENAME, bgtc_cfg, algorithm_cfg, agent_cfg, agent.critic, reward_cfg)
# ----------------------------------------------------------------------
# ----------------------------------------------------------------------
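# A small sanity-check sketch (not in the original script), assuming num_episodes
# and num_steps are defined earlier: with the geometric schedule above, multiplying
# epsilon by epsilon_decay once per step brings it from 1.0 down to roughly
# epsilon_min after per_step_eps * num_episodes * num_steps steps, i.e. after
# half of all training steps when per_step_eps = 0.5.
eps_check = epsilon
for _ in range(round(per_step_eps * num_episodes * num_steps)):
    eps_check = max(eps_check * epsilon_decay, epsilon_min)
print('epsilon after the decay phase:', eps_check)  # ~0.1 (= epsilon_min)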
import matplotlib.pyplot as plt

env = UnityEnvironment(file_name="Banana_Windows_x86_64/Banana.exe")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
observation_state_size = brain.vector_observation_space_size
action_space_size = brain.vector_action_space_size

epsilon = 1.0
eps_decay = 0.99
eps_min = 0.001
gamma = 0.99
training_interval = 4

from dqnagent import DQNAgent
agent = DQNAgent(observation_state_size, action_space_size)

scores = []
last_hundred_scores = deque(maxlen=100)
for episode in range(0, 1000):
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]            # get the current state
    score = 0                                          # initialize the score
    epsilon = max(epsilon * eps_decay, eps_min)
    t = 0
    while True:
        t += 1
        action = agent.select_action(state, epsilon)
        env_info = env.step(action)[brain_name]         # send the action to the environment
        next_state = env_info.vector_observations[0]    # get the next state
env = UnityEnvironment(file_name="Banana_Windows_x86_64/Banana.exe")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
observation_state_size = brain.vector_observation_space_size
action_space_size = brain.vector_action_space_size

epsilon = 0
eps_decay = 0.99
eps_min = 0.001
gamma = 0.99
training_interval = 4

from dqnagent import DQNAgent
agent = DQNAgent(observation_state_size, action_space_size)
agent.network1.load_state_dict(torch.load('checkpoint4.pth'))

for episode in range(0, 2):
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]             # get the current state
    score = 0                                           # initialize the score
    while True:
        action = agent.select_action(state, 0)
        env_info = env.step(action)[brain_name]          # send the action to the environment
        next_state = env_info.vector_observations[0]     # get the next state
        reward = env_info.rewards[0]                     # get the reward
        done = env_info.local_done[0]                    # see if episode has finished
        score += reward                                  # update the score
        state = next_state                               # roll over the state to next time step
dqn_online.to(device)
dqn_target.to(device)

# optimizer = torch.optim.RMSprop(dqn_online.parameters(), lr=LR, momentum=0.95, eps=0.01)  # paper used RMSprop
optimizer = torch.optim.Adam(dqn_online.parameters(), lr=LR)

if CKPT_ENABLED and os.path.exists(CKPT_FILENAME):
    progress = load_checkpoint(dqn_online, dqn_target, optimizer, CKPT_FILENAME)
else:
    progress = []

dqn_target.eval()
mem_buffer = ReplayMemory(MEMORY_SIZE, STATE_SHAPE)
loss_fn = torch.nn.SmoothL1Loss()  # Huber loss function
agent = DQNAgent(device, mem_buffer, dqn_online, dqn_target, optimizer, loss_fn,
                 GAMMA, BATCH_SIZE, UPDATE_ONLINE_INTERVAL, UPDATE_TARGET_INTERVAL)

# training phase
# adjust these hyperparameters as necessary
num_episodes = 5000                # number of episodes to train for
explore_phase_length = 50000       # number of steps without any exploitation (paper used 50k)
epsilon = 1.0                      # initial epsilon value (paper used 1.0)
epsilon_decrement_steps = 1000000  # how many steps to decrement epsilon to min value (paper used 1 million)
intermediate_epsilon = 0.1         # can be used to decay epsilon in two phases as recommended by OpenAI (set equal to min_epsilon to disable)
min_epsilon = 0.01                 # smallest possible value of epsilon (paper used 0.1 for DQN, 0.01 for DDQN)
epsilon_dec = (epsilon - intermediate_epsilon) / epsilon_decrement_steps
final_epsilon_decay = (intermediate_epsilon - min_epsilon) / epsilon_decrement_steps
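# A hedged sketch (not taken from the original training loop) of how the two
# per-step decrements computed above could be applied: epsilon first anneals
# linearly from 1.0 down to intermediate_epsilon, then more slowly from
# intermediate_epsilon down to min_epsilon. The helper name decay_epsilon is
# hypothetical.
def decay_epsilon(eps):
    # First phase: fast linear decay toward intermediate_epsilon.
    if eps > intermediate_epsilon:
        return max(eps - epsilon_dec, intermediate_epsilon)
    # Second phase: slower linear decay toward min_epsilon.
    return max(eps - final_epsilon_decay, min_epsilon)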
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "lib"))

import tictactoe.gym as gym
import tictactoe.agent as agent
import tictactoe.test_utils as utils
import tensorflow as tf
from tictactoe_dqn import TicTacToeDQN
from dqnagent import DQNAgent

env = gym.getEnv()
dqn1 = TicTacToeDQN()
dqn2 = TicTacToeDQN()
agent1 = DQNAgent(1, dqn1)
agent2 = DQNAgent(-1, dqn2)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
dqn1.set_session(sess)
dqn2.set_session(sess)
dqn1.load("./models/dqn1.ckpt")
dqn2.load("./models/dqn2.ckpt")

dual = agent.DualAgent(agent1, agent2)
print(utils.play(env, dual, render=True))
print(utils.test_player1(env, agent1))
print(utils.test_player2(env, agent2))
def main(_):
    flags = tf.app.flags.FLAGS

    # set up an environment
    env = gym.make(config.environment)
    if config.record:
        env = gym.wrappers.Monitor(env, config.record_path, force=True)

    # set up an agent
    if config.continuous_action:
        agent = A3CAgent(env, PiVNetwork, history_length=1, log_dir=config.log_dir)
    else:
        agent = DQNAgent(env, QNetwork,
                         minibatch_size_limit=32,
                         replay_memory_size=1000000,
                         history_length=1,
                         target_update_step=200,
                         discount_factor=0.99,
                         learning_rate=0.0025,
                         initial_exploration=1.0,
                         final_exploration=0.01,
                         final_exploration_frame=10000,
                         replay_start_size=100,
                         log_dir=config.log_dir)

    print('Observation space: ', env.observation_space, 'Action space: ', env.action_space)

    if config.restore_model_path:
        agent.restore_variables(config.restore_model_path)

    total_frames = 0
    # training
    for episode in range(1, config.max_episodes + 1):
        terminal = False
        total_reward = 0
        frames = 0
        while not terminal:
            if config.train:
                a, s, r_t, terminal, info = agent.act_and_train()
            else:
                a, s, r_t, terminal, info = agent.act()
            if config.render:
                env.render()
            total_reward += r_t
            frames += 1
            total_frames += 1
        if episode % config.interval_to_save_model == 0:
            agent.save_variables(episode, config.save_model_dir)
        # print('Episode: ', episode, ' Frames: ', total_frames, ' R: ', total_reward, ' Epsilon: ', info['epsilon'])
        print('Episode: ', episode, ' Frames: ', total_frames, ' R: ', total_reward)
        agent.write_summary(episode, total_reward)
        agent.new_episode()
def train():
    with tf.Session() as sess:
        DQNbrain = DQNAgent(
            sess,
            OUTPUT,
            INPUT,
            learning_rate=LEARNING_RATE,
            gamma=DISCOUNT,
            batch_size=MINIBATCH_SIZE,
            buffer_size=MEMORY_SIZE,
            target_update_step=TARGET_UPDATE,
            e_greedy=not LOAD_model,
            e_step=1000,
            gradient_norm=None,
        )

        if LOAD_model:
            DQNbrain.load_model(tf.train.latest_checkpoint(path))
        else:
            sess.run(tf.global_variables_initializer())

        all_rewards = []
        frame_rewards = []
        loss_list = []
        loss_frame = []
        recent_rlist = deque(maxlen=15)
        recent_rlist.append(0)
        episode, epoch, frame = 0, 0, 0
        start = timer()

        env.env.masspole = 0.05
        env.env.length = 2.
        # env.env.force_mag = 10.

        while np.mean(recent_rlist) < 499:
            episode += 1
            rall, count = 0, 0
            done = False
            s = env.reset()

            while not done:
                if RENDER:
                    env.render()
                frame += 1
                count += 1

                action, actions_value = DQNbrain.choose_action(s)
                s_, r, done, l = env.step(action)

                if done and count >= 500:
                    reward = 1
                elif done and count < 500:
                    reward = -10
                else:
                    reward = 0

                DQNbrain.memory_add(s, float(action), reward, s_, int(done))
                s = s_
                rall += r

                if frame > TRAIN_START and TRAIN:
                    loss = DQNbrain.learn()
                    loss_list.append(loss)
                    loss_frame.append(frame)

            recent_rlist.append(rall)
            all_rewards.append(rall)
            frame_rewards.append(frame)
            print("Episode:{} | Frames:{} | Reward:{} | Recent reward:{}".format(
                episode, frame, rall, np.mean(recent_rlist)))

        if os.path.isdir(path):
            shutil.rmtree(path)
        os.mkdir(path)
        ckpt_path = os.path.join(path, 'DQN.ckpt')
        if SAVE_model:
            DQNbrain.save_model(ckpt_path)

        plt.figure(figsize=(10, 8))
        plt.subplot(211)
        plt.title('Episode %s. Recent_reward: %s. Time: %s' % (
            len(all_rewards), np.mean(recent_rlist),
            timedelta(seconds=int(timer() - start))))
        plt.plot(frame_rewards, all_rewards)
        plt.ylim(0, 510)
        plt.subplot(212)
        plt.title('Loss')
        plt.plot(loss_frame, loss_list)
        # plt.ylim(0, 20)
        plt.show()
        plt.close()
def test():
    with tf.Session() as sess:
        DQNbrain = DQNAgent(
            sess,
            OUTPUT,
            INPUT,
            learning_rate=LEARNING_RATE,
            gamma=DISCOUNT,
            batch_size=MINIBATCH_SIZE,
            buffer_size=MEMORY_SIZE,
            target_update_step=TARGET_UPDATE,
            e_greedy=not LOAD_model,
            e_step=1000,
            gradient_norm=None,
        )
        DQNbrain.load_model(tf.train.latest_checkpoint(path))

        masspole_list = np.arange(0.01, 0.21, 0.025)
        length_list = np.arange(0.5, 3, 0.25)
        performance_mtx = np.zeros([masspole_list.shape[0], length_list.shape[0]])

        for im in range(masspole_list.shape[0]):
            for il in range(length_list.shape[0]):
                env.env.masspole = masspole_list[im]
                env.env.length = length_list[il]

                all_rewards = []
                for episode in range(5):
                    rall, count = 0, 0
                    done = False
                    s = env.reset()
                    while not done:
                        if RENDER:
                            env.render()
                        action, actions_value = DQNbrain.choose_action(s)
                        s_, r, done, _ = env.step(action)
                        s = s_
                        rall += r
                    all_rewards.append(rall)
                    print("Episode:{} | Reward:{} ".format(episode, rall))
                performance_mtx[im, il] = np.mean(all_rewards)

        fig, ax = plt.subplots()
        ims = ax.imshow(performance_mtx, cmap=cm.gray, interpolation=None, vmin=0, vmax=500)
        ax.set_xticks(np.arange(0, length_list.shape[0], length_list.shape[0] - 1))
        ax.set_xticklabels(['0.5', '3'])
        ax.set_yticks(np.arange(0, masspole_list.shape[0], masspole_list.shape[0] - 1))
        ax.set_yticklabels(['0.01', '0.20'])
        ax.set_xlabel('Pole length')
        ax.set_ylabel('Pole mass')
        ax.set_title('Robustness test: DQN')
        fig.colorbar(ims, ax=ax)
        plt.show()
        plt.close()
configs = [
    pong_dddqn_config0, pong_dddqn_config1, pong_dddqn_config2, pong_dddqn_config3,
    pong_dddqn_config4, pong_dddqn_config5, pong_dddqn_config6, pong_dddqn_config7
]

env = make_atari_deepmind(env_name, skip=4)

import argparse
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('index', metavar='N', type=int, nargs='+', help='an index')
args = parser.parse_args()
print('selected index:', args.index)
print(configs[args.index[0]])

agent = DQNAgent(env, sess, env_name, config=configs[args.index[0]])
env.reset()
agent.train()

import wrappers
# print(env.unwrapped.get_action_meanings())

def evaluate(env, t_max=10000):
    rewards = []
    env._max_episode_steps = 9999
    print('reset')
    s = env.reset()
    reward = 0
    for it in range(t_max):
        qvalues = agent.get_qvalues([s])