def main_DQN():
    env = AgarioEnv(render=RENDER,
                    speed_scale=SPEED_SCALE,
                    display_text=DISPLAY_TEXT,
                    grid_resolution=GRID_RESOLUTION)
    agent = DQNAgent(height=GRID_RESOLUTION,
                     width=GRID_RESOLUTION,
                     input_channels=2,
                     num_actions=ACTION_DISCRETIZATION,
                     loadpath='')
    # env.seed(41)
    # agent.seed(41)
    for episode in range(NUM_EPISODES):
        state = env.reset()
        done = False
        new_state = None
        reward = 0
        num_steps = 0
        while not done:
            raw_action = agent.get_action(state)
            action = agent.action_to_angle(raw_action)
            for _ in range(NUM_SKIP_FRAMES):
                if RENDER:
                    env.render()
                new_state, reward, done, _ = env.step(action)
                num_steps += 1
                # print(f'step = {num_steps}')
            if done or num_steps > MAX_STEPS:
                new_state = None
                done = True
            agent.memory.push(state, raw_action, new_state, reward)
            agent.optimize()
            if done:
                print(f'Episode {episode} done, max_mass = {state.mass}')
                agent.max_masses.append(state.mass)
                agent.print_final_stats()
            if num_steps % agent.TARGET_UPDATE == 0:
                # print(f'UPDATING TARGET')
                agent.target_net.load_state_dict(agent.policy_net.state_dict())
            state = new_state
        if episode % WEIGHTS_SAVE_EPISODE_STEP == 0:
            torch.save(
                agent.policy_net.state_dict(),
                f'DQN_weights/model_{episode}_{str(datetime.now()).replace(" ", "_")}_episodes.model'
            )
            np.savetxt(
                f'DQN_weights/model_{episode}_{str(datetime.now()).replace(" ", "_")}_episodes.performance',
                np.array(agent.max_masses))
    print(f'Complete')
    torch.save(
        agent.policy_net.state_dict(),
        f'model_{NUM_EPISODES}_{str(datetime.now()).replace(" ", "_")}_episodes.model'
    )
    np.savetxt(
        f'DQN_weights/model_{NUM_EPISODES}_{str(datetime.now()).replace(" ", "_")}_episodes.performance',
        np.array(agent.max_masses))
    agent.print_final_stats()
    env.close()
def setup(config: str, load_file: str) -> DQNAgent:
    if config:
        loader = AgentLoader(config, num_actions=num_actions, num_inputs=num_inputs)
        agent = loader.load()
    else:
        agent = DQNAgent(num_actions=num_actions, num_inputs=num_inputs)
    if load_file:
        print(f'Loading "{load_file}"...')
        agent.load(load_file)
    return agent
def start_training_dqn(is_prioritized):
    if is_prioritized:
        prio = "with_priority"
    else:
        prio = "no_priority"
    env = gym.make(hyperparams['environment'])
    state_spec = len(env.observation_space.sample())
    action_spec = env.action_space.n
    log_name = 'final_build' + prio
    log_dir = 'logs/acrobot/' + log_name
    log_writer = tf.summary.create_file_writer(log_dir)
    epsilon = hyperparams['epsilon']
    buffer = PrioritizedReplay(
        hyperparams['max_experiences']) if is_prioritized else UniformReplay(
            hyperparams['max_experiences'])
    agent = DQNAgent(hyperparams['hidden_layer_dqn'], state_spec, action_spec,
                     buffer, hyperparams['learning_rate_dqn'], is_prioritized)
    total_rewards = np.empty(hyperparams['episodes'])
    for episode in range(hyperparams['episodes']):
        episode_reward = 0
        epsilon = max(hyperparams['min_epsilon'], epsilon * hyperparams['decay'])
        done = False
        state = env.reset()
        while not done:
            action = agent.play_action(state, epsilon)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            buffer.add((state, action, reward, next_state, done))
            state = next_state
            if len(buffer.experiences) > hyperparams['min_experiences']:
                agent.train(hyperparams['gamma'], hyperparams['batch_size'])
        total_rewards[episode] = episode_reward
        avg_rewards = total_rewards[max(0, episode - 20):(episode + 1)].mean()
        env.reset()
        with log_writer.as_default():
            tf.summary.scalar('episode reward', episode_reward, step=episode)
            tf.summary.scalar('avg for 20 episodes', avg_rewards, step=episode)
    agent.network.save_weights('dqn_{}_network.h5'.format(prio))
    env.close()
def print_progress(agent: DQNAgent, data: dict):
    percent = data['percent']
    progress = '=' * int(percent)
    progress += '>'
    left = ' ' * (100 - percent)
    progress = f'{percent}% [{progress + left}]'
    reward, steps = data['stats']
    mean = round(reward.mean(), 1)
    std = round(reward.std(), 1)
    positive = reward[reward > 0].size
    total = reward.size
    steps = steps.sum()
    losses = data['losses']
    if total > 50:
        graph(reward, verbose=True)
        plt.savefig(f'figures/{run_id}_training.png')
    if len(losses) > 10:
        graph(losses.detach().numpy(), xlabel='Replays', ylabel='Loss', window=5)
        plt.savefig(f'figures/{run_id}_losses.png')
    # print(progress + f' μ: {mean}, σ: {std}; +{positive}/{total}, steps: {steps}', end='\r')
    # if percent % 5 != 0:
    #     return
    last100 = reward[-100:]
    last_mean = round(last100.mean(), 2)
    last_std = round(last100.std(), 1)
    verbose = data['verbose']
    if percent % 2 == 0 and last_mean > 200:
        print(' ' * 100, end='\r')
        if verbose:
            print('Last 100 episodes average over 200! ', end='')
        agent.save(f'{run_id}_{percent}p', str(round(last_mean, 0)))
    # rar = f'rar: {round(data["rar"], 5)}' if verbose else ''
    # Spaces at the end are to clean up the progress bar
    print(f'Total mean: {mean}, std: {std}; '
          f'Last 100 mean: {last_mean}, std: {last_std}; '
          f'Positive: {positive}/{total} '
          f'Steps: {steps} ',
          # rar,
          " " * 20)
    if verbose:
        if len(losses) > 1:
            mean = round(losses.mean().item(), 3)
            std = round(torch.std(losses).item(), 3)
            print(f'Recent Losses: {losses[-5:]}, mean: {mean}, std: {std}')
    print(progress, end='\r')
def main(num_episodes, render=False):
    # initialize gym environment and the agent
    # env = gym.make('SpaceInvaders-v0')
    env = gym.make('Breakout-v0')
    state = env.reset()
    state_shape = list(state.shape)
    state_shape[-1] = state_shape[-1] * 5
    agent = DQNAgent(state_shape, env.action_space.n)
    states = deque(maxlen=5)
    max_train_time = 800
    # Iterate the game
    for e in range(num_episodes):
        # reset state in the beginning of each game
        state = env.reset()
        for i in range(5):
            states.appendleft(state)
        # time_t represents each frame of the game
        num_random = 0
        total_reward = 0.
        for time_t in range(max_train_time):
            # turn this on if you want to render
            if render:
                env.render()
            # Decide action
            action = agent.act(states)
            if agent.acted_randomly:
                num_random += 1
            # Advance the game to the next frame based on the action.
            next_state, reward, done, info = env.step(action)
            total_reward += reward
            # Remember the previous state, action, reward, and done
            agent.remember(states.copy(), action, reward, next_state, done)
            # make next_state the new current state for the next frame.
            states.appendleft(next_state)
            # done becomes True when the game ends
            if done:
                # print the score and break out of the loop
                rand_perc = num_random / float(time_t + 1) * 100.  # Percentage of random actions.
                print("episode: {}/{}, training_time: {}, summed_reward: {}, random_actions: {}%, eps: {}"
                      .format(e, num_episodes, time_t, total_reward, rand_perc, agent.epsilon))
                # train the agent with the experience of the episode
                agent.replay(min(100, time_t))
                break
        # print("epsilon {}".format(agent.epsilon))
        if e % 1000 == 0:
            agent.save("./deep_q_model.h5")
            print("saved model")
def __init__(self):
    """Player implementation of dqn and random agents"""
    self.env = UnityEnvironment(
        file_name="../env/Banana_Linux_NoVis/Banana.x86_64")
    self.brain_name = self.env.brain_names[0]
    brain = self.env.brains[self.brain_name]
    # reset the environment
    env_info = self.env.reset(train_mode=False)[self.brain_name]
    # number of actions
    self.action_size = brain.vector_action_space_size
    # examine the state space
    state = env_info.vector_observations[0]
    state_size = len(state)
    self.agent = DQNAgent(state_size, self.action_size, seed=0)
    self.agent.local_network.load_state_dict(
        torch.load('../saved_models/dqn_banana_best.pth'))
def define_agent(self, width, height, num_actions):
    return DQNAgent(config=Config(num_actions=num_actions,
                                  encoder=OneHotEncoder(width, height),
                                  optimizer=AdamOptimizer(0.01),
                                  network=MLP(),
                                  policy=EpsilonGreedyPolicy(1, 0.01, 500),
                                  discount=0.95,
                                  capacity=100,
                                  batch_size=16))
def run_exp(cfg=None):
    logger = Logger(cfg)
    agent = DQNAgent(cfg)
    env = Env(cfg)
    trainer = Trainer(env, agent, cfg)
    cfg = cfg.exp
    n_training_steps = cfg.n_episodes // cfg.train_after
    global_step = 0
    state = env.reset()
    joint_angles = np.empty(cfg.n_episodes)
    for step in range(cfg.n_episodes):
        state = trainer.single_step(state)
        # agent training
        if global_step % cfg.train_after == (cfg.train_after - 1):
            print(f"step: {step}")
            print("Training agents")
            # fw model warmup phase of 2000 steps
            metrics_dict = agent.train(
                cfg.train_iv, cfg.train_fw,
                cfg.train_policy if global_step >= 0 else False)
            logger.log_metrics(metrics_dict, global_step)
            logger.log_all_network_weights(agent.joint_agents[0], step)
            agent.decrease_eps(n_training_steps)
        # video logging
        if global_step % cfg.video_after == 0:
            print("logging video")
            vis, debug0, debug1 = trainer.record_frames(debug_cams=True)
            logger.log_vid_debug_cams(vis, debug0, debug1, global_step)
        # distractor toggling
        if global_step % cfg.toggle_table_after == (cfg.toggle_table_after - 1):
            env.toggle_table()
        global_step += 1
        pos = env.get_joint_positions()[0]
        joint_angles[step] = pos
    joint_angles = np.degrees(-joint_angles)
    plt.hist(joint_angles, bins=20, range=(0, 170))
    plt.savefig(os.path.join("plots", "explored_angles.png"))
def on_progress(self, agent: DQNAgent, data):
    """
    After 1% of the total iterations is complete, the agent will call this function.
    This is an opportunity to decide if it is time to quit early.
    """
    percent: int = data['percent']
    reward, steps = data['stats']
    rar = data['rar']
    if len(reward) >= 100:
        last100 = reward[-100:]
        mean = np.round(last100.mean())
        if mean >= 200:
            print("Successfully completed goal")
            self.success = True
            self.exit_early = True
            agent.end_training_early()
        elif mean >= 50 and percent % 5 == 0:
            print("\nGood performance found, saving checkpoint")
            epoch = int(self.episodes * percent / 100)
            agent.save(f'{self.id}', f'{epoch}_{mean}')
    if self.verbose and percent % 10 == 0:
        # TODO: Print additional info
        print(f"\n{percent}% "
              f"\tTotal reward={round(reward.mean(), 3)} "
              f"steps={steps.sum()} "
              f"rar={round(rar, 3)}")
        # look at the last several episodes
        reward = reward[-self.percent_size:]
        print(f"\t\tRecent reward={round(reward.mean(), 3)}, "
              f"max={round(reward.max(), 3)}")
    if self.verbose:
        print(f'{percent}% ... ', end="")
    else:
        progress = '=' * int(percent)
        progress += '>'
        left = ' ' * (100 - percent)
        print(f'{percent}% [{progress + left}]', end='\r')
def define_agent(self, width, height, num_actions):
    return DQNAgent(
        config=Config(
            num_actions=num_actions,
            encoder=LayerEncoder(width, height, treasure_position=True),
            optimizer=AdamOptimizer(0.001),
            network=CNN(hidden_units=[128]),
            policy=EpsilonGreedyPolicy(1, 0.01, 50000),
            discount=0.95,
            capacity=10000,
            batch_size=8,
            target_sync=100,
            double_q=True))
def test_model(model, is_ac):
    env = gym.make(hyperparams['environment'])
    state_spec = len(env.observation_space.sample())
    action_spec = env.action_space.n
    buffer = None
    is_prioritized = False
    if is_ac:
        agent = ActorCriticAgent(hyperparams['hidden_layer_actor'],
                                 hyperparams['hidden_layer_critic'],
                                 state_spec, action_spec,
                                 hyperparams['learning_rate_actor'],
                                 hyperparams['learning_rate_critic'])
        agent.actor_network.load_weights(model)
    else:
        agent = DQNAgent(hyperparams['hidden_layer_dqn'], state_spec,
                         action_spec, buffer, hyperparams['learning_rate_dqn'],
                         is_prioritized)
        agent.network.load_weights(model)
    obs = env.reset()
    env.render()
    # Play 20 episodes
    for i in range(20):
        rewards = []
        while True:
            if is_ac:
                action = agent.play_action(obs)
            else:
                action = agent.play_action(obs, hyperparams['min_epsilon'])
            obs, reward, done, _ = env.step(action)
            env.render()
            rewards.append(reward)
            if done:
                print("Gathered {} reward".format(np.sum(rewards)))
                env.reset()
                break
    env.close()
                 batch_size=64,
                 num_batches=20,
                 starts_learning=5000,
                 discount=0.99,
                 target_freq=10,
                 verbose=True,
                 print_every=10)
'''

agent = DQNAgent(action_set=[0, 1, 2],
                 reward_function=mountain_car_reward_function,
                 feature_extractor=MountainCarIdentityFeature(),
                 hidden_dims=[50, 50],
                 learning_rate=5e-4,
                 buffer_size=50000,
                 batch_size=64,
                 num_batches=100,
                 starts_learning=5000,
                 final_epsilon=0.02,
                 discount=0.99,
                 target_freq=10,
                 verbose=True,
                 print_every=10)

_, _, rewards = live(agent=agent,
                     environment=env,
                     num_episodes=episodes,
                     max_timesteps=200,
                     verbose=True,
                     print_every=50)

np.save(os.path.join(reward_path, file_name), rewards)
class TestAgent(object):
    def __init__(self, shape, n_actions):
        self.n_actions = n_actions
        self.db = ReplayDB(shape, 100)

    def select_action(self, obs):
        return np.random.choice(self.n_actions)

    def update(self, s, a, r, t):
        self.db.insert(s, a, r, t)


def create_mlp(inputs, n_out):
    net = nn.input_data(placeholder=inputs)
    net = nn.fully_connected(net, 25, activation='relu')
    net = nn.dropout(net, 0.4)
    net = nn.fully_connected(net, 25)
    net = nn.dropout(net, 0.4)
    net = nn.fully_connected(net, n_out, activation='linear')
    return net


if __name__ == "__main__":
    env = gym.make('MountainCar-v0')
    n_actions = env.action_space.n
    agent = DQNAgent(create_mlp,
                     n_actions,
                     env.observation_space.shape,
                     min_replay_size=10000,
                     batch_size=64)
    exp = Experiment(agent, env)
    exp.run_epoch(1000000)
    print(agent.db.num_samples())
    print(agent.db.sample(10))
def main_DQN_plus_greedy():
    GREEDY_TOTAL_NUM_EPISODES = 1000
    GREEDY_NUM_EPISODES = GREEDY_TOTAL_NUM_EPISODES // 3
    env = AgarioEnv(render=RENDER,
                    speed_scale=SPEED_SCALE,
                    display_text=DISPLAY_TEXT,
                    grid_resolution=GRID_RESOLUTION)
    agent = DQNAgent(height=GRID_RESOLUTION,
                     width=GRID_RESOLUTION,
                     input_channels=2,
                     num_actions=ACTION_DISCRETIZATION,
                     loadpath='')
    greedy = Greedy()
    env.seed(41)
    agent.seed(41)
    for episode in range(GREEDY_TOTAL_NUM_EPISODES):
        state = env.reset()
        done = False
        new_state = None
        raw_action, action = None, None
        reward = 0
        num_steps = 0
        is_greedy_episode = episode < GREEDY_NUM_EPISODES
        while not done:
            if is_greedy_episode:
                action = greedy.get_action(state)
                raw_action = agent.angle_to_action(action)
                # print(f'angle: {action}, raw_action: {raw_action}')
            else:
                raw_action = agent.get_action(state)
                action = agent.action_to_angle(raw_action)
            for _ in range(NUM_SKIP_FRAMES):
                if RENDER:
                    env.render()
                new_state, reward, done, _ = env.step(action)
                num_steps += 1
                # print(f'step = {num_steps}')
            if done or num_steps > MAX_STEPS:
                new_state = None
                done = True
            agent.memory.push(state, raw_action, new_state, reward)
            agent.optimize()
            if done:
                print(
                    f'{"Greedy" if is_greedy_episode else "DQN"} episode done, max_mass: {state.mass}'
                )
                if not is_greedy_episode:
                    agent.max_masses.append(state.mass)
            if num_steps % agent.TARGET_UPDATE == 0:
                # print(f'UPDATING TARGET')
                agent.target_net.load_state_dict(agent.policy_net.state_dict())
            state = new_state
    print(f'Complete')
    torch.save(
        agent.policy_net.state_dict(),
        f'model_GREEDY_DQN_{NUM_EPISODES}_{str(datetime.now()).replace(" ", "_")}_episodes.model'
    )
    agent.print_final_stats()
    env.close()
def create_agent(self, config):
    from agents import DQNAgent
    agent = DQNAgent(num_actions=self.num_actions,
                     num_inputs=self.num_inputs,
                     config=config,
                     **config)
    self.agent = agent
    self.current_config = config
    return agent
        action = np.random.randint(4)
        for i in range(self.action_repeat):
            reward = self.environment.act(action)
            total_score += reward
            self.environment.update_screen()
        return total_score


sess = tf.InteractiveSession()
counter = Counter(7000000)
replay_memory = ReplayMemory(1000000)
dqn_agent = DQNAgent((84, 84, 4), NATURE, 4, replay_memory, counter, tf_session=sess)
agent = EpsilonAgent(dqn_agent, 4, counter)
agi = AtariGameInterface('Breakout.bin', agent, replay_memory, counter)

# Create a Tensorboard monitor and populate with the desired summaries
tensorboard_monitor = TensorboardMonitor('./log', sess, counter)
tensorboard_monitor.add_scalar_summary('score', 'per_game_summary')
tensorboard_monitor.add_scalar_summary('training_loss', 'training_summary')
for i in range(4):
    tensorboard_monitor.add_histogram_summary('Q%d_training' % i, 'training_summary')

checkpoint_monitor = CheckpointRecorder(dqn_agent.dqn, replay_memory, counter, './checkpoints', sess)
agi.add_listener(checkpoint_monitor)
agi.add_listener(tensorboard_monitor)
dqn_agent.add_listener(tensorboard_monitor)
def train(classifier):
    lg = global_logger["lg"]
    if opt.agent == 'policy':
        agent = PolicyAgent()
    elif opt.agent == 'dqn':
        agent = DQNAgent()
    elif opt.agent == 'dqn_target':
        agent = DQNTargetAgent()
    elif opt.agent == 'actor_critic':
        agent = ActorCriticAgent()
    elif opt.agent == 'random':
        agent = RandomAgent()
    else:
        agent = DQNAgent()

    start_episode = 0
    # load old model
    file_name = opt.load_model_name
    if file_name != "":
        old_model = load_external_model(file_name)
        start_episode = int(file_name.split('/')[1])
        agent.load_policynetwork(old_model)

    game = Game()
    model = classifier()
    for episode in range(start_episode, opt.episodes):
        model.reset()
        game.reboot(model)
        print('##>>>>>>> Episode {} of {} <<<<<<<<<##'.format(episode, opt.episodes))
        terminal = False
        num_of_zero = 0
        state = game.get_state(model)
        first_log = True
        cum_reward = 0
        while not terminal:
            action = agent.get_action(state)
            reward, next_state, terminal = game.feedback(action, model)
            if not terminal:
                agent.update(state, action, reward, next_state, terminal)
            cum_reward += reward
            if (action == 1):
                print("> State {:2} Action {:2} - reward {:.4f} - performance {:.4f}".format(
                    game.current_state, action, reward, game.performance))
                # print(state)
                step = 0 if first_log else game.queried_times
                timer(lg.scalar_summary, ("last_episode_performance", game.performance, step))
                first_log = False
            else:
                num_of_zero += 1
            del state
            state = next_state
            if terminal:
                agent.finish_episode(episode)
                break

        # Reset model
        model.reset()
        timer(model.train_model, (data["active"], opt.full_epochs))
        metrics = timer(model.performance_validate, (data["dev"],))
        lg.dict_scalar_summary('episode-validation', metrics, episode)
        lg.scalar_summary('episode-cum-reward', cum_reward, episode)
        lg.scalar_summary('performance', game.performance, episode)
        lg.scalar_summary('number-of-0-actions', num_of_zero, episode)
import gym
import numpy as np

from agents import QAgent, Agent, RandomAgent, DQNAgent

env = gym.make('LunarLander-v2')
num_episodes = 5000
print_every = 100  # logging interval (assumed value; not defined in the original snippet)

# LunarLander observations are a Box space, so the state size comes from .shape
agent = DQNAgent(env.observation_space.shape[0], env.action_space.n)

average_reward = []
for episode in range(num_episodes):
    rewards = []
    state = env.reset()
    while True:
        action = agent.act(state)
        next_state, reward, done, info = env.step(action)
        rewards.append(reward)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        if done:
            average_reward.append(np.sum(rewards))
            break

    # monitor progress
    if episode % print_every == 0:
        reward_last_100 = int(np.mean(average_reward[-99:]))
        learning_rate = agent.scheduler.get_lr().squeeze()
def main():
    parser = argparse.ArgumentParser(description="-----[Agent tester]-----")
    parser.add_argument(
        '--agent',
        default='dqn',
        help='Type of reinforcement agent. (dqn | policy | actor_critic)')
    parser.add_argument('--env',
                        default='CartPole-v0',
                        help='Type of reinforcement env.')
    params = parser.parse_args()

    env = gym.make(params.env)
    env = env.unwrapped
    opt.actions = env.action_space.n
    opt.state_size = env.observation_space.shape[0]
    opt.hidden_size = 8
    opt.batch_size_rl = 32
    opt.cuda = False
    opt.reward_clip = True
    opt.gamma = 0.99
    opt.data_sizes = [opt.state_size]
    opt.learning_rate_rl = 0.01

    from agents import DQNAgent, DQNTargetAgent, PolicyAgent, ActorCriticAgent, RandomAgent
    if params.agent == 'policy':
        agent = PolicyAgent()
    elif params.agent == 'dqn':
        agent = DQNAgent()
    elif params.agent == 'dqn_target':
        agent = DQNTargetAgent()
    elif params.agent == 'actor_critic':
        agent = ActorCriticAgent()
    elif params.agent == 'random':
        agent = RandomAgent()
    else:
        agent = DQNAgent()

    print('\nCollecting experience...')
    for i_episode in range(4000):
        state = env.reset()
        state = torch.FloatTensor(state).view(1, -1)
        score = 0
        done = False
        while not done:
            env.render()
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)

            # shaped reward: keep the cart near the center and the pole upright
            x, x_dot, theta, theta_dot = next_state
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            r = r1 + r2

            next_state = torch.FloatTensor(next_state).view(1, -1)
            if not done:
                agent.update(state, action, r, next_state, done)
            score += 1
            state = next_state
            if done:
                agent.finish_episode(i_episode)
                print('Ep: ', i_episode, '| Ep_r: ', round(score, 2))
                break
            state = next_state                   # roll over the state to next time step
            if done:
                break
        scores_window.append(score)              # save most recent score
        scores.append(score)                     # save most recent score
        eps = max(eps_end, eps_decay * eps)      # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 13.0 or i_episode == n_episodes:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), filename)
            break
    return agent, scores


agent = DQNAgent(QNetworkDuellingCNN, state_size, action_size, seed=0, ddqn=True)
agent, scores = dqn(agent, 'duellingCNN.pth')

agent2 = DQNAgent(QNetworkCNN, state_size, action_size, seed=0, ddqn=True)
agent2, scores = dqn(agent2, 'CNN.pth')
# filename = './data/' + cur + '_lag_' + str(lag) + '.csv'
# df = pd.read_csv(filename).reset_index(drop=True)

if __name__ == '__main__':
    dqn_model_path = './AUDUSD/agents/20190526-174236/dqn|0.pt'
    np.random.seed(321)
    torch.manual_seed(123)

    env = ForexEnv(mode='eval')
    eps = 23
    rewards = []
    agent = DQNAgent(action_set=[0, 1, 2],
                     reward_function=functools.partial(Forex_reward_function),
                     feature_extractor=ForexIdentityFeature(),
                     hidden_dims=[10, 10],
                     test_model_path=dqn_model_path)

    for e in range(eps):
        observation_history, action_history = test(agent=agent,
                                                   environment=env,
                                                   max_timesteps=3600,
                                                   n=e)
        r = torch.sum(
            agent.get_episode_reward(observation_history, action_history))
        print('reward %.5f' % r)
        rewards.append(r)
        # print(action_history)
        if e == eps - 1:
            print(agent.get_episode_reward(observation_history, action_history))
print("Double DQN {}, Duelling Architecture {}".format( args.double_dqn, args.duelling)) # instantiate appropriate agent if (args.double_dqn is True) & (args.duelling is True): agent = DDQNAgent(state_size=37, action_size=4, model=DuelingQNetwork, seed=0) agent_name = 'duel_ddqn' elif (args.double_dqn is True) & (args.duelling is False): agent = DDQNAgent(state_size=37, action_size=4, model=QNetwork, seed=0) agent_name = 'ddqn' elif (args.double_dqn is False) & (args.duelling is True): agent = DQNAgent(state_size=37, action_size=4, model=DuelingQNetwork, seed=0) agent_name = 'duel_dqn' else: agent = DQNAgent(state_size=37, action_size=4, model=QNetwork, seed=0) agent_name = 'dqn' # Run simulation with specified agent print('Running simulation with {} agent'.format(agent_name)) run(agent, agent_name) env.close()
# Create a Keras pseudo-Huber loss function (a smooth approximation of the Huber loss)
def hubert_loss(y_true, y_pred):
    err = y_pred - y_true
    return K.mean(K.sqrt(1 + K.square(err)) - 1, axis=-1)


# Instantiate the improved DQN agent
ragent = DQNAgent(
    name='FullDQNAgent-1',
    state_dim=env.observation_space.shape[0],
    action_dim=env.action_space.n,
    epsdecay=0.975,
    buffersize=500000,
    samplesize=32,
    minsamples=1000,
    gamma=0.99,
    update_target_freq=600,
    nnparams={
        # Improved DQN setting
        'hidden_layers': [(50, 'relu'), (40, 'relu')],
        'loss': hubert_loss,
        'optimizer': Adam(lr=0.0005),
        'target_network': True
    })

# Create an experiment with the LunarLander env and improved DQN agent for 500 train/test episodes
exp = Experiment(env, ragent, logdir="../log", verbose=True, num_episodes=500)

# Training trials
exp.run(testmode=False)

# Test trials
ckpt_dir = Path(
    "C:/Users/kevin/OneDrive/Dokumente/Coding/reinforcement_learning/models")
log_dir = Path(
    "C:/Users/kevin/OneDrive/Dokumente/Coding/reinforcement_learning/logs")

# Tensorboard summary writer for logging
writer = tensorboard.SummaryWriter(log_dir=log_dir)

# Create DQN Agent
agent = DQNAgent(gamma=0.99,
                 epsilon=1,
                 lr=0.0001,
                 input_dims=(env.observation_space.shape),
                 n_actions=env.action_space.n,
                 mem_size=50000,
                 eps_min=0.1,
                 batch_size=32,
                 replace=1000,
                 eps_dec=0.99999,
                 chkpt_dir=ckpt_dir,
                 algo='DQNAgent',
                 env_name='PongNoFrameskip-v4')

# load models if already saved
if load_checkpoint:
    agent.load_models()

n_steps = 0
scores, eps_history, steps_array = [], [], []

# Play games
# create environment
env = UnityEnvironment(file_name='./Banana.app')

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=False)[brain_name]

# create DQN agent
osize = len(env_info.vector_observations[0])
asize = brain.vector_action_space_size
seed = 0

agent = DQNAgent(osize, asize, seed, BUFFERSIZE, GAMMA, EPSILON, DECAY, EPMIN,
                 MINIBATCHSIZE, LEARNRATE, TAU)

# log scores
reward_log = []
avg_log = []
avg_window = collections.deque(maxlen=AVG_WINDOW)

# verbosity
VERBOSE = True

# Train the agent
for ep_count in range(1, MAX_EPISODES):
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]
import gym
from puckworld import PuckWorldEnv
from agents import DQNAgent
from utils import learning_curve

env = PuckWorldEnv()
agent = DQNAgent(env)

data = agent.learning(gamma=0.99,
                      epsilon=1,
                      decaying_epsilon=True,
                      alpha=1e-3,
                      max_episode_num=100,
                      display=False)

learning_curve(data, 2, 1,
               title="DQNAgent performance on PuckWorld",
               x_name="episodes",
               y_name="rewards of episode")
env = gym.make(environment)
action_space = env.action_space.n
observation_space = env.observation_space.shape

# create our own tf session to share across all the Keras/TensorFlow models we are using
sess = tf.Session()

# Our model to solve the MountainCar problem.
agent = DQNAgent(sess,
                 action_space,
                 observation_space,
                 learning_rate=learning_rate,
                 batch_size=batch_size,
                 replay_memory_size=replay_memory_size,
                 minimum_replay_memory=min_replay_memory,
                 epsilon_start=epsilon_start,
                 epsilon_end=min_epsilon,
                 discount=discount,
                 activation=activation,
                 optimizer=optimizer,
                 loss_function=loss_function,
                 dense_1=dense_1,
                 dense_2=dense_2)

# replay experience
replay_memory = agent.memory

# dynamic epsilon
if (dynamic_epsilon):
    average_last_hundred_rewards = np.full(100, -200, dtype=float)
    reward_array_index = 0
# create environment
env = UnityEnvironment(file_name='./Banana.app')

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=False)[brain_name]

# create DQN agent
osize = len(env_info.vector_observations[0])
asize = brain.vector_action_space_size
seed = 0

agent = DQNAgent(osize, asize, seed)

# load the weights from file
agent.Q.load_state_dict(torch.load('checkpoint.pth'))

# simulate smart agent
for i in range(NUM_SIMS):
    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations[0]
    for t in range(1, MAX_STEPS_PER_EPISODE):
        # get action from policy
        action = agent.get_action(state)
"host": host, "body_style": "donkey", "body_rgb": (128, 128, 128), "car_name": "42AI Potato Qarnot", "font_size": 100, "racer_name": "DDQN", "country": "FR", "bio": "Learning to drive w DDQN RL", "guid": str(uuid.uuid4()), "max_cte": 10, } if __name__ == "__main__": env = gym.make(env_name, conf=config_Simulator) S3 = S3(config.config_NeuralPlayer.config_Datasets.S3_bucket_name) agent = DQNAgent(config=config_Agent, S3=S3) agent.config.epsilon = 0.1 preprocessor = PreprocessingVannilla( config.config_NeuralPlayer.config_Preprocessing) env.reset() i = 0 state, reward, done, infos = env.step([0, 0.1]) while (i < 1000): processed_state = preprocessor.process(state) action = agent.get_action(processed_state) state, reward, done, infos = env.step(action) print(action, done, infos) i += 1
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=1.,
                              value_min=.1,
                              value_test=.05,
                              nb_steps=N_anneal)

# dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
#                processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000,
#                train_interval=4, delta_clip=1.)
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
               memory=memory,
               enable_double_dqn=True,
               processor=None,
               nb_steps_warmup=5 * episode_len,
               gamma=.90,
               target_model_update=100,
               train_interval=1,
               delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])
# dqn.fit(enviro, callbacks=None, nb_steps=1750000, log_interval=10000)

weights_filename = 'dqn_{}_weights.h5f'.format('PSF')
checkpoint_weights_filename = 'dqn_' + 'PSF' + '_weights_{step}.h5f'
log_filename = 'dqn_{}_log.json'.format('PSF')
callbacks = [
    ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)
]
callbacks += [FileLogger(log_filename, interval=100)]