Example #1
def start_training_dqn(is_prioritized):
    """Train a DQN agent on the configured gym environment, using a prioritized
    replay buffer if is_prioritized is True and a uniform one otherwise."""
    if is_prioritized:
        prio = "with_priority"
    else:
        prio = "no_priority"

    env = gym.make(hyperparams['environment'])
    state_spec = len(env.observation_space.sample())
    action_spec = env.action_space.n
    log_name = 'final_build' + prio
    log_dir = 'logs/acrobot/' + log_name

    log_writer = tf.summary.create_file_writer(log_dir)

    epsilon = hyperparams['epsilon']
    buffer = PrioritizedReplay(
        hyperparams['max_experiences']) if is_prioritized else UniformReplay(
            hyperparams['max_experiences'])

    agent = DQNAgent(hyperparams['hidden_layer_dqn'], state_spec, action_spec,
                     buffer, hyperparams['learning_rate_dqn'], is_prioritized)

    total_rewards = np.empty(hyperparams['episodes'])
    for episode in range(hyperparams['episodes']):
        episode_reward = 0
        epsilon = max(hyperparams['min_epsilon'],
                      epsilon * hyperparams['decay'])
        done = False
        state = env.reset()
        while not done:

            action = agent.play_action(state, epsilon)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward

            buffer.add((state, action, reward, next_state, done))
            state = next_state

            if len(buffer.experiences) > hyperparams['min_experiences']:
                agent.train(hyperparams['gamma'], hyperparams['batch_size'])

        total_rewards[episode] = episode_reward
        avg_rewards = total_rewards[max(0, episode - 20):(episode + 1)].mean()
        env.reset()

        with log_writer.as_default():
            tf.summary.scalar('episode reward', episode_reward, step=episode)
            tf.summary.scalar('avg for 20 episodes', avg_rewards, step=episode)
    agent.network.save_weights('dqn_{}_network.h5'.format(prio))
    env.close()
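
The function above reads every setting from a hyperparams dict that is not part of the snippet. A minimal sketch of the keys it accesses follows; the key names come from the code, while all values (and the environment id) are illustrative assumptions.

# Hypothetical hyperparams dict covering the keys read by start_training_dqn().
# Key names are taken from the snippet above; every value is an assumed placeholder.
hyperparams = {
    'environment': 'Acrobot-v1',     # gym id (the 'logs/acrobot/' dir hints at Acrobot)
    'hidden_layer_dqn': [128, 128],  # hidden layer sizes for the Q-network
    'learning_rate_dqn': 1e-3,
    'episodes': 500,
    'gamma': 0.99,                   # discount factor
    'batch_size': 32,
    'epsilon': 1.0,                  # initial exploration rate
    'min_epsilon': 0.05,
    'decay': 0.99,                   # per-episode multiplicative epsilon decay
    'max_experiences': 100000,       # replay buffer capacity
    'min_experiences': 1000,         # buffer warm-up size before training starts
}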
Example #2
def run_exp(cfg=None):
    """Run the exploration experiment: step the trainer, periodically train the
    agent, log metrics and videos, and finally plot the explored joint angles."""
    logger = Logger(cfg)
    agent = DQNAgent(cfg)
    env = Env(cfg)
    trainer = Trainer(env, agent, cfg)

    cfg = cfg.exp
    n_training_steps = cfg.n_episodes // cfg.train_after
    global_step = 0
    state = env.reset()
    joint_angles = np.empty(cfg.n_episodes)
    for step in range(cfg.n_episodes):
        state = trainer.single_step(state)
        # agent training
        if global_step % cfg.train_after == (cfg.train_after - 1):
            print(f"step: {step}")
            print("Training agents")
            # fw model warmup phase of 2000 steps: only train the policy afterwards
            metrics_dict = agent.train(
                cfg.train_iv, cfg.train_fw,
                cfg.train_policy if global_step >= 2000 else False)
            logger.log_metrics(metrics_dict, global_step)
            logger.log_all_network_weights(agent.joint_agents[0], step)
            agent.decrease_eps(n_training_steps)

        # video logging
        if global_step % cfg.video_after == 0:
            print("logging video")
            vis, debug0, debug1 = trainer.record_frames(debug_cams=True)
            logger.log_vid_debug_cams(vis, debug0, debug1, global_step)

        # distractor toggling
        if global_step % cfg.toggle_table_after == cfg.toggle_table_after - 1:
            env.toggle_table()

        global_step += 1
        pos = env.get_joint_positions()[0]
        joint_angles[step] = pos

    joint_angles = np.degrees(-joint_angles)
    plt.hist(joint_angles, bins=20, range=(0, 170))
    plt.savefig(os.path.join("plots", "explored_angles.png"))
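
run_exp() above is driven entirely by an attribute-style config whose exp block is not shown. A rough sketch of the fields the loop accesses follows; the field names come from the snippet, and every value is an assumption.

# Hypothetical stand-in for cfg.exp with the attributes read by run_exp().
# Field names are taken from the snippet; all values are assumed placeholders.
from types import SimpleNamespace

exp_cfg = SimpleNamespace(
    n_episodes=100000,        # total environment steps taken by the loop
    train_after=1000,         # train the agent every train_after steps
    train_iv=True,            # flags passed through to agent.train(...)
    train_fw=True,
    train_policy=True,
    video_after=10000,        # record and log a video every video_after steps
    toggle_table_after=5000,  # toggle the distractor table periodically
)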
Example #3
            if episode_reward > max_reward:
                max_reward = episode_reward
                logger.log_best(episode, max_reward)
                # save the model whenever a new max reward is reached
                name = str(episode_reward) + "_max_agent_.h5"
                logger.save_model(agent.model, name)
        # train the DQN agent only once enough replay memory has been collected,
        # otherwise continue the while-loop
        if len(replay_memory.memory) < min_replay_memory:
            continue

        agent.train()

    # dynamic epsilon
    if (dynamic_epsilon and epsilon > min_epsilon
            and replay_memory_size > min_replay_memory):
        epsilon *= epsilon_decay * epsilon_reward_scale
    elif (epsilon > min_epsilon
          and len(replay_memory.memory) > min_replay_memory):
        epsilon *= epsilon_decay

done = True
parameter_list = {}
parameter_list['end_epsilon'] = epsilon  # not right yet
parameter_list['max_reward'] = max_reward
parameter_list['done'] = done
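
The fragment above decays epsilon multiplicatively (epsilon *= epsilon_decay) once the replay memory holds enough samples. A small standalone sketch of that schedule, using assumed values for the decay constants, shows how many updates it takes to reach the floor.

# Standalone sketch of the multiplicative epsilon schedule from the fragment above.
# epsilon, min_epsilon and epsilon_decay are assumed values, not taken from the snippet.
import math

epsilon, min_epsilon, epsilon_decay = 1.0, 0.01, 0.995

# epsilon * epsilon_decay**n < min_epsilon
#   =>  n > log(min_epsilon / epsilon) / log(epsilon_decay)
n_updates = math.ceil(math.log(min_epsilon / epsilon) / math.log(epsilon_decay))
print(n_updates)  # 919 updates with these assumed values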
Example #4
        h4 = F.relu(self.l4(h3))
        output = self.l5(h4)
        return output


net = Linear()

print('Initializing the learner...')
learner = Learner(settings)
learner.load_net(net)

print('Initializing the agent framework...')
agent = DQNAgent(settings)

print('Training...')
agent.train(learner, memory, simulator)

print('Loading the net...')
learner = agent.load(settings['save_dir'] + '/learner_final.p')

ind_max = learner.val_rewards.index(max(learner.val_rewards))
ind_net = settings['initial_exploration'] + ind_max * settings['eval_every']
agent.load_net(learner, settings['save_dir'] + '/net_%d.p' % int(ind_net))

np.random.seed(settings["seed_general"])

print('Evaluating DQN agent...')
print('(reward, MSE loss, mean Q-value, episodes - NA, time)')
reward, MSE_loss, mean_Q_value, episodes, time, paths, actions, rewards = agent.evaluate(
    learner, simulator, 50000)
print(reward, MSE_loss, mean_Q_value, episodes, time)
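
This last example configures everything through a settings dict. A sketch of the keys referenced in the snippet follows; the key names come from the code, and the values are assumed placeholders (the framework likely reads additional keys elsewhere).

# Hypothetical settings dict covering the keys referenced in the snippet above.
settings = {
    'seed_general': 1234,           # seed for the evaluation run
    'save_dir': 'results/dqn',      # where learner_final.p and net_%d.p snapshots live
    'initial_exploration': 10000,   # steps before periodic evaluation snapshots begin
    'eval_every': 5000,             # snapshot interval used to map ind_max to a net file
}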