def learn_with_selfplay(max_agents,
                        num_learn_steps,
                        num_learn_steps_pre_training,
                        num_eval_eps,
                        num_skip_steps=0,
                        model_name='dqn',
                        only_rule_based_op=False,
                        patience=5,
                        image_observations=True,
                        output_folder="output",
                        fine_tune_on=None,
                        opponent_pred_obs=False,
                        adversarial_training=None,
                        save_freq=None):
    """
    Train an agent with regular self-play. If checkpoints from previous training exist, training is resumed from them.

    :param max_agents: Stop after max_agents intermediate agents have been trained. An intermediate agent is saved whenever training
    successfully produces an improved agent.
    :param num_learn_steps: Number of frames / steps for every learning iteration
    :param num_learn_steps_pre_training: Number of frames / steps for pre-training on the rule-based agent
    :param num_eval_eps: Number of episodes for intermediate evaluation. Intermediate evaluation determines whether the trained agent
    improved compared to its previous version.
    :param num_skip_steps: Skip num_skip_steps frames, repeating the action from the previous step
    :param model_name: Name for saving the model. If checkpoints with this name already exist, training is continued from them. Checkpoints
    are saved as model_name{i}, where i is the training iteration.
    :param only_rule_based_op: If set to True, training is only performed against the rule-based agent.
    :param patience: Number of consecutive training rounds without improvement that are tolerated before stopping early
    :param image_observations: Use image instead of feature observations
    :param output_folder: Root folder for outputs
    :param fine_tune_on: If not None, train an adversarial policy against the victim specified by this string instead of performing
    regular self-play training
    :param opponent_pred_obs:
        If this is set to True, the opponent's predictions in the current state will be concatenated to the observations for the main
        agent. This was an attempt to create a stronger adversarial policy that could use this information; however, in our experiments
        this did not improve the adversarial policy.
    :param adversarial_training: If not None, perform adversarial training using FGSM during training.
    :param save_freq: If not None, save intermediate checkpoints during training with the given frequency
    :return:
    """
    eval_env, eval_env_rule_based, eval_op, train_env, train_env_rule_based = _init_envs(image_observations,
                                                                                         num_skip_steps,
                                                                                         opponent_pred_obs,
                                                                                         adversarial_training)

    # If fine tuning, load model to fine-tune from path
    if fine_tune_on is not None:
        path = Path(output_folder) / 'models' / fine_tune_on
        fine_tune_model = DQN.load(path)
        fine_tune_model.tensorboard_log = None
        if opponent_pred_obs:
            # We can't evaluate against agents that don't have a q_net, so we change eval_op to the original model that is
            # being fine-tuned against instead of the rule-based agent
            eval_op = fine_tune_model
            eval_env_rule_based.set_opponent(eval_op)
            eval_env_rule_based = OpponentPredictionObs(eval_env_rule_based)
            eval_env.set_opponent(eval_op)
            eval_env = OpponentPredictionObs(eval_env)
    else:
        fine_tune_model = None

    # Initialize first agent
    pre_train_agent = SimpleRuleBasedAgent(train_env_rule_based)
    previous_models = [pre_train_agent]

    # Load potentially saved previous models
    for opponent_id in range(1, max_agents):
        path = _make_model_path(output_folder, model_name, opponent_id)
        if os.path.isfile(path):
            model = DQN.load(path)
            previous_models.append(model)
        else:
            break

    # Initialize first round
    last_agent_id = len(previous_models) - 1
    prev_num_steps = 0
    patience_counter = 0
    tb_path = Path(output_folder) / "tb-log"
    if last_agent_id == 0:
        # main_model = A2C('MlpPolicy', policy_kwargs=dict(optimizer_class=RMSpropTFLike, optimizer_kwargs=dict(eps=1e-5)), env=train_env, verbose=0,
        #                 tensorboard_log="output/tb-log")
        # main_model = A2C('MlpPolicy', train_env, verbose=0, tensorboard_log="output/tb-log")  # , exploration_fraction=0.3)
        main_model = DQN('MlpPolicy', train_env_rule_based, verbose=0, tensorboard_log=tb_path)  # , exploration_fraction=0.3)
    else:
        main_model = copy.deepcopy(previous_models[last_agent_id])
        main_model.set_env(train_env)
        main_model.tensorboard_log = tb_path

    # Start training with self-play over several rounds
    opponent_id = last_agent_id
    while opponent_id < max_agents - 1:
        print(f"Running training round {opponent_id + 1}")
        if fine_tune_on is None:
            # Choose opponent based on setting
            if only_rule_based_op:
                current_train_env = train_env_rule_based
                # Use rule-based as opponent
                current_train_env.set_opponent(SimpleRuleBasedAgent(current_train_env))
            else:
                if opponent_id == 0:
                    current_train_env = train_env_rule_based
                else:
                    current_train_env = train_env
                # Take opponent from the previous version of the model
                current_train_env.set_opponent(previous_models[opponent_id])
        else:  # Use passed fine-tune agent as opponent
            current_train_env = train_env
            current_train_env.set_opponent(fine_tune_model)

        # Train the model
        current_train_env.set_opponent_right_side(True)

        chosen_n_steps = num_learn_steps_pre_training if opponent_id == 0 else num_learn_steps  # Iteration 0 is pre-training

        # In order to generate adversarial examples, the adversarial training wrapper needs a reference to the model that is
        # currently being trained
        if adversarial_training is not None:
            current_train_env.env.victim_model = main_model

        # Optionally add a callback to save intermediate checkpoints
        if save_freq is not None:
            checkpoint_callback = CheckpointCallback(save_freq=save_freq,
                                                     save_path=str(Path(output_folder) / 'intermediate'),
                                                     name_prefix=model_name + str(opponent_id + 1) + '_interm')
        else:
            checkpoint_callback = None

        # === LEARNING ===
        main_model.learn(total_timesteps=chosen_n_steps, tb_log_name=model_name, callback=checkpoint_callback)

        # Do evaluation for this training round
        eval_env_rule_based.set_opponent(eval_op)
        avg_round_reward, num_steps = evaluate(main_model, eval_env_rule_based, num_eps=num_eval_eps)
        print(model_name)
        print(f"Average round reward after training: {avg_round_reward}")
        print(f"Average number of steps per episode: {num_steps / num_eval_eps}")

        # Check if there was improvement
        if num_steps > prev_num_steps:  # Model improved compared to last
            print('Model improved')
            prev_num_steps = num_steps
            # Reset patience counter
            patience_counter = 0

            # Save the further trained model to disk
            main_model.save(_make_model_path(output_folder, model_name, opponent_id + 1))
            # Make a copy of the just saved model by loading it
            copy_of_model = DQN.load(_make_model_path(output_folder, model_name, opponent_id + 1))
            # Save the copy to the list
            previous_models.append(copy_of_model)

            # From here we continue training the same main_model against itself
            opponent_id += 1
        else:
            print('Model did not improve')
            patience_counter += 1
            # Do not save the model
            if patience_counter > patience:
                print('Stopping early due to patience')
                break
            # Because our model did not improve compared to the previous one, we reset our main_model to the previous one
            main_model = DQN.load(_make_model_path(output_folder, model_name, opponent_id))
            main_model.set_env(train_env)

            # Opponent does not change

    if not opponent_pred_obs:
        # Evaluate the last model against each of its previous iterations
        # evaluate_against_predecessors(previous_models, env_rule_based=eval_env_rule_based, env_normal=eval_env, num_eval_eps=num_eval_eps)
        pass  # Not useful right now
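
A minimal usage sketch for learn_with_selfplay, added here for illustration. The argument values below are assumptions rather than settings taken from the project, and the helpers referenced inside the function (_init_envs, _make_model_path, evaluate, the environment wrappers) must be importable for the call to run.

if __name__ == '__main__':
    # Hypothetical example call (all values are illustrative):
    # train up to 10 self-play iterations of a DQN agent on image observations,
    # evaluating each round on 50 episodes and checkpointing every 50k steps.
    learn_with_selfplay(max_agents=10,
                        num_learn_steps=100_000,
                        num_learn_steps_pre_training=200_000,
                        num_eval_eps=50,
                        num_skip_steps=2,
                        model_name='dqn_selfplay',
                        image_observations=True,
                        output_folder='output',
                        save_freq=50_000)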
Example #2
File: dqn.py  Project: lusinga/algo
# Imports assumed for this snippet (stable-baselines3); the original file's imports are not shown.
import gym
from stable_baselines3 import DQN
from stable_baselines3.dqn import MlpPolicy

game = 'Pong-ram-v0'  # example value; the original assignment is not shown in this snippet
# game = 'YarsRevenge-ram-v0'
# game = 'Zaxxon-v0'
# game = 'Zaxxon-ram-v0'

# env = gym.make('Pong-v0')
env = gym.make(game)

# save_file = 'dqn_pong';
save_file = 'dqn_' + game

print(env.action_space)
print(env.get_action_meanings())

model = DQN(MlpPolicy, env, verbose=1)
#model = DQN.load(save_file)
model.set_env(env)
# model = DQN(CnnPolicy, env, verbose=1)
model.learn(total_timesteps=50000, log_interval=10)
# model.save(save_file)

obs = env.reset()

score = 0
rewards_sum = 0

while True:
    # print(score)
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    score = score + 1
    rewards_sum += reward
    if done:
        # Report episode length and total reward once the episode ends
        print('Episode finished after {0} steps, total reward {1}'.format(score, rewards_sum))
        break
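
As an alternative to the manual rollout loop above, stable-baselines3 provides an evaluation helper. The short sketch below is a minimal example, assuming the model and env objects from this snippet; the episode count is arbitrary.

from stable_baselines3.common.evaluation import evaluate_policy

# Evaluate the trained model over 10 episodes (no rendering) and report mean/std reward
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, deterministic=True)
print('mean reward: {0:.1f} +/- {1:.1f}'.format(mean_reward, std_reward))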
Example #3
File: train.py  Project: gkswamy98/pillbox
def train_adril(env, n=0, balanced=False):
    num_trajs = 20
    expert_data = make_sa_dataset(env, max_trajs=num_trajs)
    n_expert = len(expert_data["obs"])
    expert_sa = np.concatenate(
        (expert_data["obs"], np.reshape(expert_data["acts"], (n_expert, -1))),
        axis=1)

    for i in range(0, n):
        venv = AdRILWrapper(gym.make(env))
        mean_rewards = []
        std_rewards = []
        # Create model
        if isinstance(venv.action_space, Discrete):
            model = DQN(SQLPolicy,
                        venv,
                        verbose=1,
                        policy_kwargs=dict(net_arch=[64, 64]),
                        learning_starts=1)
        else:
            model = SAC('MlpPolicy',
                        venv,
                        verbose=1,
                        policy_kwargs=dict(net_arch=[256, 256]),
                        ent_coef='auto',
                        learning_rate=linear_schedule(7.3e-4),
                        train_freq=64,
                        gradient_steps=64,
                        gamma=0.98,
                        tau=0.02)
        model.replay_buffer = AdRILReplayBuffer(model.buffer_size,
                                                model.observation_space,
                                                model.action_space,
                                                model.device,
                                                1,
                                                model.optimize_memory_usage,
                                                expert_data=expert_data,
                                                N_expert=num_trajs,
                                                balanced=balanced)
        if not balanced:
            for j in range(len(expert_sa)):
                obs = expert_data["obs"][j]
                act = expert_data["acts"][j]
                next_obs = expert_data["next_obs"][j]
                done = expert_data["dones"][j]
                model.replay_buffer.add(obs, next_obs, act, -1, done)
        for train_steps in range(400):
            # Train policy
            if train_steps > 0:
                if 'Bullet' in env:
                    model.learn(total_timesteps=1250, log_interval=1000)
                else:
                    model.learn(total_timesteps=25000, log_interval=1000)
                if train_steps % 1 == 0:  # written to support more complex update schemes
                    model.replay_buffer.set_iter(train_steps)
                    model.replay_buffer.set_n_learner(venv.num_trajs)

            # Evaluate policy
            if train_steps % 20 == 0:
                model.set_env(gym.make(env))
                mean_reward, std_reward = evaluate_policy(model,
                                                          model.env,
                                                          n_eval_episodes=10)
                mean_rewards.append(mean_reward)
                std_rewards.append(std_reward)
                print("{0} Steps: {1}".format(int(train_steps * 1250),
                                              mean_reward))
                np.savez(os.path.join("learners", env,
                                      "adril_rewards_{0}".format(i)),
                         means=mean_rewards,
                         stds=std_rewards)
            # Update env
            if train_steps > 0:
                if train_steps % 1 == 0:
                    venv.set_iter(train_steps + 1)
            model.set_env(venv)
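
A minimal sketch of how train_adril might be invoked, assuming the project-specific imports in train.py (AdRILWrapper, AdRILReplayBuffer, SQLPolicy, make_sa_dataset, linear_schedule) are available. The environment id and run count below are illustrative values, not ones taken from the repository.

if __name__ == '__main__':
    # Hypothetical call: three AdRIL runs on a Bullet continuous-control task,
    # using the unbalanced replay-buffer variant. Results are written under
    # learners/<env>/adril_rewards_<i>.npz by the function above.
    train_adril('HalfCheetahBulletEnv-v0', n=3, balanced=False)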