def record(filename):
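    """Roll out one episode with a fixed action (0) and pickle the recorded states."""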
    env = DotaEnvironment()

    state = env.reset()
    states = [transform_into_pair(state)]
    done = False
    while not done:
        next_state, reward, done = env.execute(action=0)
        states.append(transform_into_pair(next_state))

    with open(filename, 'wb') as output_file:
        pickle.dump(states, output_file)
Example #2
def main(args):
    # configure logger, disable logging in child MPI processes (with rank > 0)
    np.set_printoptions(precision=3)

    arg_parser = common_arg_parser()
    arg_parser.add_argument('--id',
                            help='name of the experiment for saving',
                            type=str,
                            default=None)
    arg_parser.add_argument('--config',
                            help='path to the algorithm config',
                            type=str,
                            default=None)
    args, unknown_args = arg_parser.parse_known_args(args)
    extra_args = parse_cmdline_kwargs(unknown_args)

    if args.id is None:
        print('Please specify the experiment name via --id')
        exit(0)

    if args.config is None:
        print('Please specify the path to the algorithm config via --config')
        exit(0)

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        logger.configure()
    else:
        logger.configure(format_strs=[])
        rank = MPI.COMM_WORLD.Get_rank()

    model, env = train(args, extra_args)

    if args.save_path is not None and rank == 0:
        save_path = osp.expanduser(args.save_path)
        model.save(save_path)

    if args.play:
        logger.log("Running trained model")
        env = DotaEnvironment()
        obs = env.reset()

        def initialize_placeholders(nlstm=128, **kwargs):
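            # Zero LSTM state (2 * nlstm per env) and a done mask, fed to model.step as S and M.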
            return np.zeros((args.num_env or 1, 2 * nlstm)), np.zeros((1))

        state, dones = initialize_placeholders(**extra_args)
        while True:
            actions, _, state, _ = model.step(obs, S=state, M=dones)
            obs, _, done, _ = env.step(actions)
            env.render()
            done = done.any() if isinstance(done, np.ndarray) else done

            if done:
                obs = env.reset()
        env.close()
Example #3
def main():
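    """Parse the experiment name, build the Q and target estimators, and run deep Q-learning on DotaEnvironment."""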
    parser = argparse.ArgumentParser(description='Trains the agent by DQN')
    parser.add_argument('experiment', help='specifies the experiment name')
    args = parser.parse_args()

    env = DotaEnvironment()

    # Where we save our checkpoints and graphs
    experiment_dir = os.path.join(os.path.abspath("./experiments/"),
                                  args.experiment)

    tf.reset_default_graph()
    # Create a global step variable
    global_step = tf.Variable(0, name="global_step", trainable=False)

    # Create estimators
    q_estimator = Estimator(STATE_SPACE,
                            ACTION_SPACE,
                            scope="q",
                            summaries_dir=experiment_dir)
    target_estimator = Estimator(STATE_SPACE, ACTION_SPACE, scope="target_q")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        deep_q_learning(sess=sess,
                        env=env,
                        q_estimator=q_estimator,
                        target_estimator=target_estimator,
                        experiment_dir=experiment_dir,
                        num_steps=200000,
                        replay_memory_size=10000,
                        epsilon_decay_steps=1,
                        epsilon_start=0.1,
                        epsilon_end=0.1,
                        update_target_estimator_every=1000,
                        update_q_values_every=4,
                        batch_size=32,
                        restore=False)

    env.close()
Example #4
def record(filename):
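    """Record (state, info) pairs while attacking creeps, drop consecutive duplicate frames, and pickle the result."""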
    env = DotaEnvironment()

    env.reset()
    state_action_pairs = []
    done = False
    while not done:
        pairs = env.step(action=ATTACK_CREEP)
        for _, (state, _, done, info) in pairs:
            state_action_pairs.append((state, info))
    print('Frames recorded:', len(state_action_pairs))

    filtered = []
    last_state = None
    for state, info in state_action_pairs:
        if last_state is not None and np.linalg.norm(last_state - state) == 0:
            continue
        last_state = state
        filtered.append((state, info))
    print('After filtering:', len(filtered))

    with open(filename, 'wb') as output_file:
        pickle.dump(filtered, output_file)
Example #5
def train(args, extra_args):
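    """Assemble the default DQN hyperparameters, apply any command-line overrides, and run learn() on DotaEnvironment."""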
    env_type = 'steam'
    env_id = 'dota2'
    print('env_type: {}'.format(env_type))

    seed = args.seed

    alg_kwargs = dict(network=models.mlp(num_hidden=128, num_layers=1),
                      lr=1e-3,
                      buffer_size=10000,
                      total_timesteps=500000,
                      exploration_fraction=1.0,
                      exploration_initial_eps=0.1,
                      exploration_final_eps=0.1,
                      train_freq=4,
                      target_network_update_freq=1000,
                      gamma=0.999,
                      batch_size=32,
                      prioritized_replay=True,
                      prioritized_replay_alpha=0.6,
                      experiment_name=args.exp_name,
                      dueling=True)
    alg_kwargs.update(extra_args)

    env = DotaEnvironment()

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    pool_size = multiprocessing.cpu_count()
    with multiprocessing.Pool(processes=pool_size) as pool:
        model = learn(env=env, seed=seed, pool=pool, **alg_kwargs)

    return model, env
Example #6
def main():
    # Check gradient implementation

    n_itrs = 10000
    env = DotaEnvironment()
    rng = np.random.RandomState(42)
    timestep_limit = 10000
    learning_rate = 0.1
    discount = 0.99

    batch_size = 100
    # Initialize parameters
    theta = rng.normal(scale=0.2,
                       size=(env.action_space[0],
                             env.observation_space[0] + 1))

    # Store baselines for each time step.
    baselines = np.zeros(timestep_limit)

    # Policy training loop
    for itr in range(n_itrs):
        # Collect trajectory loop
        n_samples = 0
        grad = np.zeros_like(theta)
        episode_rewards = []

        # Store cumulative returns for each time step
        all_returns = [[] for _ in range(timestep_limit)]

        all_observations = []
        all_actions = []

        while n_samples < batch_size:
            observations = []
            actions = []
            rewards = []
            ob = env.reset()
            done = False
            # Collect a new trajectory
            print('collecting trajectory: {}/{} samples'.format(n_samples, batch_size))
            while not done:
                action = point_get_action(theta, ob, rng=rng)
                next_ob, rew, done = env.step(action)
                observations.append(ob)
                actions.append(action)
                rewards.append(rew)
                ob = next_ob
                n_samples += 1
            # Go back in time to compute returns and accumulate gradient
            # Compute the gradient along this trajectory
            R = 0.
            for t in reversed(range(len(observations))):

                def compute_update(discount, R_tplus1, theta, s_t, a_t, r_t,
                                   b_t, get_grad_logp_action):
                    """
                    :param discount: A scalar
                    :param R_tplus1: A scalar
                    :param theta: A matrix of size |A| * (|S|+1)
                    :param s_t: A vector of size |S|
                    :param a_t: Either a vector of size |A| or an integer, depending on the environment
                    :param r_t: A scalar
                    :param b_t: A scalar
                    :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a
                    matrix of size |A| * (|S|+1) )
                    :return: A tuple, consisting of a scalar and a matrix of size |A| * (|S|+1)
                    """
                    R_t = discount * R_tplus1 + r_t
                    A_t = R_t - b_t
                    pg_theta = get_grad_logp_action(theta, s_t, a_t) * A_t

                    return R_t, pg_theta

                R, grad_t = compute_update(
                    discount=discount,
                    R_tplus1=R,
                    theta=theta,
                    s_t=observations[t],
                    a_t=actions[t],
                    r_t=rewards[t],
                    b_t=baselines[t],
                    get_grad_logp_action=point_get_grad_logp_action)
                all_returns[t].append(R)
                grad += grad_t

            episode_rewards.append(np.sum(rewards))
            all_observations.extend(observations)
            all_actions.extend(actions)

        # Recompute the per-timestep baseline as the mean return observed at that step
        baselines = np.zeros(timestep_limit)
        for t in range(timestep_limit):
            if all_returns[t]:
                baselines[t] = np.mean(all_returns[t])

        # Roughly normalize the gradient
        grad = grad / (np.linalg.norm(grad) + 1e-8)

        theta += learning_rate * grad

        print("Iteration: %d AverageReturn: %.2f |theta|_2: %.2f" %
              (itr, np.mean(episode_rewards), np.linalg.norm(theta)))
Example #7
#!/usr/bin/env python3

from dotaenv import DotaEnvironment

import numpy as np

from tensorforce.agents import TRPOAgent
from tensorforce.execution import Runner
import os

# Create an environment
env = DotaEnvironment()

network_spec = [
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
]

agent = TRPOAgent(
    actions=env.actions,
    states=env.states,
    discount=0.99,
    network=network_spec,
)
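
# The example stops right after constructing the agent; the imported Runner is
# presumably used further down in the original script. Below is a minimal
# hand-rolled interaction sketch, not the original training code: it assumes
# the tensorforce-0.4-style agent.act()/agent.observe() interface and the
# env.execute() -> (state, reward, done) signature seen in Example #1.
state = env.reset()
for _ in range(1000):
    action = agent.act(states=state)                   # query the TRPO policy
    state, reward, done = env.execute(action=action)   # step the Dota 2 environment
    agent.observe(terminal=done, reward=reward)        # buffer the transition; TRPO updates once its batch is full
    if done:
        state = env.reset()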

Example #8
def do_agent_exploration(updates_queue: multiprocessing.Queue,
                         q_func_vars_trained_queue: multiprocessing.Queue,
                         network, seed, config, lr, total_timesteps,
                         learning_starts, buffer_size, exploration_fraction,
                         exploration_initial_eps, exploration_final_eps,
                         train_freq, batch_size, print_freq, checkpoint_freq,
                         gamma, target_network_update_freq, prioritized_replay,
                         prioritized_replay_alpha, prioritized_replay_beta0,
                         prioritized_replay_beta_iters, prioritized_replay_eps,
                         experiment_name, load_path, network_kwargs):
    env = DotaEnvironment()

    sess = get_session()
    set_global_seeds(seed)

    q_func = build_q_func(network, **network_kwargs)

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, _, _, debug = deepq.build_train(
        scope='deepq_act',
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        total_timesteps),
                                 initial_p=exploration_initial_eps,
                                 final_p=exploration_final_eps)

    U.initialize()

    reward_shaper = ActionAdviceRewardShaper(config=config)
    reward_shaper.load()
    reward_shaper.generate_merged_demo()

    full_exp_name = '{}-{}'.format(date.today().strftime('%Y%m%d'),
                                   experiment_name)
    experiment_dir = os.path.join('experiments', full_exp_name)
    os.makedirs(experiment_dir, exist_ok=True)

    summary_dir = os.path.join(experiment_dir, 'summaries')
    os.makedirs(summary_dir, exist_ok=True)
    summary_writer = tf.summary.FileWriter(summary_dir)
    checkpoint_dir = os.path.join(experiment_dir, 'checkpoints')
    os.makedirs(checkpoint_dir, exist_ok=True)
    stats_dir = os.path.join(experiment_dir, 'stats')
    os.makedirs(stats_dir, exist_ok=True)

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_dir or td

        os.makedirs(td, exist_ok=True)
        model_file = os.path.join(td, "best_model")
        model_saved = False
        saved_mean_reward = None

        # if os.path.exists(model_file):
        #     print('Model is loading')
        #     load_variables(model_file)
        #     logger.log('Loaded model from {}'.format(model_file))
        #     model_saved = True
        # elif load_path is not None:
        #     load_variables(load_path)
        #     logger.log('Loaded model from {}'.format(load_path))

        def synchronize_q_func_vars():
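            """Request the learner's latest Q-network weights and copy them into this process's acting network."""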
            updates_queue.put(
                UpdateMessage(UPDATE_STATUS_SEND_WEIGHTS, None, None))
            q_func_vars_trained = q_func_vars_trained_queue.get()
            update_q_func_expr = []
            for var, var_trained in zip(debug['q_func_vars'],
                                        q_func_vars_trained):
                update_q_func_expr.append(var.assign(var_trained))
            update_q_func_expr = tf.group(*update_q_func_expr)
            sess.run(update_q_func_expr)

        synchronize_q_func_vars()

        episode_rewards = []
        act_step_t = 0
        while act_step_t < total_timesteps:
            # Reset the environment
            obs = env.reset()
            obs = StatePreprocessor.process(obs)
            episode_rewards.append(0.0)
            done = False
            # Demo preservation variables
            demo_picked = 0
            demo_picked_step = 0
            # Demo switching statistics
            demo_switching_stats = [(0, 0)]
            # Sample the episode until it is completed
            act_started_step_t = act_step_t
            while not done:
                # Take action and update exploration to the newest value
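                # Per-action potentials from the demo-based reward shaper, plus the index of the demo each potential comes from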
                biases, demo_indexes = reward_shaper.get_action_potentials_with_indexes(
                    obs, act_step_t)
                update_eps = exploration.value(act_step_t)
                actions, is_randoms = act(np.array(obs)[None],
                                          biases,
                                          update_eps=update_eps)
                action, is_random = actions[0], is_randoms[0]
                if not is_random:
                    bias_demo = demo_indexes[action]
                    if bias_demo != demo_switching_stats[-1][1]:
                        demo_switching_stats.append(
                            (act_step_t - act_started_step_t, bias_demo))
                    if bias_demo != 0 and demo_picked == 0:
                        demo_picked = bias_demo
                        demo_picked_step = act_step_t + 1
                pairs = env.step(action)
                action, (new_obs, rew, done, _) = pairs[-1]
                logger.log(
                    f'{act_step_t}/{total_timesteps} obs {obs} action {action}'
                )

                # Compute stats on the real reward but learn from the normalized version
                episode_rewards[-1] += rew
                rew = np.sign(rew) * np.log(1 + np.abs(rew))
                new_obs = StatePreprocessor.process(new_obs)

                if len(new_obs) == 0:
                    done = True
                else:
                    transition = (obs, action, rew, new_obs, float(done),
                                  act_step_t)
                    obs = new_obs
                    act_step_t += 1
                    if act_step_t - demo_picked_step >= MIN_STEPS_TO_FOLLOW_DEMO_FOR:
                        demo_picked = 0
                    reward_shaper.set_demo_picked(act_step_t, demo_picked)
                    updates_queue.put(
                        UpdateMessage(UPDATE_STATUS_CONTINUE, transition,
                                      demo_picked))
            # Post episode logging
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="rewards",
                                 simple_value=episode_rewards[-1])
            ])
            summary_writer.add_summary(summary, act_step_t)
            summary = tf.Summary(
                value=[tf.Summary.Value(tag="eps", simple_value=update_eps)])
            summary_writer.add_summary(summary, act_step_t)
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="episode_steps",
                                 simple_value=act_step_t - act_started_step_t)
            ])
            summary_writer.add_summary(summary, act_step_t)
            mean_5ep_reward = round(float(np.mean(episode_rewards[-5:])), 1)
            num_episodes = len(episode_rewards)
            if print_freq is not None and num_episodes % print_freq == 0:
                logger.record_tabular("steps", act_step_t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 5 episode reward", mean_5ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(act_step_t)))
                logger.dump_tabular()
            # Wait for the learning to finish and synchronize
            synchronize_q_func_vars()
            # Record demo_switching_stats
            if num_episodes % 10 == 0:
                save_demo_switching_stats(demo_switching_stats, stats_dir,
                                          num_episodes)
            if checkpoint_freq is not None and num_episodes % checkpoint_freq == 0:
                # Periodically save the model
                rec_model_file = os.path.join(
                    td, "model_{}_{:.2f}".format(num_episodes,
                                                 mean_5ep_reward))
                save_variables(rec_model_file)
                # Check whether the model is the best so far
                if saved_mean_reward is None or mean_5ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_5ep_reward))
                    save_variables(model_file)
                    model_saved = True
                    saved_mean_reward = mean_5ep_reward

        updates_queue.put(UpdateMessage(UPDATE_STATUS_FINISH, None, None))
Example #9
def make_obs_ph(name):
    return ObservationInput(DotaEnvironment.get_observation_space(),
                            name=name)
Example #10
def do_network_training(updates_queue: multiprocessing.Queue,
                        weights_queue: multiprocessing.Queue, network, seed,
                        config, lr, total_timesteps, learning_starts,
                        buffer_size, exploration_fraction,
                        exploration_initial_eps, exploration_final_eps,
                        train_freq, batch_size, print_freq, checkpoint_freq,
                        gamma, target_network_update_freq, prioritized_replay,
                        prioritized_replay_alpha, prioritized_replay_beta0,
                        prioritized_replay_beta_iters, prioritized_replay_eps,
                        experiment_name, load_path, network_kwargs):
    _ = get_session()
    set_global_seeds(seed)
    q_func = build_q_func(network, **network_kwargs)

    def make_obs_ph(name):
        return ObservationInput(DotaEnvironment.get_observation_space(),
                                name=name)

    _, train, update_target, debug = deepq.build_train(
        scope='deepq_train',
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=DotaEnvironment.get_action_space().n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
    )

    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    U.initialize()
    update_target()

    reward_shaper = ActionAdviceRewardShaper(config=config)
    reward_shaper.load()
    reward_shaper.generate_merged_demo()

    full_exp_name = '{}-{}'.format(date.today().strftime('%Y%m%d'),
                                   experiment_name)
    experiment_dir = os.path.join('experiments', full_exp_name)
    os.makedirs(experiment_dir, exist_ok=True)

    learning_dir = os.path.join(experiment_dir, 'learning')
    learning_summary_writer = tf.summary.FileWriter(learning_dir)

    update_step_t = 0
    should_finish = False
    while not should_finish:
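        # Handle messages from the acting process: transitions to store and learn from, weight requests, or the finish signal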
        message = updates_queue.get()
        logger.log(f'do_network_training ← {message}')
        if message.status == UPDATE_STATUS_CONTINUE:
            transition = message.transition
            replay_buffer.add(*transition)
            next_act_step = transition[5] + 1
            reward_shaper.set_demo_picked(next_act_step, message.demo_picked)

            if update_step_t >= learning_starts and update_step_t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(update_step_t))
                    (obses_t, actions, rewards, obses_tp1, dones, ts, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones, ts = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
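                # Recompute demo-advice action potentials for the sampled states at their stored timesteps (and t+1 for the successor states)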
                biases_t = []
                for obs_t, timestep in zip(obses_t, ts):
                    biases_t.append(
                        reward_shaper.get_action_potentials(obs_t, timestep))
                biases_tp1 = []
                for obs_tp1, timestep in zip(obses_tp1, ts):
                    biases_tp1.append(
                        reward_shaper.get_action_potentials(
                            obs_tp1, timestep + 1))
                td_errors, weighted_error = train(obses_t, biases_t, actions,
                                                  rewards, obses_tp1,
                                                  biases_tp1, dones, weights)
                # Loss logging
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag='weighted_error',
                                     simple_value=weighted_error)
                ])
                learning_summary_writer.add_summary(summary, update_step_t)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)
            if update_step_t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()
            update_step_t += 1
        elif message.status == UPDATE_STATUS_SEND_WEIGHTS:
            q_func_vars = get_session().run(debug['q_func_vars'])
            weights_queue.put(q_func_vars)
        elif message.status == UPDATE_STATUS_FINISH:
            should_finish = True
        else:
            logger.log(f'Unknown status in UpdateMessage: {message.status}')