def record(filename):
    env = DotaEnvironment()
    state = env.reset()
    states = [transform_into_pair(state)]
    done = False
    while not done:
        next_state, reward, done = env.execute(action=0)
        states.append(transform_into_pair(next_state))
    with open(filename, 'wb') as output_file:
        pickle.dump(states, output_file)
def main(args):
    # Configure the logger; disable logging in child MPI processes (with rank > 0)
    np.set_printoptions(precision=3)
    arg_parser = common_arg_parser()
    arg_parser.add_argument('--id', help='name of the experiment for saving', type=str, default=None)
    arg_parser.add_argument('--config', help='path to the algorithm config', type=str, default=None)
    args, unknown_args = arg_parser.parse_known_args(args)
    extra_args = parse_cmdline_kwargs(unknown_args)

    if args.id is None:
        print('Please specify the name of the experiment via --id')
        exit(0)
    if args.config is None:
        print('Please specify the path to the algorithm config via --config')
        exit(0)

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        logger.configure()
    else:
        logger.configure(format_strs=[])
        rank = MPI.COMM_WORLD.Get_rank()

    model, env = train(args, extra_args)

    if args.save_path is not None and rank == 0:
        save_path = osp.expanduser(args.save_path)
        model.save(save_path)

    if args.play:
        logger.log("Running trained model")
        env = DotaEnvironment()
        obs = env.reset()

        def initialize_placeholders(nlstm=128, **kwargs):
            return np.zeros((args.num_env or 1, 2 * nlstm)), np.zeros((1))

        state, dones = initialize_placeholders(**extra_args)
        while True:
            actions, _, state, _ = model.step(obs, S=state, M=dones)
            obs, _, done, _ = env.step(actions)
            env.render()
            done = done.any() if isinstance(done, np.ndarray) else done
            if done:
                obs = env.reset()

    env.close()
def main():
    parser = argparse.ArgumentParser(description='Trains the agent with DQN')
    parser.add_argument('experiment', help='specifies the experiment name')
    args = parser.parse_args()

    env = DotaEnvironment()

    # Where we save our checkpoints and graphs
    experiment_dir = os.path.join(os.path.abspath("./experiments/"), args.experiment)

    tf.reset_default_graph()

    # Create a global step variable
    global_step = tf.Variable(0, name="global_step", trainable=False)

    # Create estimators
    q_estimator = Estimator(STATE_SPACE, ACTION_SPACE, scope="q", summaries_dir=experiment_dir)
    target_estimator = Estimator(STATE_SPACE, ACTION_SPACE, scope="target_q")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        deep_q_learning(sess=sess,
                        env=env,
                        q_estimator=q_estimator,
                        target_estimator=target_estimator,
                        experiment_dir=experiment_dir,
                        num_steps=200000,
                        replay_memory_size=10000,
                        epsilon_decay_steps=1,
                        epsilon_start=0.1,
                        epsilon_end=0.1,
                        update_target_estimator_every=1000,
                        update_q_values_every=4,
                        batch_size=32,
                        restore=False)

    env.close()
def record(filename):
    env = DotaEnvironment()
    env.reset()
    state_action_pairs = []
    done = False
    while not done:
        pairs = env.step(action=ATTACK_CREEP)
        for _, (state, _, done, info) in pairs:
            state_action_pairs.append((state, info))
    print('Frames recorded:', len(state_action_pairs))

    # Drop frames whose state is identical to the previous one
    filtered = []
    last_state = None
    for state, info in state_action_pairs:
        if last_state is not None and np.linalg.norm(last_state - state) == 0:
            continue
        last_state = state
        filtered.append((state, info))
    print('After filtering:', len(filtered))

    with open(filename, 'wb') as output_file:
        pickle.dump(filtered, output_file)
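# A minimal sketch (hypothetical filename, not part of the original listing) of
# reading a recorded demo back: both record() variants pickle a list of pairs,
# so loading is symmetric to the dump above.
import pickle

with open('demo.pkl', 'rb') as input_file:
    recorded_pairs = pickle.load(input_file)
print('Loaded pairs:', len(recorded_pairs))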
def train(args, extra_args):
    env_type = 'steam'
    env_id = 'dota2'
    print('env_type: {}'.format(env_type))

    seed = args.seed
    alg_kwargs = dict(network=models.mlp(num_hidden=128, num_layers=1),
                      lr=1e-3,
                      buffer_size=10000,
                      total_timesteps=500000,
                      exploration_fraction=1.0,
                      exploration_initial_eps=0.1,
                      exploration_final_eps=0.1,
                      train_freq=4,
                      target_network_update_freq=1000,
                      gamma=0.999,
                      batch_size=32,
                      prioritized_replay=True,
                      prioritized_replay_alpha=0.6,
                      experiment_name=args.exp_name,
                      dueling=True)
    alg_kwargs.update(extra_args)

    env = DotaEnvironment()

    if args.network:
        alg_kwargs['network'] = args.network
    elif alg_kwargs.get('network') is None:
        alg_kwargs['network'] = get_default_network(env_type)

    print('Training {} on {}:{} with arguments \n{}'.format(args.alg, env_type, env_id, alg_kwargs))

    pool_size = multiprocessing.cpu_count()
    with multiprocessing.Pool(processes=pool_size) as pool:
        model = learn(env=env, seed=seed, pool=pool, **alg_kwargs)

    return model, env
def main():
    # Hyperparameters
    n_itrs = 10000
    env = DotaEnvironment()
    rng = np.random.RandomState(42)
    timestep_limit = 10000
    learning_rate = 0.1
    discount = 0.99
    batch_size = 100

    # Initialize parameters
    theta = rng.normal(scale=0.2, size=(env.action_space[0], env.observation_space[0] + 1))

    # Store baselines for each time step.
    baselines = np.zeros(timestep_limit)

    def compute_update(discount, R_tplus1, theta, s_t, a_t, r_t, b_t, get_grad_logp_action):
        """
        :param discount: A scalar
        :param R_tplus1: A scalar
        :param theta: A matrix of size |A| * (|S|+1)
        :param s_t: A vector of size |S|
        :param a_t: Either a vector of size |A| or an integer, depending on the environment
        :param r_t: A scalar
        :param b_t: A scalar
        :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient
            (a matrix of size |A| * (|S|+1))
        :return: A tuple, consisting of a scalar and a matrix of size |A| * (|S|+1)
        """
        R_t = discount * R_tplus1 + r_t
        A_t = R_t - b_t
        pg_theta = get_grad_logp_action(theta, s_t, a_t) * A_t
        return R_t, pg_theta

    # Policy training loop
    for itr in range(n_itrs):
        # Trajectory collection loop
        n_samples = 0
        grad = np.zeros_like(theta)
        episode_rewards = []

        # Store cumulative returns for each time step
        all_returns = [[] for _ in range(timestep_limit)]
        all_observations = []
        all_actions = []

        while n_samples < batch_size:
            observations = []
            actions = []
            rewards = []
            ob = env.reset()
            done = False
            # Collect a new trajectory
            print('collecting')
            print(n_samples, batch_size)
            while not done:
                action = point_get_action(theta, ob, rng=rng)
                next_ob, rew, done = env.step(action)
                observations.append(ob)
                actions.append(action)
                rewards.append(rew)
                ob = next_ob
                n_samples += 1

            # Go back in time to compute returns and accumulate the gradient
            # along this trajectory
            R = 0.
            for t in reversed(range(len(observations))):
                R, grad_t = compute_update(
                    discount=discount,
                    R_tplus1=R,
                    theta=theta,
                    s_t=observations[t],
                    a_t=actions[t],
                    r_t=rewards[t],
                    b_t=baselines[t],
                    get_grad_logp_action=point_get_grad_logp_action)
                all_returns[t].append(R)
                grad += grad_t

            episode_rewards.append(np.sum(rewards))
            all_observations.extend(observations)
            all_actions.extend(actions)

        baselines = np.zeros(timestep_limit)

        # Roughly normalize the gradient
        grad = grad / (np.linalg.norm(grad) + 1e-8)
        theta += learning_rate * grad

        print("Iteration: %d AverageReturn: %.2f |theta|_2: %.2f" % (
            itr, np.mean(episode_rewards), np.linalg.norm(theta)))
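# A minimal, self-contained sanity check (toy values only, not part of the
# project) of the backward recursion that compute_update implements:
# R_t = discount * R_{t+1} + r_t, A_t = R_t - b_t, and the policy-gradient
# contribution get_grad_logp_action(theta, s_t, a_t) * A_t accumulated over
# a trajectory. toy_grad_logp_action is a hypothetical stand-in for
# point_get_grad_logp_action.
import numpy as np

def toy_grad_logp_action(theta, ob, action):
    # Returns a matrix of the same shape as theta, |A| x (|S| + 1).
    return np.outer(np.eye(theta.shape[0])[action], np.append(ob, 1.0))

theta = np.zeros((2, 4))              # |A| = 2 actions, |S| = 3 state features
observations = [np.ones(3)] * 3
actions = [0, 1, 0]
rewards = [1.0, 0.0, 2.0]
baselines = np.zeros(3)
discount = 0.99

R, grad = 0.0, np.zeros_like(theta)
for t in reversed(range(len(observations))):
    R = discount * R + rewards[t]     # discounted return from step t
    A = R - baselines[t]              # advantage w.r.t. the baseline
    grad += toy_grad_logp_action(theta, observations[t], actions[t]) * A

print('Return from t=0:', round(R, 3), 'gradient norm:', round(float(np.linalg.norm(grad)), 3))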
#!/usr/bin/env python3
import os

import numpy as np
from tensorforce.agents import TRPOAgent
from tensorforce.execution import Runner

from dotaenv import DotaEnvironment

# Create an environment
env = DotaEnvironment()

network_spec = [
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
]

agent = TRPOAgent(
    states=env.states,
    actions=env.actions,
    discount=0.99,
    network=network_spec,
)
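# A possible continuation (not from the original listing): driving the agent
# with tensorforce's Runner, assuming DotaEnvironment implements the
# tensorforce Environment interface. Parameter names vary between tensorforce
# releases, and the episode count below is illustrative only.
runner = Runner(agent=agent, environment=env)
runner.run(episodes=1000)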
def do_agent_exploration(updates_queue: multiprocessing.Queue,
                         q_func_vars_trained_queue: multiprocessing.Queue,
                         network, seed, config, lr, total_timesteps, learning_starts,
                         buffer_size, exploration_fraction, exploration_initial_eps,
                         exploration_final_eps, train_freq, batch_size, print_freq,
                         checkpoint_freq, gamma, target_network_update_freq,
                         prioritized_replay, prioritized_replay_alpha,
                         prioritized_replay_beta0, prioritized_replay_beta_iters,
                         prioritized_replay_eps, experiment_name, load_path,
                         network_kwargs):
    env = DotaEnvironment()

    sess = get_session()
    set_global_seeds(seed)

    q_func = build_q_func(network, **network_kwargs)

    # Capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, _, _, debug = deepq.build_train(
        scope='deepq_act',
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }
    act = ActWrapper(act, act_params)

    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps),
                                 initial_p=exploration_initial_eps,
                                 final_p=exploration_final_eps)

    U.initialize()

    reward_shaper = ActionAdviceRewardShaper(config=config)
    reward_shaper.load()
    reward_shaper.generate_merged_demo()

    full_exp_name = '{}-{}'.format(date.today().strftime('%Y%m%d'), experiment_name)
    experiment_dir = os.path.join('experiments', full_exp_name)
    os.makedirs(experiment_dir, exist_ok=True)

    summary_dir = os.path.join(experiment_dir, 'summaries')
    os.makedirs(summary_dir, exist_ok=True)
    summary_writer = tf.summary.FileWriter(summary_dir)

    checkpoint_dir = os.path.join(experiment_dir, 'checkpoints')
    os.makedirs(checkpoint_dir, exist_ok=True)

    stats_dir = os.path.join(experiment_dir, 'stats')
    os.makedirs(stats_dir, exist_ok=True)

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_dir or td
        os.makedirs(td, exist_ok=True)

        model_file = os.path.join(td, "best_model")
        model_saved = False
        saved_mean_reward = None
        # if os.path.exists(model_file):
        #     print('Model is loading')
        #     load_variables(model_file)
        #     logger.log('Loaded model from {}'.format(model_file))
        #     model_saved = True
        # elif load_path is not None:
        #     load_variables(load_path)
        #     logger.log('Loaded model from {}'.format(load_path))

        def synchronize_q_func_vars():
            updates_queue.put(UpdateMessage(UPDATE_STATUS_SEND_WEIGHTS, None, None))
            q_func_vars_trained = q_func_vars_trained_queue.get()
            update_q_func_expr = []
            for var, var_trained in zip(debug['q_func_vars'], q_func_vars_trained):
                update_q_func_expr.append(var.assign(var_trained))
            update_q_func_expr = tf.group(*update_q_func_expr)
            sess.run(update_q_func_expr)

        synchronize_q_func_vars()

        episode_rewards = []
        act_step_t = 0
        while act_step_t < total_timesteps:
            # Reset the environment
            obs = env.reset()
            obs = StatePreprocessor.process(obs)
            episode_rewards.append(0.0)
            done = False
            # Demo preservation variables
            demo_picked = 0
            demo_picked_step = 0
            # Demo switching statistics
            demo_switching_stats = [(0, 0)]
            # Sample the episode until it is completed
            act_started_step_t = act_step_t
            while not done:
                # Take action and update exploration to the newest value
                biases, demo_indexes = reward_shaper.get_action_potentials_with_indexes(obs, act_step_t)
                update_eps = exploration.value(act_step_t)
                actions, is_randoms = act(np.array(obs)[None], biases, update_eps=update_eps)
                action, is_random = actions[0], is_randoms[0]
                if not is_random:
                    bias_demo = demo_indexes[action]
                    if bias_demo != demo_switching_stats[-1][1]:
                        demo_switching_stats.append((act_step_t - act_started_step_t, bias_demo))
                    if bias_demo != 0 and demo_picked == 0:
                        demo_picked = bias_demo
                        demo_picked_step = act_step_t + 1
                pairs = env.step(action)
                action, (new_obs, rew, done, _) = pairs[-1]
                logger.log(f'{act_step_t}/{total_timesteps} obs {obs} action {action}')

                # Track stats on the real reward but learn from the normalized version
                episode_rewards[-1] += rew
                rew = np.sign(rew) * np.log(1 + np.abs(rew))
                new_obs = StatePreprocessor.process(new_obs)

                if len(new_obs) == 0:
                    done = True
                else:
                    transition = (obs, action, rew, new_obs, float(done), act_step_t)
                    obs = new_obs

                act_step_t += 1
                if act_step_t - demo_picked_step >= MIN_STEPS_TO_FOLLOW_DEMO_FOR:
                    demo_picked = 0
                reward_shaper.set_demo_picked(act_step_t, demo_picked)
                updates_queue.put(UpdateMessage(UPDATE_STATUS_CONTINUE, transition, demo_picked))

            # Post-episode logging
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="rewards", simple_value=episode_rewards[-1])])
            summary_writer.add_summary(summary, act_step_t)
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="eps", simple_value=update_eps)])
            summary_writer.add_summary(summary, act_step_t)
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="episode_steps",
                                 simple_value=act_step_t - act_started_step_t)])
            summary_writer.add_summary(summary, act_step_t)

            mean_5ep_reward = round(float(np.mean(episode_rewards[-5:])), 1)
            num_episodes = len(episode_rewards)
            if print_freq is not None and num_episodes % print_freq == 0:
                logger.record_tabular("steps", act_step_t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 5 episode reward", mean_5ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(act_step_t)))
                logger.dump_tabular()

            # Wait for the learning to finish and synchronize
            synchronize_q_func_vars()

            # Record demo_switching_stats
            if num_episodes % 10 == 0:
                save_demo_switching_stats(demo_switching_stats, stats_dir, num_episodes)

            if checkpoint_freq is not None and num_episodes % checkpoint_freq == 0:
                # Periodically save the model
                rec_model_file = os.path.join(td, "model_{}_{:.2f}".format(num_episodes, mean_5ep_reward))
                save_variables(rec_model_file)
                # Check whether the model is the best so far
                if saved_mean_reward is None or mean_5ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_5ep_reward))
                    save_variables(model_file)
                    model_saved = True
                    saved_mean_reward = mean_5ep_reward

    updates_queue.put(UpdateMessage(UPDATE_STATUS_FINISH, None, None))
def make_obs_ph(name):
    return ObservationInput(DotaEnvironment.get_observation_space(), name=name)
def do_network_training(updates_queue: multiprocessing.Queue,
                        weights_queue: multiprocessing.Queue,
                        network, seed, config, lr, total_timesteps, learning_starts,
                        buffer_size, exploration_fraction, exploration_initial_eps,
                        exploration_final_eps, train_freq, batch_size, print_freq,
                        checkpoint_freq, gamma, target_network_update_freq,
                        prioritized_replay, prioritized_replay_alpha,
                        prioritized_replay_beta0, prioritized_replay_beta_iters,
                        prioritized_replay_eps, experiment_name, load_path,
                        network_kwargs):
    _ = get_session()
    set_global_seeds(seed)

    q_func = build_q_func(network, **network_kwargs)

    def make_obs_ph(name):
        return ObservationInput(DotaEnvironment.get_observation_space(), name=name)

    _, train, update_target, debug = deepq.build_train(
        scope='deepq_train',
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=DotaEnvironment.get_action_space().n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
    )

    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    U.initialize()
    update_target()

    reward_shaper = ActionAdviceRewardShaper(config=config)
    reward_shaper.load()
    reward_shaper.generate_merged_demo()

    full_exp_name = '{}-{}'.format(date.today().strftime('%Y%m%d'), experiment_name)
    experiment_dir = os.path.join('experiments', full_exp_name)
    os.makedirs(experiment_dir, exist_ok=True)

    learning_dir = os.path.join(experiment_dir, 'learning')
    learning_summary_writer = tf.summary.FileWriter(learning_dir)

    update_step_t = 0
    should_finish = False
    while not should_finish:
        message = updates_queue.get()
        logger.log(f'do_network_training ← {message}')
        if message.status == UPDATE_STATUS_CONTINUE:
            transition = message.transition
            replay_buffer.add(*transition)
            next_act_step = transition[5] + 1
            reward_shaper.set_demo_picked(next_act_step, message.demo_picked)

            if update_step_t >= learning_starts and update_step_t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from the replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(update_step_t))
                    (obses_t, actions, rewards, obses_tp1, dones, ts, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones, ts = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                biases_t = []
                for obs_t, timestep in zip(obses_t, ts):
                    biases_t.append(reward_shaper.get_action_potentials(obs_t, timestep))
                biases_tp1 = []
                for obs_tp1, timestep in zip(obses_tp1, ts):
                    biases_tp1.append(reward_shaper.get_action_potentials(obs_tp1, timestep + 1))
                td_errors, weighted_error = train(obses_t, biases_t, actions, rewards,
                                                  obses_tp1, biases_tp1, dones, weights)
                # Loss logging
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag='weighted_error', simple_value=weighted_error)])
                learning_summary_writer.add_summary(summary, update_step_t)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if update_step_t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()
            update_step_t += 1
        elif message.status == UPDATE_STATUS_SEND_WEIGHTS:
            q_func_vars = get_session().run(debug['q_func_vars'])
            weights_queue.put(q_func_vars)
        elif message.status == UPDATE_STATUS_FINISH:
            should_finish = True
        else:
            logger.log(f'Unknown status in UpdateMessage: {message.status}')
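# A minimal sketch (assumed wiring, not part of the original listing) of how
# the two functions above could be run as separate processes: updates_queue
# carries UpdateMessage transitions from the actor to the learner, and
# weights_queue carries trained q_func weights back for synchronization.
# run_parallel_deepq and deepq_kwargs are hypothetical names; deepq_kwargs is
# assumed to hold the remaining named parameters (network, seed, config, lr, ...).
import multiprocessing

def run_parallel_deepq(**deepq_kwargs):
    updates_queue = multiprocessing.Queue()
    weights_queue = multiprocessing.Queue()
    learner = multiprocessing.Process(target=do_network_training,
                                      args=(updates_queue, weights_queue),
                                      kwargs=deepq_kwargs)
    actor = multiprocessing.Process(target=do_agent_exploration,
                                    args=(updates_queue, weights_queue),
                                    kwargs=deepq_kwargs)
    learner.start()
    actor.start()
    actor.join()    # the actor sends UPDATE_STATUS_FINISH when it is done
    learner.join()  # the learner exits after receiving that message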