def make_obs_ph(name):
    obs_shape = env.observation_space.shape
    # if flatten_obs:
    #     flattened_env_shape = 1
    #     for dim_size in env.observation_space.shape:
    #         flattened_env_shape *= dim_size
    #     obs_shape = (flattened_env_shape,)
    return U.BatchInput(obs_shape, name=name)
def _observation_ph_generator(self, name):
    env = self.env
    if isinstance(env.observation_space, (MultiBinary, Discrete)):
        batch_shape = (env.observation_space.n, )
    elif isinstance(env.observation_space, Box):
        batch_shape = env.observation_space.shape
    else:
        raise ValueError("Unexpected observation space")
    return tf_util.BatchInput(batch_shape, name=name)
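For reference, a minimal sketch (not taken from the snippets above; the concrete spaces are illustrative assumptions) of the batch shapes this generator yields for common Gym observation spaces:

from gym.spaces import Box, Discrete, MultiBinary
import numpy as np

# Discrete and MultiBinary spaces are represented by their size n -> batch_shape == (n,)
assert (Discrete(4).n,) == (4,)
assert (MultiBinary(4).n,) == (4,)
# Box spaces keep their native shape -> batch_shape == (84, 84, 4)
assert Box(low=0.0, high=1.0, shape=(84, 84, 4), dtype=np.float32).shape == (84, 84, 4)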
def _build_model(self):
    sess = U.get_session()
    if sess is None:
        sess = U.make_session(8)
        sess.__enter__()
    self.act, self.train, self.update_target, self.debug = deepq.build_train(
        make_obs_ph=lambda name: U.BatchInput(shape=[2, self.state_size], name=name),
        q_func=self.model2,
        num_actions=self.action_size,
        optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        scope=self.scope,
        double_q=True,
        param_noise=True)
    # Initialize the TensorFlow variables
    U.initialize()
    self.update_target()
def make_obs_ph(name):
    return U.BatchInput((32, 32), name=name)
def main():
    with U.make_session(8):
        env = gym.make("Pendulum-v0")
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
            q_func=model,
            # NOTE: deepq expects the number of discrete actions (an int), so use
            # env.action_space.n; this assumes the environment exposes a discrete action space.
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()

        episode_rewards = [0.0]
        obs = env.reset()
        for t in itertools.count():
            env.render()
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)

            is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
            if is_solved:
                # Show off the result
                env.render()
            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()

            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()
    return info['rewards']


if __name__ == '__main__':
    with U.make_session(4) as sess:
        args = parse_args()

        env, _ = dqn_core.make_env_atari(args.env)
        if args.random_action > 0:
            env = dqn_core.ActionRandomizer(env, args.random_action)

        model_parent_path = dqn_core.parent_path(args.model_dir)
        old_args = json.load(open(model_parent_path + '/args.json'))

        var_func, cvar_func = dqn_core.models.atari_model()
        act = dqn_core.build_act(
            make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
            var_func=var_func,
            cvar_func=cvar_func,
            num_actions=env.action_space.n,
            nb_atoms=old_args['nb_atoms'])

        U.load_state(os.path.join(args.model_dir, "saved"))

        rewards = run(env, act, args.stochastic, args.nb_episodes)

        print('---------------------')
        for alpha in np.arange(0.05, 1.05, 0.05):
            v, cv = var_cvar_from_samples(rewards, alpha)
            print('CVaR_{:.2f} = {}'.format(alpha, cv))
def make_obs_ph(name):
    return U.BatchInput((16, 16), name=name)
def make_obs_ph(name):
    return U.BatchInput((84, 84, 4), name=name)
def main(_):
    print("Used flags:", FLAGS)
    config = configparser.ConfigParser()
    config.read(FLAGS.config_file)

    timer = time.time()

    ps_hosts = FLAGS.ps_hosts.split(",") if FLAGS.ps_hosts else config.get(FLAGS.config, 'ps_hosts').split(",")
    worker_hosts = FLAGS.worker_hosts.split(",") if FLAGS.worker_hosts else config.get(FLAGS.config, 'worker_hosts').split(",")
    job = FLAGS.job_name
    task = FLAGS.task_index

    learning_rate = config.getfloat(FLAGS.config, 'learning_rate')
    batch_size = config.getint(FLAGS.config, 'batch_size')
    memory_size = config.getint(FLAGS.config, 'memory_size')
    target_update = config.getint(FLAGS.config, 'target_update')
    seed = FLAGS.seed if FLAGS.seed else config.getint(FLAGS.config, 'seed')
    max_comm_rounds = config.getint(FLAGS.config, 'comm_rounds')
    epochs = config.getint(FLAGS.config, 'start_epoch')
    end_epoch = config.getint(FLAGS.config, 'end_epoch')
    epoch_decay = config.getint(FLAGS.config, 'epoch_decay')
    # epoch_decay_rate = (epochs - end_epoch) / epoch_decay
    epoch = LinearSchedule(epoch_decay, end_epoch, epochs)
    backup = config.getint(FLAGS.config, 'backup')  # unused in async
    sync = config.getboolean(FLAGS.config, 'sync')
    gradient_prio = False if not sync else config.getboolean(FLAGS.config, 'gradient_prio')
    sync_workers = len(worker_hosts) - backup
    mute = FLAGS.mute if FLAGS.mute else config.getboolean(FLAGS.config, 'mute')
    animate = 0
    draw = 0

    print("Config:\nps_hosts={}\nworker_hosts={}\njob_name={}\ntask_index={}\nlearning_rate={}\n"
          "batch_size={}\nmemory_size={}\ntarget_update={}\nseed={}\ncomm_rounds={}\nepochs={}\n"
          "end_epoch={}\nepoch_decay={}\nbackup={}\nsync={}"
          .format(ps_hosts, worker_hosts, job, task, learning_rate, batch_size, memory_size,
                  target_update, seed, max_comm_rounds, epochs, end_epoch, epoch_decay, backup, sync))

    cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
    chief = True if job == 'worker' and task == 0 else False
    print("/job:", job, "/task:", task, " - Chief: ", chief, sep='')

    # Create server
    server = tf.train.Server(cluster, job_name=job, task_index=task)

    run_code = "{}-{}-p-{}-w-{}-E-{}-b-{}-m-{}-N-{}-lr-{}-B-{}-s-{}-".\
        format(datetime.now().strftime("%y%m%d-%H%M%S"), env_name, len(ps_hosts), len(worker_hosts),
               epochs, batch_size, memory_size, target_update, learning_rate, backup, seed)
    run_code += "-sync" if sync else "-async"

    # Set a unique random seed for each client
    seed = ((seed * 10) + task)
    random.seed(seed)

    if not mute:
        print("Run code:", run_code)

    # Start parameter servers
    if job == 'ps':
        server.join()

    # Start training
    with U.make_session(num_cpu=4, target=server.target) as sess:
        # Create the environment
        env = gym.make(env_name)
        env.seed(seed)
        tf.set_random_seed(seed)

        # Create all the functions necessary to train the model
        act, train, global_opt, update_target, update_weights, sync_opt, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate),
            # optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate),
            chief=chief,
            server=server,
            workers=sync_workers
        )

        # Create the replay buffer
        replay_buffer = ReplayBuffer(memory_size)

        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)

        if not chief:
            if not mute:
                print("Worker {}/{} will sleep (3s) for chief to initialize variables".format(task + 1, len(worker_hosts)))
            time.sleep(4)

        # Initialize the parameters and copy them to the target network.
        U.initialize(chief=chief)
        if chief:
            sess.run(debug['run_code'].assign(run_code))
            if not mute:
                print("Set global run code to:", run_code)

        if not mute:
            print("initialized variables, sleeping for 1 sec")
        time.sleep(2)

        if not chief:
            while not sess.run(tf.is_variable_initialized(debug['run_code'])):
                if not mute:
                    print("Global run code not yet initialized")
                time.sleep(2)
            run_code = str(sess.run(debug['run_code']).decode())
            if run_code == '':
                if not mute:
                    print("Run code empty. Trying to fetch again...")
                time.sleep(5)
            if not mute:
                print("Read global run code:", run_code)
            run_code += "(w" + str(task) + ")"
            print("Final run_code:", run_code)

        t_global_old = update_weights()[0][0]
        update_target()
        exp_gen = 1000  # For how many timesteps should we only generate experience (not train)
        t_start = exp_gen
        comm_rounds = 0
        comm_rounds_global = 0
        dt = 0
        write_csv(run_code, log=["episode", "reward" + str(task), "avg_reward" + str(task), "t_global", "cr"])

        episode_rewards = [0.0]
        cr_reward = 0
        obs = env.reset()

        for t in itertools.count():
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            cr_reward += rew

            # Animate every <animate> episodes
            if not mute and chief and animate > 0 and (len(episode_rewards) % animate) == 0:
                if done:
                    print("ep", len(episode_rewards), "ended with reward:", episode_rewards[-1])
                env.render()

            if done:
                if not mute and chief and draw > 0 and len(episode_rewards) % draw == 0:
                    env.render()
                avg_rew = np.round(np.mean(np.array(episode_rewards[-100:])), 1)
                write_csv(run_code, [len(episode_rewards), episode_rewards[-1], avg_rew, debug['t_global']()[0], comm_rounds_global])
                obs = env.reset()
                episode_rewards.append(0)

            [converged] = sync_opt['check_converged']()
            is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= max_reward or converged
            if is_solved or comm_rounds >= max_comm_rounds:
                sync_opt['set_converged']([True])
                if not mute:
                    print("Converged was set to", sync_opt['check_converged']()[0])
                write_csv_final(run_code, str(len(episode_rewards)), worker_hosts, chief, comm_rounds_global, mute)
                print("Converged after: ", len(episode_rewards), "episodes")
                print("Agent total steps:", t)
                print("Global steps:     ", debug['t_global']()[0])
                sec = round(time.time() - timer)
                print("Total time:", sec // 3600, "h", (sec % 3600) // 60, "min", sec % 60, "s")
                return
            else:
                if t >= exp_gen:  # if t >= batch_size:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    td_error = train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))

                if t - t_start >= np.round(epoch.value(comm_rounds)):
                    cr_old = comm_rounds_global
                    # Apply gradients to weights in PS
                    if sync:
                        # Tell the ps we are done and want to submit score
                        [[comm_rounds_global], [worker_count]] = sync_opt['request_submit']()

                        if comm_rounds_global == comm_rounds:
                            if worker_count <= sync_workers:
                                # If allowed to submit score, do it
                                [comm_rounds_global] = sync_opt['submit_score']([cr_reward])

                                if chief:
                                    [submits] = sync_opt['set_submit']([0])
                                    while worker_count != sync_workers:
                                        if sync_opt['check_converged']()[0]:
                                            if not mute:
                                                print("Other worker converged! Finishing in check_wc")
                                            break
                                        worker_count = sync_opt['check_wc']()[0]

                                while sync_opt['check_submit']()[0] == -1:
                                    if sync_opt['check_converged']()[0]:
                                        if not mute:
                                            print("Other worker converged! Finishing in check_submit")
                                        break
                                    pass

                                if sync_opt['check_converged']()[0]:
                                    if not mute:
                                        print("Other worker converged! Continuing before submit")
                                    continue

                                # Now all eligible workers have sent their score and gradient round has started
                                # Submit gradient
                                # TODO 4th argument overrides everything else unless it is set to -1 in the code
                                [[dt], [comm_rounds_global], [factor]] = global_opt([t - t_start], [t_global_old], [cr_reward], [1 / len(worker_hosts)], [True])

                                submits = sync_opt['inc_submit']()

                                if chief:
                                    while not sync_opt['check_submit']()[0] == sync_workers:
                                        if sync_opt['check_converged']()[0]:
                                            if not mute:
                                                print("Other worker converged! Finishing in check_submit (chief)")
                                            break
                                        pass
                                    # print("Round", comm_rounds, "finished")
                                    [w] = sync_opt['reset_wc']()[0]
                                    # print("Worker count reset to:", w)
                                    sync_opt['reset_score']()
                                    submits = sync_opt['set_submit']([-1])
                                    # print("Submit round finished. Submits set to:", submits[0])
                                    [r] = sync_opt['inc_comm_round']()[0]
                                    # print("New round started:", r)

                                # Normal workers wait until GCR > CR
                                if not chief:
                                    while sync_opt['check_round']()[0] <= comm_rounds:
                                        if sync_opt['check_converged']()[0]:
                                            if not mute:
                                                print("Other worker converged! Finishing in check_round")
                                            break
                                        # print("Worker submitted, waiting for next round:", comm_rounds + 1)
                                        # time.sleep(0.1)
                                        pass

                            else:  # elif worker_count > sync_workers:
                                # If not allowed to submit score, wait for next round to start
                                if not mute:
                                    print("Worker finished too late but before new round started (", comm_rounds_global, ")")
                                    print("WC(", worker_count, ") > N(", sync_workers, ")", sep="")
                                target = np.floor(comm_rounds_global + 1)  # +1 if x.0, +0.5 if x.5
                                while not sync_opt['check_round']()[0] >= target:
                                    pass

                        elif comm_rounds_global > comm_rounds:
                            # This means the worker is behind. Do nothing and start next round
                            if not mute:
                                print("Communication round ", comm_rounds, "missed. Actual round:", comm_rounds_global)
                            # TODO How to handle round count when skipping rounds?
                            comm_rounds = comm_rounds_global - 1

                        elif comm_rounds_global < comm_rounds:
                            print("WARNING! Worker ahead of global:", comm_rounds, ">", comm_rounds_global)
                            time.sleep(5)

                    else:
                        sync_opt['inc_comm_round']()
                        [[dt], [comm_rounds_global], [factor]] = global_opt([t - t_start], [t_global_old], [0], [-1], [False])

                    # Update the local weights with the new global weights from PS
                    t_global_old = update_weights()[0][0]

                    comm_rounds += 1
                    # print("Round finished. Increasing local comm_round to:", comm_rounds)
                    cr_reward = 0
                    # TODO RE-ENABLE comm-rounds LOGGING
                    # write_csv(run_code, [comm_rounds, t, dt, epoch.value(comm_rounds)], comm_rounds=True)
                    t_start = t

                if t % target_update == 0:
                    update_target()

            if not mute and done and len(episode_rewards) % 10 == 0:
                last_rewards = episode_rewards[-101:-1]
                logger.record_tabular("steps", t)
                logger.record_tabular("global steps", debug['t_global']()[0])
                logger.record_tabular("communication rounds", comm_rounds)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", np.round(np.mean(episode_rewards[-101:-1]), 4))
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                # logger.record_tabular("last gradient factor", np.round(factor, 4))
                logger.dump_tabular()

                rew_ill = ['●' if x >= max_reward
                           else str(int(np.floor(x / (max_reward / 10)))) if x >= (max_reward / 10)
                           else '_'
                           for x in last_rewards]
                streak = 0
                for i in reversed(rew_ill):
                    if i == "●":
                        streak += 1
                    else:
                        break
                # print("[" + ''.join(rew_ill) + "] ([● " + str(rew_ill.count('●')) + " | " + str(rew_ill.count('9')) + " | " + str(rew_ill.count('8')) + " | " + str(rew_ill.count('7')) + " | " + str(rew_ill.count('6')) + " | " + str(rew_ill.count('5')) + " | " + str(rew_ill.count('4')) + " | " + str(rew_ill.count('3')) + " | " + str(rew_ill.count('2')) + " | " + str(rew_ill.count('1')) + " | " + str(rew_ill.count('_')) + " _]/" + str(len(rew_ill)) + " {S:" + str(streak) + "})", sep='')
def make_obs_ph(name):
    return U.BatchInput((64, 64), name=name)
def make_placeholder(name):
    """Make a placeholder input."""
    return tf_util.BatchInput(env.observation_space.shape, name=name)
def make_obs_ph(name):
    import dqn.tf_util as U
    return U.BatchInput(observation_shape, name=name)
def make_obs_ph(name):
    return U.BatchInput((observation_space_shape[0] + env_transfer.observation_space.shape[0], ), name=name)
def make_obs_ph(name):
    return U.BatchInput(env.observation_spec()["screen"], name=name)
def make_obs_ph(name):
    return U.BatchInput((num_actions, num_actions), name=name)
def make_obs_ph(name):
    return U.BatchInput(observation_space_shape, name=name)
def make_obs_ph(name):
    return U.BatchInput((env.observation_space.shape[0] * 2, ), name=name)
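A minimal sketch (an assumption for illustration, not taken from the snippet above) of the kind of doubled observation such a placeholder could accommodate, namely the previous and current observation concatenated along the feature axis:

import numpy as np

prev_obs = np.zeros(4, dtype=np.float32)  # e.g. a 4-dimensional Box observation
curr_obs = np.ones(4, dtype=np.float32)
stacked_obs = np.concatenate([prev_obs, curr_obs])  # shape (8,) == (obs_dim * 2,)
assert stacked_obs.shape == (8,)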
def evaluate(self, num_episodes, render=False):
    with U.make_session(NUM_CORES):
        self.t0 = time.time()
        env = self.env.env

        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4)
        )

        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()

        self.episode_count += 1
        state = env.reset()
        self.scores = [0.0]
        episode_q = []

        for t in itertools.count():
            action = act(state[None], update_eps=exploration.value(t))[0]
            observation, reward, done, _ = env.step(action)
            replay_buffer.add(state, action, reward, observation, float(done))
            state = observation

            self.scores[-1] += reward
            episode_q.append(float(debug['q_values'](state[None]).max()))

            if render:
                env.render()

            if done:
                print('{0}, score: {1} ({2})'.format(len(self.scores), self.scores[-1], np.mean(self.scores[-100:])))
                self.evaluation.info['q_values'].append(np.mean(episode_q))

                if len(self.scores) >= num_episodes:
                    return self.final_evaluation()

                state = env.reset()
                episode_q = []
                self.scores.append(0)

                if self.env.solved(self.scores):
                    self.evaluation.info['solved'] = len(self.scores)

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if t > 1000:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))

            # Update target network periodically.
            if t % 1000 == 0:
                update_target()

        U.reset()
    return self.final_evaluation()
    # out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
    out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu)
    out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu)
    out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu)
    out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
    # out = layers.layer_norm(out, center=True, scale=True)
    return out


if __name__ == '__main__':
    with U.make_session(8):
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            param_noise=False
        )
        # Create the replay buffer
        replay_buffer = PrioritizedReplayBuffer(50000, alpha=0.6)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()
    compass_channel /= 180.0
    return np.concatenate([pov, compass_channel], axis=-1)


if __name__ == '__main__':
    with U.make_session(8):
        # Create the environment
        env = gym.make("MineRLNavigateDense-v0")
        spaces = env.observation_space.spaces['pov']
        shape = list(spaces.shape)
        shape[-1] += 1
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(shape, name=name),
            q_func=model,
            num_actions=4,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(30000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=100000, initial_p=1.0, final_p=0.02)
        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()