def make_env():
    env = gym.make(env_id)
    if record_video:
        video_path = os.path.join(output_dir, 'video')
        ensure_dir(video_path)
        env = Monitor(env, video_path,
                      video_callable=lambda episode_id: episode_id % record_video_freq == 0,
                      force=True)
    return env
def _thunk():
    env = make_atari(env_id)
    env.seed(seed + rank)
    if record_video:
        video_path = os.path.join(output_dir, 'video/env-%d' % rank)
        ensure_dir(video_path)
        env = Monitor(env, video_path,
                      video_callable=lambda episode_id: episode_id % record_video_freq == 0,
                      force=True)
    return wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False)
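# Usage sketch (not part of the original source): in baselines-style training
# code, one such thunk is built per worker rank and the list is handed to a
# vectorized env. The names make_atari_vec_env / env_id / num_env / seed below
# are assumptions for illustration only.
def make_atari_vec_env(env_id, num_env, seed):
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

    def make_thunk(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            return wrap_deepmind(env, episode_life=True, clip_rewards=True,
                                 frame_stack=False)
        return _thunk

    # each thunk is executed in its own subprocess by SubprocVecEnv
    return SubprocVecEnv([make_thunk(rank) for rank in range(num_env)])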
def make_env():
    env = gym.make(env_id)
    if record_video:
        print("RECORDING VIDEO")
        video_path = os.path.join(output_dir, 'video')
        ensure_dir(video_path)
        env = Monitor(env, video_path,
                      video_callable=lambda episode_id: episode_id % record_video_freq == 0,
                      force=True)
    # env.render()
    return env
def run(env_id, model_path, record_video, video_path=None):
    env = make_atari(env_id)
    env = wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False)
    num_env = 1
    ob_space = env.observation_space
    ac_space = env.action_space
    obs = np.zeros((num_env, 84, 84, 4), dtype=np.uint8)
    next_obs = env.reset()
    obs = update_obs(obs, next_obs)
    ep = 1
    steps = 0
    total_reward = 0
    with tf.Session() as sess:
        print('Loading Model %s' % model_path)
        policy = CnnPolicy(sess, ob_space, ac_space, nbatch=1, nsteps=1)
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
        ts = ts_rand()
        if record_video:
            ensure_dir(video_path)
            video_recorder = VideoRecorder(env, path=ep_video_path(video_path, ts, env_id, ep))
        while True:
            env.render()
            if record_video:
                video_recorder.capture_frame()
            actions, values, _ = policy.step(obs)
            value = values[0]
            steps += 1
            next_obs, rewards, dones, info = env.step(actions)
            total_reward += rewards
            print('%d: reward=%f value=%f' % (steps, total_reward, value))
            obs = update_obs(obs, next_obs)
            if dones:
                print('DONE')
                ep += 1
                steps = 0
                total_reward = 0
                next_obs = env.reset()
                obs = np.zeros((num_env, 84, 84, 4), dtype=np.uint8)
                obs = update_obs(obs, next_obs)
                if record_video:
                    video_recorder.close()
                    video_recorder = VideoRecorder(env,
                                                   path=ep_video_path(video_path, ts, env_id, ep),
                                                   enabled=record_video)
                time.sleep(2)
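# `update_obs` is used above but not shown here. A minimal sketch, assuming the
# deepmind-wrapped env returns a single (84, 84, 1) grayscale frame and `obs`
# keeps the last four frames per env, oldest first (this is an assumption, not
# the original implementation):
def update_obs(obs, next_obs):
    # shift the frame stack left by one channel and append the newest frame
    obs = np.roll(obs, shift=-1, axis=3)
    obs[:, :, :, -1] = np.asarray(next_obs)[:, :, 0]
    return obs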
def learn(env, env_id, num_env, total_timesteps, output_dir,
          cuda_visible_devices, gpu_memory_fraction, load_model):
    valid_actions = valid_atari_actions(env, env_id)
    num_actions = len(valid_actions)
    report_summary_freq = 100
    save_model_freq = 2000
    num_steps = 5
    batch_size = num_env * num_steps
    model_path = os.path.join(output_dir, 'model')
    summary_path = os.path.join(output_dir, 'summary')
    ensure_dir(summary_path)
    ensure_dir(model_path)
    sess = create_session(cuda_visible_devices, gpu_memory_fraction)
    model = Model(sess, num_env, num_steps, num_actions, total_timesteps=total_timesteps)
    runner = Runner(env, num_env, model, valid_actions, num_steps=num_steps)
    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(summary_path)
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    timesteps = 0
    max_updates = total_timesteps // batch_size
    print("Number of updates: %d" % max_updates)
    for update in range(1, max_updates + 1):
        obs, rewards, actions, values = runner.run()
        timesteps = update * batch_size
        policy_loss, value_loss, policy_entropy, cur_lr = model.train(obs, rewards, actions, values)
        if update % report_summary_freq == 0 and update != 0:
            mean_reward = safe_mean(runner.running_rewards.copy())
            mean_steps = safe_mean(runner.running_steps.copy())
            mean_value = safe_mean(runner.running_values.copy())
            print("Updates: %d" % update)
            print("Timesteps: %d" % timesteps)
            print("Learn rate: %f" % cur_lr)
            print("Policy loss: %f" % float(policy_loss))
            print("Value loss: %f" % float(value_loss))
            print("Running rewards: %s" % runner.running_rewards)
            print("Mean reward: %s" % mean_reward)
            print("Mean steps: %s" % mean_steps)
            print("Mean values: %s" % mean_value)
            train_summary = tf.Summary()
            train_summary.value.add(tag='Train/Timesteps', simple_value=timesteps)
            train_summary.value.add(tag='Train/Policy loss', simple_value=policy_loss)
            train_summary.value.add(tag='Train/Policy entropy', simple_value=policy_entropy)
            train_summary.value.add(tag='Train/Value loss', simple_value=value_loss)
            train_summary.value.add(tag='Train/Mean steps', simple_value=mean_steps)
            train_summary.value.add(tag='Train/Mean reward', simple_value=mean_reward)
            train_summary.value.add(tag='Train/Mean values', simple_value=mean_value)
            summary_writer.add_summary(train_summary, update)
            summary_writer.flush()
        if update % save_model_freq == 0 and update != 0:
            print('Save model')
            saver.save(sess, model_path + '/model-' + str(update) + '.ckpt')
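# `safe_mean` above is a small repo helper that is not shown here. A plausible
# sketch (an assumption, not the original code) that avoids a NaN / empty-slice
# warning while no episode has finished yet:
def safe_mean(xs):
    # return 0.0 instead of np.mean([]) == nan when the running lists are empty
    return float(np.mean(xs)) if len(xs) > 0 else 0.0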
def learn(policy, env, nsteps, sess, total_timesteps, ent_coef, lr,
          vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
          save_interval=50, cuda_visible_devices='0', gpu_memory_fraction=0.5,
          output_dir=None, vec_normalize=None):
    # TODO DRY
    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    model = Model(policy=policy, sess=sess, ob_space=ob_space, ac_space=ac_space,
                  nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps,
                  ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm)
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    model_path = os.path.join(output_dir, 'model')
    summary_path = os.path.join(output_dir, 'summary')
    ensure_dir(summary_path)
    ensure_dir(model_path)
    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(summary_path)
    tfirststart = time.time()
    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, ep_info = runner.run()
        mblossvals = []
        inds = np.arange(nbatch)
        for _ in range(noptepochs):
            np.random.shuffle(inds)
            for start in range(0, nbatch, nbatch_train):
                end = start + nbatch_train
                mbinds = inds[start:end]
                slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            print('')
            print("nupdates", update)
            print("serial_timesteps", update * nsteps)
            print("total_timesteps", update * nbatch)
            print("fps", fps)
            print("explained_variance", float(ev))
            print('mean_episode_reward', ep_info['ep_mean_reward'])
            print('mean_episode_length', ep_info['ep_mean_length'])
            print('time_elapsed', tnow - tfirststart)
            policy_loss = lossvals[0]
            value_loss = lossvals[1]
            policy_entropy = lossvals[2]
            approxkl = lossvals[3]
            clipfrac = lossvals[4]
            print("policy_loss", policy_loss)
            print("value_loss", value_loss)
            train_summary = tf.Summary()
            train_summary.value.add(tag='Train/Episode Reward', simple_value=ep_info['ep_mean_reward'])
            train_summary.value.add(tag='Train/Episode Length', simple_value=ep_info['ep_mean_length'])
            train_summary.value.add(tag='Train/FPS', simple_value=fps)
            train_summary.value.add(tag='Train/Policy Loss', simple_value=policy_loss)
            train_summary.value.add(tag='Train/Value Loss', simple_value=value_loss)
            summary_writer.add_summary(train_summary, update)
            summary_writer.flush()
        if update % save_interval == 0 and update != 0:
            print('Save model: %s' % model_path)
            saver.save(sess, os.path.join(model_path, 'model-' + str(update) + '.ckpt'))
            if vec_normalize:
                # save observation scaling inside VecNormalize
                vec_normalize.snapshot(os.path.join(model_path, 'vec_normalize-' + str(update) + '.pickle'))
    env.close()
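# `constfn` above follows the baselines ppo2 convention of treating `lr` and
# `cliprange` as schedules over the remaining-training fraction `frac`.
# A minimal sketch of that helper: wrap a constant so it can be called like a
# schedule.
def constfn(val):
    def f(_):
        return val
    return f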
def run(env_id, model_path, record_video, video_path=None):
    if env_id.startswith('Roboschool'):
        import roboschool
    gym_env = gym.make(env_id)

    def make_env():
        return gym_env

    dummy_vec_env = DummyVecEnv([make_env])
    vec_normalize = VecNormalize(dummy_vec_env)
    vec_env = vec_normalize
    ob_space = vec_env.observation_space
    ac_space = vec_env.action_space
    window = PygletWindow()
    obs = vec_env.reset()
    ep = 1
    steps = 0
    total_reward = 0
    with tf.Session() as sess:
        policy = MlpPolicy(sess, ob_space, ac_space, nbatch=1, nsteps=1)
        print('Loading Model {}'.format(model_path))
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(model_path)
        print(model_path)
        print(ckpt)
        saver.restore(sess, ckpt.model_checkpoint_path)
        vec_norm_path = last_vec_norm_path(model_path)
        print('Loading VecNormalize state %s' % vec_norm_path)
        vec_normalize.restore(vec_norm_path)
        ts = ts_rand()
        if record_video:
            ensure_dir(video_path)
            video_recorder = VideoRecorder(gym_env, path=ep_video_path(video_path, ts, env_id, ep))
        while True:
            actions, values, _ = policy.step(obs)
            img = gym_env.render("rgb_array")
            window.imshow(img)
            if record_video:
                video_recorder.capture_frame()
            if not window.still_open:
                video_recorder.close()
                break
            value = values[0]
            steps += 1
            obs, rewards, dones, info = vec_env.step(actions)
            total_reward += rewards
            print('%d: reward=%f value=%f total_reward=%f' % (steps, rewards[0], value, total_reward))
            if dones[0]:
                print('Episode %d finished' % ep)
                ep += 1
                steps = 0
                total_reward = 0
                window.close()
                window = PygletWindow()
                if record_video:
                    video_recorder.close()
                    video_recorder = VideoRecorder(gym_env,
                                                   path=ep_video_path(video_path, ts, env_id, ep),
                                                   enabled=record_video)
                obs = vec_env.reset()
                time.sleep(2)
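# `last_vec_norm_path` above is a repo helper that is not shown here. A
# hypothetical sketch of what it likely does: pick the newest
# 'vec_normalize-<update>.pickle' snapshot written by learn().
def last_vec_norm_path(model_path):
    import glob
    snapshots = glob.glob(os.path.join(model_path, 'vec_normalize-*.pickle'))
    # sort by the numeric update suffix and return the most recent snapshot
    return max(snapshots, key=lambda p: int(p.rsplit('-', 1)[1].split('.')[0]))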
def learn(env, sess, cuda_visible_devices, gpu_memory_fraction, output_dir):
    log_freq = 100
    save_freq = 500
    update_target_freq = 250
    total_timesteps = int(50e6)
    batch_size = 32  # How many experiences to use for each training step.
    exploration_start_eps = 1.0  # Starting probability of random action
    exploration_final_eps = 0.02  # End probability of random action
    exploration_max_steps = 200000  # How many steps of training to reduce epsilon
    hidden_size = 256
    pre_train_steps = 5000
    nsteps = 4  # How often to perform a training step.
    gamma = 0.99  # Discount factor on the target Q-values
    render = False
    load_model = False
    model_path = os.path.join(output_dir, 'model')
    summary_path = os.path.join(output_dir, 'summary')
    video_path = os.path.join(output_dir, 'video')
    ensure_dir(summary_path)
    ensure_dir(model_path)
    ensure_dir(video_path)
    valid_actions = [2, 3]  # TODO replace
    num_actions = len(valid_actions)
    mainQN = Qnetwork(hidden_size, num_actions, "main", add_summaries=True)
    targetQN = Qnetwork(hidden_size, num_actions, "target")
    saver = tf.train.Saver()
    # operation that copies a snapshot of the main network to the target network
    update_target_op = update_target("main", "target")
    replay_buffer = ReplayBuffer(50000)
    # Create an exploration schedule
    exploration = LinearSchedule(schedule_timesteps=exploration_max_steps,
                                 initial_p=exploration_start_eps,
                                 final_p=exploration_final_eps)
    with sess:
        sess.run(tf.global_variables_initializer())
        summary_writer = tf.summary.FileWriter(summary_path)
        if load_model:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(model_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        print("Populating replay buffer")
        state = env.reset()
        i = 0
        while True:
            action = np.random.randint(0, num_actions)
            next_state, reward, done, _ = env.step(valid_actions[action])
            replay_buffer.add(state, action, reward, next_state, float(done))
            if done:
                state = env.reset()
            else:
                state = next_state
            i += 1
            if i > pre_train_steps and done:
                break
        runner = Runner(sess, env, replay_buffer, num_actions, nsteps, exploration, mainQN, valid_actions)
        nupdates = total_timesteps // nsteps
        for update in range(1, nupdates + 1):
            runner.run()
            if update % update_target_freq == 0:
                sess.run(update_target_op)
            # Sample a batch of transitions from the replay buffer
            states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
            # Calculate the maximizing action q-value for s_tp1 using 'Double Q-learning'
            # 1. Predict the action that maximizes the q-value for s_tp1 using the mainQN
            feed_dict = {
                mainQN.state_input: next_states  # shape: (batch_size, 84, 84, 4)
            }
            Q1 = sess.run(mainQN.max_q_action, feed_dict=feed_dict)  # shape: (batch_size,)
            # 2. Predict the q-values for s_tp1 using the targetQN
            feed_dict = {targetQN.state_input: next_states}
            Q2 = sess.run(targetQN.q_out, feed_dict=feed_dict)  # (batch_size, 2)
            # 3. Get the maximizing action q-value for s_tp1 by selecting the Q1 index from the Q2 array
            max_action_q = Q2[range(batch_size), Q1]  # (batch_size,)
            # Invert the 'done' fields in the train_batch, e.g. 000010000 -> 111101111.
            # In 'done' transitions there are no future rewards and the update rule
            # reduces to: target_q = reward
            inverted_done_indicator = -(dones - 1)
            # Calculate the target-q value, i.e. what we think is the correct q-value for
            # s_t and the selected action; it is used to calculate the td-error.
            target_q = rewards + (gamma * max_action_q * inverted_done_indicator)
            # Update the mainQN
            feed_dict = {
                mainQN.state_input: states,
                mainQN.actions: actions,
                mainQN.target_q: target_q
            }
            _, summaries = sess.run([mainQN.train_op, mainQN.summaries], feed_dict)
            if update % log_freq == 0:
                print('update %d: mean_ep_reward=%f, mean_ep_length=%d, total_steps=%d' %
                      (update, runner.mean_ep_reward or 0.0, runner.mean_ep_length or 0.0, runner.total_steps))
                print('Report summaries')
                train_summary = tf.Summary()
                train_summary.value.add(tag='Train/Episode Reward', simple_value=runner.mean_ep_reward)
                train_summary.value.add(tag='Train/Episode Length', simple_value=runner.mean_ep_length)
                summary_writer.add_summary(train_summary, update)
                summary_writer.add_summary(summaries, update)
                summary_writer.flush()
            if update % save_freq == 0:
                print('Save model')
                saver.save(sess, model_path + '/model-' + str(update) + '.ckpt')
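# `update_target` above is a repo helper that is not shown here. A minimal
# sketch (an assumption, not the original code) that builds assign ops copying
# every trainable variable from the "main" scope to the "target" scope:
def update_target(main_scope, target_scope):
    main_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=main_scope),
                       key=lambda v: v.name)
    target_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_scope),
                         key=lambda v: v.name)
    # one assign op per variable pair; running the group snapshots the main network
    return tf.group(*[t.assign(m) for m, t in zip(main_vars, target_vars)])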