    def __init__(self, game, sess, nb_actions, global_step):
        BaseAgent.__init__(self, game, sess, nb_actions, global_step)
        self.name = "SF_linear_agent"
        self.model_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.algorithm)
        self.nb_action = nb_actions
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.episode_max_values = []
        self.episode_min_values = []
        self.episode_mean_returns = []
        self.episode_max_returns = []
        self.episode_min_returns = []
        self.exploration = LinearSchedule(FLAGS.explore_steps,
                                          FLAGS.final_random_action_prob,
                                          FLAGS.initial_random_action_prob)
        self.summary_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.summaries_dir, FLAGS.algorithm))
        self.summary = tf.Summary()
        self.nb_states = game.nb_states
        self.q_net = SFLinearNetwork(nb_actions, self.nb_states, 'orig')
        self.target_net = SFLinearNetwork(nb_actions, self.nb_states, 'target')
        self.targetOps = self.update_target_graph('orig', 'target')
        self.probability_of_random_action = self.exploration.value(0)
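################################################################
# SKETCH: linear exploration schedule
# The agents in this file anneal epsilon through a LinearSchedule imported
# from the project's utilities and called as
# LinearSchedule(schedule_timesteps, final_p, initial_p).  The class below is
# a minimal sketch of the interpolation that signature is assumed to perform
# (baselines-style), shown only to document how exploration.value(total_steps)
# behaves; it is not the project's own implementation.
################################################################
class LinearScheduleSketch(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Fraction of the schedule completed, clipped to [0, 1].
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# Example: anneal the random-action probability from 1.0 down to 0.1 over 10000 steps.
_schedule = LinearScheduleSketch(10000, 0.1, 1.0)
assert abs(_schedule.value(0) - 1.0) < 1e-9
assert abs(_schedule.value(5000) - 0.55) < 1e-9
assert abs(_schedule.value(20000) - 0.1) < 1e-9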
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory: prioritized buffer with an annealed beta schedule
        self.memory = PrioritizedReplayBuffer(BUFFER_SIZE, alpha=PRIORITIZED_REPLAY_ALPHA)
        if PRIORITIZED_REPLAY_BETA_ITERS is None:
            prioritized_replay_beta_iters = N_EPISODES
        else:
            # Without this branch the variable is undefined whenever the
            # constant is actually set.
            prioritized_replay_beta_iters = PRIORITIZED_REPLAY_BETA_ITERS
        self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                            initial_p=PRIORITIZED_REPLAY_BETA0,
                                            final_p=1.0)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.t = 0
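################################################################
# SKETCH: prioritized-replay importance weighting
# The learn() step of this agent is not shown here.  The helper below is a
# sketch of how the annealed beta (self.beta_schedule.value(self.t)) and the
# importance-sampling weights returned by PrioritizedReplayBuffer.sample()
# are typically consumed: the squared TD error is reweighted per transition
# and the absolute TD errors become the new priorities.  Names and shapes are
# illustrative assumptions, not this project's exact code.
################################################################
import torch

def weighted_td_loss(q_expected, q_target, weights):
    # Per-transition TD error, reweighted by the importance-sampling weights.
    td_error = q_target - q_expected
    loss = (weights * td_error.pow(2)).mean()
    # Absolute TD errors are what update_priorities() would receive.
    return loss, td_error.detach().abs()

# Example with dummy tensors for a batch of four transitions.
_loss, _new_priorities = weighted_td_loss(
    torch.tensor([1.0, 2.0, 0.5, 3.0]),   # Q(s, a) from the local network
    torch.tensor([1.5, 1.0, 0.5, 2.0]),   # r + gamma * max_a' Q_target(s', a')
    torch.tensor([1.0, 0.8, 1.0, 0.6]))   # importance-sampling weights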
class DQNLinearAgent(BaseAgent): def __init__(self, game, sess, nb_actions, global_step): BaseAgent.__init__(self, game, sess, nb_actions, global_step) self.name = "DQN_linear_agent" self.model_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.algorithm) self.nb_action = nb_actions self.episode_rewards = [] self.episode_lengths = [] self.episode_mean_values = [] self.episode_max_values = [] self.episode_min_values = [] self.episode_mean_returns = [] self.episode_max_returns = [] self.episode_min_returns = [] self.exploration = LinearSchedule(FLAGS.explore_steps, FLAGS.final_random_action_prob, FLAGS.initial_random_action_prob) self.summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir, FLAGS.algorithm)) self.summary = tf.Summary() self.nb_states = game.nb_states self.q_net = DQLinearNetwork(nb_actions, self.nb_states, 'orig') self.target_net = DQLinearNetwork(nb_actions, self.nb_states, 'target') self.targetOps = self.update_target_graph('orig', 'target') self.probability_of_random_action = self.exploration.value(0) def train(self): minibatch = random.sample(self.episode_buffer, FLAGS.batch_size) rollout = np.array(minibatch) observations = rollout[:, 0] actions = rollout[:, 1] rewards = rollout[:, 2] next_observations = rollout[:, 3] done = rollout[:, 4] state_features = np.identity(self.nb_states) target_actionv_values_evaled = self.sess.run(self.target_net.action_values, feed_dict={self.target_net.inputs: state_features[next_observations]}) target_actionv_values_evaled_max = np.max(target_actionv_values_evaled, axis=1) target_actionv_values_evaled_new = [] for i in range(FLAGS.batch_size): if done[i]: target_actionv_values_evaled_new.append(rewards[i]) else: target_actionv_values_evaled_new.append( rewards[i] + FLAGS.gamma * target_actionv_values_evaled_max[i]) feed_dict = {self.q_net.target_q: target_actionv_values_evaled_new, self.q_net.inputs: state_features[observations], self.q_net.actions: actions} l, _, ms, returns = self.sess.run( [self.q_net.action_value_loss, self.q_net.train_op, self.q_net.merged_summary, self.q_net.action_values], feed_dict=feed_dict) # self.updateTarget() return l / len(rollout), ms, returns def updateTarget(self): for op in self.targetOps: self.sess.run(op) def eval(self, saver): self.saver = saver total_steps = 0 episode_rewards = [] print("Starting eval agent") with self.sess.as_default(), self.graph.as_default(): while total_steps < FLAGS.test_episodes: episode_reward = 0 episode_step_count = 0 d = False s = self.env.get_initial_state() while not d: a = self.policy_evaluation_eval(s) s1, r, d, info = self.env.step(a) r = np.clip(r, -1, 1) episode_reward += r episode_step_count += 1 s = s1 print("Episode reward was {}".format(episode_reward)) episode_rewards.append(episode_reward) total_steps += 1 print("Mean reward is {}".format(np.mean(np.asarray(episode_rewards)))) def play(self, saver): self.saver = saver train_stats = None # self.episode_count = self.sess.run(self.global_episode) self.total_steps = self.sess.run(self.global_step) if self.total_steps == 0: self.updateTarget() print("Starting agent") _t = {'episode': Timer(), "step": Timer()} with self.sess.as_default(), self.graph.as_default(): while self.total_steps < FLAGS.max_total_steps: _t["episode"].tic() if self.total_steps % FLAGS.target_update_freq == 0: self.updateTarget() episode_reward = 0 episode_step_count = 0 q_values = [] d = False # self.probability_of_random_action = self.exploration.value(self.total_steps) s = self.env.get_initial_state() while not d: 
_t["step"].tic() a, max_action_values_evaled = self.policy_evaluation(s) if max_action_values_evaled is not None: q_values.append(max_action_values_evaled) s1, r, d = self.env.step(a) r = np.clip(r, -1, 1) episode_reward += r episode_step_count += 1 self.total_steps += 1 self.episode_buffer.append([s, a, r, s1, d]) s = s1 if len(self.episode_buffer) == FLAGS.memory_size: self.episode_buffer.popleft() if self.total_steps > FLAGS.observation_steps and len( self.episode_buffer) > FLAGS.observation_steps and self.total_steps % FLAGS.update_freq == 0: l, ms, returns = self.train() train_stats = l, ms, returns _t["step"].toc() self.sess.run(self.increment_global_step) if episode_step_count == 23: d = True self.add_summary(episode_reward, episode_step_count, q_values, train_stats) _t["episode"].toc() print('Avg time per step is {:.3f}'.format(_t["step"].average_time())) print('Avg time per episode is {:.3f}'.format(_t["episode"].average_time())) # fps = self.total_steps / _t['Total'].duration # print('Average time per episod is {}'.format(_t['episode'].average_time)) def add_summary(self, episode_reward, episode_step_count, q_values, train_stats): self.episode_rewards.append(episode_reward) self.episode_lengths.append(episode_step_count) if len(q_values): self.episode_mean_values.append(np.mean(np.asarray(q_values))) self.episode_max_values.append(np.max(np.asarray(q_values))) self.episode_min_values.append(np.min(np.asarray(q_values))) if self.total_steps % FLAGS.summary_interval == 0 and self.total_steps != 0 and self.total_steps > FLAGS.observation_steps: if self.total_steps % FLAGS.checkpoint_interval == 0: self.save_model(self.saver, self.total_steps) l, ms, returns = train_stats self.episode_mean_returns.append(np.mean(np.asarray(returns))) self.episode_max_returns.append(np.max(np.asarray(returns))) self.episode_min_returns.append(np.min(np.asarray(returns))) mean_reward = np.mean(self.episode_rewards[-FLAGS.summary_interval:]) mean_length = np.mean(self.episode_lengths[-FLAGS.summary_interval:]) mean_value = np.mean(self.episode_mean_values[-FLAGS.summary_interval:]) max_value = np.mean(self.episode_max_values[-FLAGS.summary_interval:]) min_value = np.mean(self.episode_min_values[-FLAGS.summary_interval:]) mean_return = np.mean(self.episode_mean_returns[-FLAGS.summary_interval:]) max_return = np.mean(self.episode_max_returns[-FLAGS.summary_interval:]) min_return= np.mean(self.episode_min_returns[-FLAGS.summary_interval:]) # if episode_count % FLAGS.test_performance_interval == 0: # won_games = self.episode_rewards[-FLAGS.test_performance_interval:].count(1) # self.summary.value.add(tag='Perf/Won Games/1000', simple_value=float(won_games)) self.summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward)) self.summary.value.add(tag='Perf/Length', simple_value=float(mean_length)) self.summary.value.add(tag='Perf/Value_Mean', simple_value=float(mean_value)) self.summary.value.add(tag='Perf/Value_Max', simple_value=float(max_value)) self.summary.value.add(tag='Perf/Value_Min', simple_value=float(min_value)) self.summary.value.add(tag='Perf/Return_Mean', simple_value=float(mean_return)) self.summary.value.add(tag='Perf/Return_Max', simple_value=float(max_return)) self.summary.value.add(tag='Perf/Return_Min', simple_value=float(min_return)) self.summary.value.add(tag='Perf/Probability_random_action', simple_value=float(self.probability_of_random_action)) self.summary.value.add(tag='Losses/Loss', simple_value=float(l)) self.write_summary(ms, None) def policy_evaluation(self, s): 
        action_values_evaled = None
        self.probability_of_random_action = self.exploration.value(self.total_steps)
        if random.random() <= self.probability_of_random_action:
            a = np.random.choice(range(self.nb_actions))
        else:
            state_features = np.identity(self.nb_states)
            feed_dict = {self.q_net.inputs: state_features[s:s + 1]}
            action_values_evaled = self.sess.run(self.q_net.action_values,
                                                 feed_dict=feed_dict)[0]
            a = np.argmax(action_values_evaled)
        # Only report a value estimate when the greedy branch actually
        # evaluated the network; the caller checks for None.
        max_value = np.max(action_values_evaled) if action_values_evaled is not None else None
        return a, max_value

    def policy_evaluation_eval(self, s):
        feed_dict = {self.q_net.inputs: [s]}
        action_values_evaled = self.sess.run(self.q_net.action_values,
                                             feed_dict=feed_dict)[0]
        a = np.argmax(action_values_evaled)
        return a
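################################################################
# SKETCH: tabular featurization and TD target used in train()
# DQNLinearAgent turns discrete state indices into one-hot features by
# indexing rows of np.identity(nb_states), and cuts the bootstrap off on
# terminal transitions.  This standalone numpy sketch restates that target
# computation with hypothetical inputs.
################################################################
import numpy as np

def one_hot_td_targets(rewards, dones, next_q_max, gamma):
    rewards = np.asarray(rewards, dtype=np.float32)
    dones = np.asarray(dones, dtype=bool)
    next_q_max = np.asarray(next_q_max, dtype=np.float32)
    # Terminal transitions keep only the immediate reward.
    return np.where(dones, rewards, rewards + gamma * next_q_max)

_nb_states = 5
_state_features = np.identity(_nb_states)            # one-hot state features
_next_batch = _state_features[np.array([1, 3, 4])]   # rows fed to the target net
_targets = one_hot_td_targets([0.0, 1.0, 0.0], [False, True, False],
                              [0.5, 0.9, 0.2], gamma=0.99)
# -> [0.495, 1.0, 0.198]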
def run(**kwargs): ''' Setup TF, gym environment, etc. ''' iterations=kwargs['iterations'] discount=kwargs['discount'] batch_size=kwargs['batch_size'] num_batches=kwargs['num_batches'] max_seq_length=kwargs['max_seq_length'] learning_rate=kwargs['learning_rate'] animate=kwargs['animate'] logdir=kwargs['logdir'] seed=kwargs['seed'] games_played_per_epoch=kwargs['games_played_per_epoch'] load_model = False mcts_iterations=kwargs['mcts_iterations'] batches_per_epoch=kwargs['batches_per_epoch'] headless=kwargs['headless'] update_freq=kwargs['update_freq'] buffer_size=kwargs['buffer_size'] if headless: import matplotlib ################################################################ # SEEDS ################################################################ tf.set_random_seed(seed) np.random.seed(seed) ################################################################ # SETUP GYM + RL ALGO ################################################################ env = gym.make('snake-v0') # Make the gym environment maximum_number_of_steps = max_seq_length #or env.max_episode_steps # Maximum length for episodes ################################################################ # TF BOILERPLATE ################################################################ tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) summary_writers = [] for idx in np.arange(env.n_actors): summary_writers.append(tf.summary.FileWriter(os.path.join(logdir,'tensorboard','snake_%s' % idx) )) summary_writers.append(tf.summary.FileWriter(os.path.join(logdir,'tensorboard','training_stats') )) def rgb2gray(rgb): return np.dot(rgb[...,:3], [0.299, 0.587, 0.114]) with tf.Session() as sess: network = DQN( sess, create_basic([16,16,64], transpose=True), [1,env.world.screen_width,env.world.screen_height], summary_writers[-1], n_actions=4, batch_size=batch_size, gamma=.99, update_freq=update_freq, ddqn=True, # double dqn buffer_size = buffer_size, clip_grad = None, batches_per_epoch = batches_per_epoch, is_sparse = False ) monitor = Monitor(os.path.join(logdir,'gifs')) epsilon_schedule = LinearSchedule(iterations*9/10, 1.0, 0.01) learning_rate_schedule = PiecewiseSchedule([(0,1e-3),(20000,5e-4),(50000,1e-4)], outside_value=1e-4) saver = tf.train.Saver(max_to_keep=2) # summary_writer = tf.summary.FileWriter(logdir) ## Load model from where you left off ## Does not play nice w/ plots in tensorboard at the moment ## TODO: FIX if load_model == True: try: print ('Loading Model...') ckpt = tf.train.get_checkpoint_state(logdir) saver.restore(sess,ckpt.model_checkpoint_path) iteration_offset = int(ckpt.model_checkpoint_path.split('-')[-1].split('.')[0]) except: print ('Failed to load. Starting from scratch') sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) iteration_offset = 0 else: sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) iteration_offset = 0 summary_writers[0].add_graph(sess.graph) ################################################################ # Fill Buffer ################################################################ tic = time.time() total_timesteps = 0 while not network.buffer.full(N=buffer_size/2): network.buffer.games_played += 1 print 'Game number: %s. 
Buffer_size: %s' % (network.buffer.games_played, network.buffer.buffer_size) _ = env.reset() obs = env.render('rgb_array', headless = headless).astype(float) obs /= obs.max() obs = rgb2gray(obs) done_n = np.array([False]*env.n_actors) steps = 0 while not done_n.all(): last_obs = obs acts = network.greedy_select([[last_obs]], 1.) acts = [str(x) for x in acts] # Next step _, reward_n, done_n = env.step(acts[-1]) obs = env.render('rgb_array', headless = headless).astype(float) obs /= obs.max() obs = rgb2gray(obs) steps += 1 network.store(np.array([[last_obs]]), # state np.array(acts), # action np.array(reward_n), #rewards np.array([[obs]]), #new state np.array(done_n) #done ) if steps > maximum_number_of_steps: done_n[:] = True print 'Filled Buffer' ################################################################ # Train Loop ################################################################ network.buffer.soft_reset() total_number_of_steps_in_iteration = 0 for iteration in range(iteration_offset, iteration_offset + iterations): print('{0} Iteration {1} {0}'.format('*'*10, iteration)) timesteps_in_iteration = 0 if (iteration % update_freq == 0): saver.save(sess,os.path.join(logdir,'model-'+str(iteration)+'.cptk')) print "Saved Model. Timestep count: %s" % iteration total_reward = np.array([0]*env.n_actors) while True: network.buffer.games_played += 1 if (((network.buffer.games_played) % 10) == 0): print 'Epoch: %s. Game number: %s' % (iteration, network.buffer.games_played) _ = env.reset() rgb = obs = env.render('rgb_array', headless = headless).astype(float) obs /= obs.max() obs = rgb2gray(obs) animate_episode = (iteration % (update_freq) == 0) and animate done_n = np.array([False]*env.n_actors) steps = 0 # Runs policy, collects observations and rewards viewer = None while not done_n.all(): if animate_episode: if (not viewer) and (not headless): from gym.envs.classic_control import rendering viewer = rendering.SimpleImageViewer() rgb = env.render('rgb_array', headless = headless) scaler = 10 rgb=repeat_upsample(rgb,scaler,scaler) if not headless: viewer.imshow(rgb) time.sleep(.01) monitor.add(rgb, iteration, network.buffer.games_played) # ob = get_data(np.array(raw_observations)[-2:]) last_obs = obs # Control the exploration acts = network.greedy_select([[last_obs]], epsilon_schedule.value(network.epoch)) # epsilon greedy acts = [str(x) for x in acts] # Next step _, reward_n, done_n = env.step(acts[-1]) obs = env.render('rgb_array', headless = headless).astype(float) obs /= obs.max() obs = rgb2gray(obs) total_reward += np.array(reward_n) if total_number_of_steps_in_iteration % 4 == 0: network.train_step(learning_rate_schedule) total_number_of_steps_in_iteration += 1 steps += 1 network.store(np.array([[last_obs]]), # state np.array(acts), # action np.array(reward_n), #rewards np.array([[obs]]), #new state np.array(done_n) #done ) # terminate the collection of data if the controller shows stability # for a long time. This is a good thing. if steps > maximum_number_of_steps: done_n[:] = True if viewer: viewer.close() if network.buffer.games_played >= 1: break monitor.make_gifs(iteration) for count, writer in enumerate(summary_writers): if count < (len(summary_writers) - 1): summary = tf.Summary() summary.value.add(tag='Average Reward', simple_value=(total_reward[count])) summary.value.add(tag='Steps Taken', simple_value=(steps)) writer.add_summary(summary, iteration) writer.flush()
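################################################################
# SKETCH: frame preprocessing used in the buffer-fill and train loops
# The loops above render an RGB array, rescale it by the frame maximum and
# convert it to grayscale with the usual luma weights (rgb2gray).  The helper
# below restates that preprocessing in one place; the guard against an
# all-black frame is an added assumption (the original divides by obs.max()
# directly).
################################################################
import numpy as np

def preprocess_frame(rgb):
    frame = rgb.astype(np.float64)
    frame /= max(frame.max(), 1e-8)   # added guard; original is obs /= obs.max()
    return np.dot(frame[..., :3], [0.299, 0.587, 0.114])

_dummy = np.random.randint(0, 256, size=(84, 84, 3), dtype=np.uint8)
assert preprocess_frame(_dummy).shape == (84, 84)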
def main(_): # create visualizer #visualizer = TensorboardVisualizer() monitor = Monitor(FLAGS) #log_dir = monitor.log_dir #visualizer.initialize(log_dir, None) saved_mean_reward = None # openAI logger L.configure(monitor.log_dir, format_strs=['stdout', 'csv']) # initialize env atari_env = AtariEnv(monitor) #screen_shot_subgoal(atari_env) # we should probably follow deepmind style env # stack 4 frames and scale float env = wrapper.wrap_deepmind(atari_env, frame_stack=True, scale=True) # get default tf_session sess = U.get_session() # create q networks for controller controller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE) controller_network = Q_network(env.observation_space, env.action_space.n, controller_optimizer, scope='controller') controller = Controller(controller_network, env.action_space.n) # create q networks for meta-controller num_goals = env.unwrapped.goals_space.n metacontroller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE) metacontroller_network = Q_network(env.observation_space, num_goals, metacontroller_optimizer, scope='metacontroller') metacontroller = MetaController(metacontroller_network, num_goals) # Create the schedule for exploration starting from 1. exploration2 = LinearSchedule(schedule_timesteps=int(EXPLORATION_FRACTION * monitor.num_timesteps), initial_p=1.0, final_p=EXPLORATION_FINAL_EPS) # initialize experience replay controller_replay_buffer = ReplayBuffer(D1_MEMORY_SIZE) metacontroller_replay_buffer = ReplayBuffer(D2_MEMORY_SIZE) # initialize critic critic = Critic(env.unwrapped) total_extrinsic_reward = [] # for success rate total_goal_reached = np.zeros(num_goals, dtype=np.int32) total_goal_sampled = np.zeros(num_goals, dtype=np.int32) total_goal_epsilon = np.ones(num_goals, dtype=np.float32) ep = 0 total_step = 0 init_ob = env.reset() U.initialize() # initialize target network in both controller and meta sess.run(metacontroller.network.update_target_op) sess.run(controller.network.update_target_op) # load ckpt if presence model_path = tf.train.latest_checkpoint(monitor.ckpt_dir) model_saved = False model_file = os.path.join(monitor.ckpt_dir, 'model') if model_path is not None: U.load_variables(model_file) L.log('loaded model from %s' % model_file) model_saved = True while ep < MAX_EPISODE: # count number of steps # init environment game play variables init_ob = env.reset() observation = np.reshape(init_ob['observation'], (1, )+init_ob['observation'].shape) desired_goal = metacontroller.sample_act(sess, observation, update_eps=1.0)[0] env.unwrapped.desired_goal = desired_goal total_goal_sampled[desired_goal] += 1 # given predicted goal, we encode this goal bounding mask to the observation np array ob_with_g = env.unwrapped._add_goal_mask(init_ob['observation'], desired_goal) # NOTE: Below code verify added mask correctly # for i in range(ob_with_g.shape[-1]): # ob = ob_with_g[:,:,i] # image = Image.fromarray(ob) # image = image.convert('RGB') # image.save('test_%i.png' % i) done = False reached_goal = False while not done: extrinsic_rewards = 0 s0 = init_ob['observation'] while not (done or reached_goal): update_eps1_with_respect_to_g = get_epsilon(total_goal_epsilon, total_goal_reached, total_goal_sampled, desired_goal, total_step, EXPLORATION_WARM_UP) ob_with_g_reshaped = np.reshape(ob_with_g, (1, )+ob_with_g.shape) primitive_action_t = controller.sample_act(sess, ob_with_g_reshaped, update_eps=update_eps1_with_respect_to_g)[0] # obtain extrinsic reward from environment ob_tp1, extrinsic_reward_t, done_t, info = 
env.step(primitive_action_t) reached_goal = env.unwrapped.reached_goal(desired_goal) ob_with_g_tp1 = env.unwrapped._add_goal_mask(ob_tp1['observation'], desired_goal) intrinsic_reward_t = critic.criticize(desired_goal, reached_goal, primitive_action_t, done_t) controller_replay_buffer.add(ob_with_g, primitive_action_t, intrinsic_reward_t, ob_with_g_tp1, done_t) # sample from replay_buffer1 to train controller obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t = controller_replay_buffer.sample(TRAIN_BATCH_SIZE) weights, batch_idxes = np.ones_like(intrinsic_rewards_t), None # get q estimate for tp1 as 'supervised' ob_with_g_tp1_reshaped = np.reshape(ob_with_g_tp1, (1, )+ob_with_g.shape) q_tp1 = controller.get_q(sess, ob_with_g_tp1_reshaped)[0] td_error = controller.train(sess, obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t, weights, q_tp1) # join train meta-controller only sample from replay_buffer2 to train meta-controller if total_step >= WARMUP_STEPS: L.log('join train has started ----- step %d', total_step) # sample from replay_buffer2 to train meta-controller init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t = metacontroller_replay_buffer.sample(TRAIN_BATCH_SIZE) weights, batch_idxes = np.ones_like(extrinsic_rewards_t), None # get q estimate for tp1 as 'supervised' obs_terminate_in_g_reshaped = np.reshape(obs_terminate_in_g, (1, )+obs_terminate_in_g.shape) q_tp1 = metacontroller.get_q(sess, obs_terminate_in_g_reshaped)[0] td_error = metacontroller.train(sess, init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t, weights, q_tp1) if total_step % UPDATE_TARGET_NETWORK_FREQ == 0: #L.log('UPDATE BOTH CONTROLLER Q NETWORKS ----- step %d', step) sess.run(controller.network.update_target_op) # its fine, we aren't really training meta dqn until after certain steps. 
sess.run(metacontroller.network.update_target_op) extrinsic_rewards += extrinsic_reward_t ob_with_g = ob_with_g_tp1 done = done_t total_step += 1 # we are done / reached_goal # store transitions of init_ob, goal, all the extrinsic rewards, current ob in D2 # print("ep %d : step %d, goal extrinsic total %d" % (ep, step, extrinsic_rewards)) # clean observation without goal encoded metacontroller_replay_buffer.add(init_ob['observation'], desired_goal, extrinsic_rewards, ob_tp1['observation'], done) # if we are here then we have finished the desired goal if not done: #print("ep %d : goal %d reached, not yet done, extrinsic %d" % (ep, desired_goal, extrinsic_rewards)) exploration_ep = 1.0 total_goal_reached[env.unwrapped.achieved_goal] += 1 if total_step >= WARMUP_STEPS: t = total_step - WARMUP_STEPS exploration_ep = exploration2.value(t) ob_with_g_reshaped = np.reshape(ob_with_g, (1, )+ob_with_g.shape) while env.unwrapped.achieved_goal == desired_goal: desired_goal = metacontroller.sample_act(sess, ob_with_g_reshaped, update_eps=exploration_ep)[0] env.unwrapped.desired_goal = desired_goal total_goal_sampled[desired_goal] += 1 L.log('ep %d : achieved goal was %d ----- new goal --- %d' % (ep, env.unwrapped.achieved_goal, desired_goal)) # start again reached_goal = False # finish an episode total_extrinsic_reward.append(extrinsic_rewards) ep += 1 mean_100ep_reward = round(np.mean(total_extrinsic_reward[-101:-1]), 1) if ep % monitor.print_freq == 0 : L.record_tabular("steps", total_step) L.record_tabular("episodes", ep) L.record_tabular("mean 100 episode reward", mean_100ep_reward) L.dump_tabular() if total_step % monitor.ckpt_freq == 0: if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: L.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) U.save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward # verified our model was saved if model_saved: L.log('restored model with mean reward: %d' % saved_mean_reward) U.load_variables(model_file)
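################################################################
# SKETCH: intrinsic critic convention for h-DQN
# Critic.criticize() is not shown in this file.  The function below is an
# illustrative assumption based on the usual h-DQN convention (Kulkarni et
# al., 2016): the controller is rewarded only when it reaches the goal chosen
# by the meta-controller.  The real critic in this code base may differ
# (e.g. extra penalties), so treat this purely as a sketch.
################################################################
def intrinsic_reward(desired_goal, achieved_goal, done):
    if achieved_goal == desired_goal:
        return 1.0
    if done:
        return -1.0   # assumed penalty for terminating before reaching the goal
    return 0.0

assert intrinsic_reward(desired_goal=2, achieved_goal=2, done=False) == 1.0
assert intrinsic_reward(desired_goal=2, achieved_goal=0, done=False) == 0.0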
class SFAgent(BaseAgent): def __init__(self, game, sess, nb_actions, global_step): BaseAgent.__init__(self, game, sess, nb_actions, global_step) self.name = "SF_agent" self.model_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.algorithm) self.nb_states = self.env.nb_states if FLAGS.matrix_type == "incidence": self.sf_buffer = np.zeros( [self.nb_states * self.nb_states, self.nb_states]) else: self.sf_buffer = np.zeros([self.nb_states, self.nb_states]) self.seen_states = set() self.episode_rewards = [] self.episode_lengths = [] self.episode_mean_values = [] self.episode_max_values = [] self.episode_min_values = [] self.episode_mean_returns = [] self.episode_max_returns = [] self.episode_min_returns = [] self.exploration = LinearSchedule(FLAGS.explore_steps, FLAGS.final_random_action_prob, FLAGS.initial_random_action_prob) self.summary_writer = tf.summary.FileWriter( os.path.join(FLAGS.summaries_dir, FLAGS.algorithm)) self.summary = tf.Summary() self.sf_table = np.zeros([self.nb_states, self.nb_states]) # self.q_net = SFNetwork(self.nb_actions, self.nb_states, 'orig') # self.target_net = SFNetwork(self.nb_actions, self.nb_states, 'target') # # self.targetOps = self.update_target_graph('orig', 'target') # self.probability_of_random_action = self.exploration.value(0) def train(self): minibatch = random.sample(self.episode_buffer, FLAGS.batch_size) rollout = np.array(minibatch) observations = rollout[:, 0] actions = rollout[:, 1] rewards = rollout[:, 2] next_observations = rollout[:, 3] done = rollout[:, 4] state_features = np.identity(self.nb_states) target_sf_evaled = self.sess.run(self.target_net.sf, feed_dict={ self.target_net.features: state_features[next_observations] }) # target_sf_evaled_exp = np.mean(target_sf_evaled, axis=1) # gamma = np.tile(np.expand_dims(np.asarray(np.logical_not(done), dtype=np.int32) * FLAGS.gamma, 1), # [1, self.nb_states]) # # target_sf_evaled_new = state_features[next_observations] + gamma * target_sf_evaled_exp # feed_dict = { self.q_net.target_sf: target_sf_evaled, # self.q_net.target_reward: np.stack(rewards, axis=0), self.q_net.features: state_features[observations] } # self.q_net.actions: actions} sf_l, _, ms = self.sess.run( [ self.q_net.sf_loss, # self.q_net.reward_loss, # self.q_net.total_loss, self.q_net.train_op, self.q_net.merged_summary ], feed_dict=feed_dict) # self.updateTarget() return sf_l / len(rollout), ms def updateTarget(self): for op in self.targetOps: self.sess.run(op) def eval(self, saver): self.saver = saver total_steps = 0 episode_rewards = [] print("Starting eval agent") with self.sess.as_default(), self.graph.as_default(): while total_steps < FLAGS.test_episodes: episode_reward = 0 episode_step_count = 0 d = False s = self.env.get_initial_state() while not d: a = self.policy_evaluation_eval(s) s1, r, d, info = self.env.step(a) r = np.clip(r, -1, 1) episode_reward += r episode_step_count += 1 s = s1 print("Episode reward was {}".format(episode_reward)) episode_rewards.append(episode_reward) total_steps += 1 print("Mean reward is {}".format(np.mean(np.asarray(episode_rewards)))) def play(self, saver): self.saver = saver train_stats = None d = True # self.episode_count = self.sess.run(self.global_episode) self.total_steps = self.sess.run(self.global_step) # if self.total_steps == 0: # self.updateTarget() self.nb_episodes = 0 state_features = np.identity(self.nb_states) episode_reward = 0 episode_step_count = 0 q_values = [] td_error = 0 print("Starting agent") _t = {'episode': Timer(), "step": Timer()} with self.sess.as_default(), 
self.graph.as_default(): while self.total_steps < FLAGS.max_total_steps: if self.total_steps == 0 or d or episode_step_count % 30 == 0: _t["episode"].tic() # if self.total_steps % FLAGS.target_update_freq == 0: # self.updateTarget() self.add_summary(episode_reward, episode_step_count, q_values, train_stats) episode_reward = 0 episode_step_count = 0 q_values = [] if self.total_steps != 0: self.nb_episodes += 1 d = False # self.probability_of_random_action = self.exploration.value(self.total_steps) s = self.env.get_initial_state() _t["step"].tic() a, max_action_values_evaled = self.policy_evaluation(s) # if max_action_values_evaled is None: # q_values.append(0) # else: # q_values.append(max_action_values_evaled) s1, r, d = self.env.step(a) self.env.render() r = np.clip(r, -1, 1) episode_reward += r episode_step_count += 1 self.total_steps += 1 td_error = (state_features[s] + FLAGS.gamma * self.sf_table[s1]) - self.sf_table[s] q_values.append(td_error) # print(sum(td_error)) # self.episode_buffer.append([s, a, r, s1, d]) self.sf_table[s] = self.sf_table[s] + FLAGS.lr * td_error s = s1 # if len(self.episode_buffer) == FLAGS.memory_size: # self.episode_buffer.popleft() # if self.total_steps > FLAGS.observation_steps and len( # self.episode_buffer) > FLAGS.observation_steps and self.total_steps % FLAGS.update_freq == 0:# and FLAGS.task != "discover": # sf_l, ms = self.train() # train_stats = sf_l, ms if self.total_steps > FLAGS.nb_steps_sf: s, v = self.discover_options() # self.sf_buffer.popleft() # if self.total_steps > FLAGS.nb_steps_sf: # if FLAGS.matrix_type == "incidence": # self.construct_incidence_matrix() # else: # self.construct_successive_matrix() # self.add_successive_feature(s, a) _t["step"].toc() self.sess.run(self.increment_global_step) _t["episode"].toc() # print('Avg time per step is {:.3f}'.format(_t["step"].average_time())) # print('Avg time per episode is {:.3f}'.format(_t["episode"].average_time())) # fps = self.total_steps / _t['Total'].duration # print('Average time per episod is {}'.format(_t['episode'].average_time)) def construct_successive_matrix(self): for s in range(self.nb_states): state_features = np.identity(self.nb_states) sf_feat = self.sess.run( self.q_net.sf, feed_dict={self.q_net.features: state_features[s:s + 1]}) # a = np.random.choice(range(self.nb_actions)) # a_one_hot = np.zeros(shape=(1, self.nb_actions, self.nb_states), dtype=np.int32) # a_one_hot[0, a] = 1 # sf_feat_a = np.sum(np.multiply(sf_feat, a_one_hot), axis=1) self.sf_buffer[s] = sf_feat if s not in self.seen_states: self.seen_states.add(s) def construct_incidence_matrix(self): i = 0 for s in range(self.nb_states): for s1 in range(self.nb_states): state_features = np.identity(self.nb_states) sf_feat = self.sess.run( self.q_net.sf, feed_dict={self.q_net.features: state_features[s:s + 1]}) sf_feat1 = self.sess.run( self.q_net.sf, feed_dict={self.q_net.features: state_features[s1:s1 + 1]}) trans = state_features[s1:s1 + 1] - state_features[s:s + 1] self.sf_buffer[i] = trans i += 1 if s not in self.seen_states: self.seen_states.add(s) # def add_successive_feature(self, s, a): # state_features = np.identity(self.nb_states) # sf_feat = self.sess.run(self.q_net.sf, # feed_dict={self.q_net.features: state_features[s:s+1]}) # a_one_hot = np.zeros(shape=(1, self.nb_actions, self.nb_states), dtype=np.int32) # a_one_hot[0, a] = 1 # sf_feat_a = np.sum(np.multiply(sf_feat, a_one_hot), axis=1) # if s not in self.seen_states: # self.seen_states.add(s) # self.sf_buffer[s] = sf_feat_a def discover_options(self): 
sf_matrix = tf.convert_to_tensor(np.squeeze(np.array(self.sf_table)), dtype=tf.float32) s, u, v = tf.svd(sf_matrix) # discard noise, get first 10 # s = s[:10] # v = v[:10] if FLAGS.task == "discover": # Plotting all the basis plot = Visualizer(self.env) s_evaled, v_evaled = self.sess.run([s, v]) idx = s_evaled.argsort()[::-1] s_evaled = s_evaled[idx] v_evaled = v_evaled[:, idx] plot.plotBasisFunctions(s_evaled, v_evaled) guard = len(s_evaled) epsilon = 0 options = [] actionSetPerOption = [] for i in range(guard): idx = guard - i - 1 print('Solving for eigenvector #' + str(idx)) polIter = PolicyIteration(0.9, self.env, augmentActionSet=True) self.env.define_reward_function(v_evaled[:, idx]) V, pi = polIter.solvePolicyIteration() # Now I will eliminate any actions that may give us a small improvement. # This is where the epsilon parameter is important. If it is not set all # it will never be considered, since I set it to a very small value for j in range(len(V)): if V[j] < epsilon: pi[j] = len(self.env.get_action_set()) # if plotGraphs: plot.plotValueFunction(V[0:self.nb_states], str(idx) + '_') plot.plotPolicy(pi[0:self.nb_states], str(idx) + '_') options.append(pi[0:self.nb_states]) optionsActionSet = self.env.get_action_set() np.append(optionsActionSet, ['terminate']) actionSetPerOption.append(optionsActionSet) exit(0) return s, v def add_summary(self, episode_reward, episode_step_count, q_values, train_stats): self.episode_rewards.append(episode_reward) self.episode_lengths.append(episode_step_count) if len(q_values): self.episode_mean_values.append(np.mean(np.asarray(q_values))) self.episode_max_values.append(np.max(np.asarray(q_values))) self.episode_min_values.append(np.min(np.asarray(q_values))) if self.nb_episodes % FLAGS.summary_interval == 0 and self.nb_episodes != 0 and self.total_steps > FLAGS.observation_steps: if self.nb_episodes % FLAGS.checkpoint_interval == 0: self.save_model(self.saver, self.total_steps) mean_reward = np.mean( self.episode_rewards[-FLAGS.summary_interval:]) mean_length = np.mean( self.episode_lengths[-FLAGS.summary_interval:]) mean_value = np.mean( self.episode_mean_values[-FLAGS.summary_interval:]) max_value = np.mean( self.episode_max_values[-FLAGS.summary_interval:]) min_value = np.mean( self.episode_min_values[-FLAGS.summary_interval:]) self.summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward)) self.summary.value.add(tag='Perf/Length', simple_value=float(mean_length)) self.summary.value.add(tag='Perf/Value_Mean', simple_value=float(mean_value)) self.summary.value.add(tag='Perf/Value_Max', simple_value=float(max_value)) self.summary.value.add(tag='Perf/Value_Min', simple_value=float(min_value)) self.summary.value.add(tag='Perf/Probability_random_action', simple_value=float( self.probability_of_random_action)) # if train_stats is not None: # sf_l, ms = train_stats # self.summary.value.add(tag='Losses/SF_Loss', simple_value=float(sf_l)) # self.summary.value.add(tag='Losses/R_Loss', simple_value=float(r_l)) # self.summary.value.add(tag='Losses/T_Loss', simple_value=float(t_l)) self.write_summary(None) def policy_evaluation(self, s): action_values_evaled = None self.probability_of_random_action = self.exploration.value( self.total_steps) # if random.random() <= self.probability_of_random_action: a = np.random.choice(range(self.nb_actions)) # else: # state_features = np.identity(self.nb_states) # feed_dict = {self.q_net.features: state_features[s:s+1]} # action_values_evaled = self.sess.run(self.q_net.q, feed_dict=feed_dict)[0] # # a = 
        #     a = np.argmax(action_values_evaled)
        return a, 0

    def policy_evaluation_eval(self, s):
        feed_dict = {self.q_net.inputs: [s]}
        action_values_evaled = self.sess.run(self.q_net.action_values,
                                             feed_dict=feed_dict)[0]
        a = np.argmax(action_values_evaled)
        return a
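################################################################
# SKETCH: tabular successor-representation update used in play()
# SFAgent.play() maintains sf_table with the TD rule
#     psi(s) <- psi(s) + lr * (phi(s) + gamma * psi(s') - psi(s)),
# where phi(s) is the one-hot feature of state s.  The helper below restates
# that update on a standalone numpy table.
################################################################
import numpy as np

def sr_td_update(sf_table, s, s_next, gamma, lr):
    phi = np.identity(sf_table.shape[0])[s]
    td_error = phi + gamma * sf_table[s_next] - sf_table[s]
    sf_table[s] += lr * td_error
    return td_error

_sf = np.zeros((4, 4))
sr_td_update(_sf, s=0, s_next=1, gamma=0.95, lr=0.1)
assert abs(_sf[0, 0] - 0.1) < 1e-9   # psi(0) picks up its own feature first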
def run(**kwargs): ''' Setup TF, gym environment, etc. ''' iterations = kwargs['iterations'] discount = kwargs['discount'] batch_size = kwargs['batch_size'] num_batches = kwargs['num_batches'] max_seq_length = kwargs['max_seq_length'] learning_rate = kwargs['learning_rate'] animate = kwargs['animate'] logdir = kwargs['logdir'] seed = kwargs['seed'] games_played_per_epoch = kwargs['games_played_per_epoch'] load_model = False mcts_iterations = kwargs['mcts_iterations'] batches_per_epoch = kwargs['batches_per_epoch'] headless = kwargs['headless'] update_freq = kwargs['update_freq'] buffer_size = kwargs['buffer_size'] use_priority = kwargs['use_priority'] policy_batch_size = kwargs['policy_batch_size'] reservoir_buffer_size = kwargs['reservoir_buffer_size'] if headless: import matplotlib ################################################################ # SEEDS ################################################################ tf.set_random_seed(seed) np.random.seed(seed) ################################################################ # SETUP GYM + RL ALGO ################################################################ env = gym.make('snake-v1') # Make the gym environment maximum_number_of_steps = max_seq_length #or env.max_episode_steps # Maximum length for episodes ################################################################ # TF BOILERPLATE ################################################################ tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) summary_writers = [] for idx in np.arange(env.n_actors): summary_writers.append( tf.summary.FileWriter( os.path.join(logdir, 'tensorboard', 'snake_%s' % idx))) summary_writers.append( tf.summary.FileWriter( os.path.join(logdir, 'tensorboard', 'training_stats'))) with tf.Session() as sess: networks = [] for i in range(env.n_actors): networks.append( SelfPlay( sess, create_basic([64, 64, 256], transpose=True), [(env.n_actors) * 2 + 1, env.world.screen_width, env.world.screen_height], summary_writers[-1], n_actions=4, batch_size=batch_size, gamma=.99, update_freq=update_freq, ddqn=True, # double dqn buffer_size=buffer_size, clip_grad=None, batches_per_epoch=batches_per_epoch, is_sparse=True, use_priority=use_priority, _id=i, policy_batch_size=policy_batch_size, reservoir_buffer_size=reservoir_buffer_size)) monitor = Monitor(os.path.join(logdir, 'gifs')) epsilon_schedule = PiecewiseSchedule( [(0, .2), (50000, .05), (75000, .01)], outside_value=.01) #LinearSchedule(iterations*60/100, 1., 0.001) eta_schedule = PiecewiseSchedule( [(0, .8), (60000, .4)], outside_value=.4) #LinearSchedule(iterations*60/100, 0.2, 0.1) if use_priority: beta_schedule = LinearSchedule(iterations, 0.4, 1.) learning_rate_schedule = PiecewiseSchedule([(0, 1e-3), (30000, 5e-4), (60000, 1e-4)], outside_value=1e-4) policy_learning_rate_schedule = PiecewiseSchedule([(0, 1e-3), (4000, 5e-4), (20000, 1e-4)], outside_value=1e-4) saver = tf.train.Saver(max_to_keep=2) # summary_writer = tf.summary.FileWriter(logdir) ## Load model from where you left off ## Does not play nice w/ plots in tensorboard at the moment ## TODO: FIX if load_model == True: try: print('Loading Model...') ckpt = tf.train.get_checkpoint_state(logdir) saver.restore(sess, ckpt.model_checkpoint_path) iteration_offset = int( ckpt.model_checkpoint_path.split('-')[-1].split('.')[0]) except: print('Failed to load. 
Starting from scratch') sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) iteration_offset = 0 else: sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) iteration_offset = 0 summary_writers[0].add_graph(sess.graph) ################################################################ # Train Loop ################################################################ tic = time.time() total_timesteps = 0 while not all([ network.buffer.full(N=int(buffer_size / 2.)) for network in networks ]): networks[0].buffer.games_played += 1 print 'Game number: %s. Buffer_sizes: %s' % ( networks[0].buffer.games_played, [network.buffer.buffer_size for network in networks]) obs = env.reset() done_n = np.array([False] * env.n_actors) steps = 0 length_alive = np.array([0] * env.n_actors) viewer = None while not done_n.all(): length_alive[env.world.idxs_of_alive_snakes] += 1 last_obs = obs acts = [] for i, network in enumerate(networks): act = network.greedy_select( np.array([[x.A for x in get_data(last_obs, i)]]), 1.) acts += [str(act[0])] # Next step obs, reward_n, done_n = env.step(acts) steps += 1 for i in env.world.idxs_of_alive_snakes: priority = networks[i].get_error( np.array(get_data(last_obs, i)), np.array(acts[i]), np.array(reward_n[i]), np.array(get_data(obs, i)), np.array(done_n[i])) networks[i].store( np.array(get_data(last_obs, i)), # state np.array(acts[i]), # action np.array(reward_n[i]), #rewards np.array(get_data(obs, i)), #new state np.array(done_n[i]), #done priority=priority) # networks[i].store_reservoir(np.array(get_data(last_obs, i)), # state # np.array(int(acts[i]))) # terminate the collection of data if the controller shows stability # for a long time. This is a good thing. if steps > maximum_number_of_steps: done_n[:] = True print 'Filled Buffer' to_learn = np.array([0] * env.n_actors) frames_seen = np.array([0] * env.n_actors) for iteration in range(iteration_offset, iteration_offset + iterations + 1): print('{0} Iteration {1} {0}'.format('*' * 10, iteration)) networks[0].buffer.soft_reset() timesteps_in_iteration = 0 if (iteration % update_freq == 0): saver.save( sess, os.path.join(logdir, 'model-' + str(iteration) + '.cptk')) print "Saved Model. Timestep count: %s" % iteration total_number_of_steps_in_iteration = 0 total_reward = np.array([0] * env.n_actors) while True: networks[0].buffer.games_played += 1 if (((networks[0].buffer.games_played) % 10) == 0): print 'Epoch: %s. 
Game number: %s' % ( iteration, networks[0].buffer.games_played) obs = env.reset() # raw_observations = [] # raw_observations.append(np.array(obs)) animate_episode = ((networks[0].buffer.games_played - 1) == 0) and (iteration % update_freq == 0) and animate done_n = np.array([False] * env.n_actors) steps = 0 # Runs policy, collects observations and rewards viewer = None length_alive = np.array([0] * env.n_actors) game_time = time.time() action_times = [] learn_times = [] select_from_average = np.array([True] * env.n_actors) for idx in range(select_from_average.shape[0]): r = np.random.uniform() eta = eta_schedule.value(iteration) if (eta > 0) and (r <= eta): select_from_average[idx] = False # Sample from greedy while not done_n.all(): if animate_episode: if (not viewer) and (not headless): from gym.envs.classic_control import rendering viewer = rendering.SimpleImageViewer() rgb = env.render('rgb_array', headless=headless) scaler = 10 rgb = repeat_upsample(rgb, scaler, scaler) if not headless: viewer.imshow(rgb) time.sleep(.01) monitor.add(rgb, iteration, networks[0].buffer.games_played) length_alive[env.world.idxs_of_alive_snakes] += 1 to_learn[env.world.idxs_of_alive_snakes] += 1 # ob = get_data(np.array(raw_observations)[-2:]) last_obs = obs # Control the exploration acts = [] action_time = time.time() for i, network in enumerate(networks): if env.world.snakes[i].alive: act = network.select_from_policy( np.array([[x.A for x in get_data(last_obs, i)]]), epsilon_schedule.value(iteration), select_from_average[i]) acts += [str(act[0])] else: acts += [str(0)] action_times.append(time.time() - action_time) # Next step obs, reward_n, done_n = env.step(acts) total_reward += np.array(reward_n) total_number_of_steps_in_iteration += 1 steps += 1 for i in env.world.idxs_of_alive_snakes: priority = networks[i].get_error( np.array(get_data(last_obs, i)), np.array(acts[i]), np.array(reward_n[i]), np.array(get_data(obs, i)), np.array(done_n[i])) networks[i].store( np.array(get_data(last_obs, i)), # state np.array(acts[i]), # action np.array(reward_n[i]), #rewards np.array(get_data(obs, i)), #new state np.array(done_n[i]), #done priority=priority) if not select_from_average[i]: networks[i].store_reservoir( np.array(get_data(last_obs, i)), # state np.array(int(acts[i]))) # max: to cover all new steps added to buffer, min: to not overdo too much learn_time = time.time() for network_id in [ x for x in range(len(to_learn)) if to_learn[x] >= max( networks[x].batch_size, networks[x].avg_policy_batch_size) ]: to_learn[network_id] = 0 network = networks[network_id] for _ in range(5): frames_seen[network_id] += networks[ network_id].batch_size if use_priority: network.train_step(learning_rate_schedule, beta_schedule) else: network.train_step(learning_rate_schedule) for _ in range(5): if network.reservoir.buffer_size > 0: network.avg_policy_train_step( policy_learning_rate_schedule) learn_times.append(time.time() - learn_time) # terminate the collection of data if the controller shows stability # for a long time. This is a good thing. 
if steps > maximum_number_of_steps: done_n[:] = True if viewer: viewer.close() if networks[0].buffer.games_played >= 1: break game_time = time.time() - game_time monitor.make_gifs(iteration) for count, writer in enumerate(summary_writers[:-1]): summary = tf.Summary() summary.value.add(tag='Average Reward', simple_value=(total_reward[count])) summary.value.add(tag='Steps Taken', simple_value=(length_alive[count])) summary.value.add(tag='Frames Seen', simple_value=frames_seen[count]) writer.add_summary(summary, iteration) writer.flush() summary = tf.Summary() summary.value.add(tag='Time Elapsed/Game', simple_value=game_time) summary.value.add(tag='Time Elapsed/Total Actions', simple_value=np.sum(action_times)) summary.value.add(tag='Time Elapsed/Mean Actions', simple_value=np.mean(action_times)) summary.value.add(tag='Time Elapsed/Max Actions', simple_value=np.max(action_times)) summary.value.add(tag='Time Elapsed/Min Actions', simple_value=np.min(action_times)) summary.value.add(tag='Time Elapsed/Total Learn', simple_value=np.sum(learn_times)) summary.value.add(tag='Time Elapsed/Mean Learn', simple_value=np.mean(learn_times)) summary.value.add(tag='Time Elapsed/Max Learn', simple_value=np.max(learn_times)) summary.value.add(tag='Time Elapsed/Min Learn', simple_value=np.min(learn_times)) summary_writers[-1].add_summary(summary, iteration) summary_writers[-1].flush() print game_time, sum(action_times), sum(learn_times)
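################################################################
# SKETCH: NFSP-style policy mixing used at the start of each game
# Before every game the loop above decides, per snake, whether to act with the
# epsilon-greedy best response (probability eta from eta_schedule) or with the
# average policy trained from the reservoir buffer.  The helper below restates
# that anticipatory mixing; the function name and rng argument are
# illustrative, not the project's API.
################################################################
import numpy as np

def choose_policy_modes(n_agents, eta, rng=np.random):
    # True -> sample from the average policy, False -> act greedily.
    return rng.uniform(size=n_agents) > eta

_modes = choose_policy_modes(n_agents=4, eta=0.4)
# With eta = 0.4, roughly 40% of agents play their best response this game.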
def fit( env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None ): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model model = DeepDQN() sess = model.init_session().__enter__() # capture the shape outside the closure so that the env object is # not serialized by cloudpickle when serializing make_obs_ph def make_obs_ph(name): return ObservationInput(env.observation_space, name=name) act, train, update_target, debug = model.build_train( make_obs_ph, q_func, env.action_space.n, tf.train.AdamOptimizer(learning_rate=lr), 10, gamma, param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule( schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. model.init_vars() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: model.load_state(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence # between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with # eps = exploration.value(t). See Appendix C.1 in # Parameter Space Noise for Exploration, Plappert et # al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = \ update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act( np.array(obs)[None], update_eps=update_eps, **kwargs )[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t) ) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = \ replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train( obses_t, actions, rewards, obses_tp1, dones, weights ) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities( batch_idxes, new_priorities ) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}". format(saved_mean_reward, mean_100ep_reward) ) model.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward) ) model.load_state(model_file) return act
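################################################################
# SKETCH: parameter-noise threshold used in fit()
# When param_noise is enabled, fit() sets the KL threshold so that
# perturbed-policy exploration is comparable to epsilon-greedy with the
# current epsilon (Plappert et al., 2017, Appendix C.1).  The helper below
# simply restates that formula.
################################################################
import numpy as np

def param_noise_threshold(epsilon, num_actions):
    return -np.log(1.0 - epsilon + epsilon / float(num_actions))

# Example: epsilon = 0.1 with 6 actions gives a threshold of about 0.087.
assert np.isclose(param_noise_threshold(0.1, 6),
                  -np.log(1.0 - 0.1 + 0.1 / 6.0))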
def train_DQNs(sess, DQNs, spec_params, tester, curriculum, show_print, render):
    # Initializing parameters
    dqns = DQNs[spec_params.ltl_spec]
    training_params = tester.training_params
    testing_params = tester.testing_params
    env = Game(spec_params)
    obs_proxy = Obs_Proxy(env)
    agents = env.agents
    action_set = env.get_actions(agents[0])  # NOTE: only if they all have the same action set
    # All the agents have the same observation
    num_features = len(obs_proxy.get_observation(env, env.agents[0]))
    max_steps = training_params.max_timesteps_per_spec

    Replay_buffers = {}
    for agent in agents:
        Replay_buffers[str(agent)] = IDQNReplayBuffer(training_params.replay_size)

    exploration = LinearSchedule(
        schedule_timesteps=int(training_params.exploration_frac * max_steps),
        initial_p=1.0,
        final_p=training_params.final_exploration)

    training_reward = 0
    last_ep_rew = 0
    episode_count = 0  # episode counter
    rew_batch = np.zeros(100)

    if show_print:
        print("Executing ", max_steps, " steps...")
    if render:
        env.show_map()

    # We start iterating with the environment
    for t in range(max_steps):
        actions = []
        for agent, dqn in zip(agents.values(), dqns.values()):
            # Getting the current state and ltl goal
            s1 = obs_proxy.get_observation(env, agent)
            # Choosing an action to perform
            if random.random() < exploration.value(t):
                act = random.choice(action_set)  # take random actions
            else:
                act = Actions(dqn.get_best_action(s1.reshape((1, num_features))))
            # print("Act", act)
            actions.append(act)
            dqn.add_step()

        # updating the curriculum
        curriculum.add_step()

        # Executing the action
        reward = env.execute_actions(actions)
        if render and episode_count % 30 == 0:  # was `ep_c % 30 is 0`; animate every 30th episode
            time.sleep(0.01)
            clear_screen()
            env.show_map()
        training_reward += reward

        for agent, dqn, act in zip(agents.values(), dqns.values(), actions):
            # Saving this transition (s1 comes from the action-selection loop;
            # the note above says all agents share the same observation)
            s2 = obs_proxy.get_observation(env, agent)  # adding the DFA state to the features
            done = env.ltl_game_over or env.env_game_over
            dqn.save_transition(s1, act, reward, s2, done)
            # Learning
            if dqn.get_steps() > training_params.learning_starts and \
                    dqn.get_steps() % training_params.values_network_update_freq == 0:
                dqn.learn()
            # Updating the target network
            if dqn.get_steps() > training_params.learning_starts and \
                    dqn.get_steps() % training_params.target_network_update_freq == 0:
                dqn.update_target_network()

        # Printing
        if show_print and (dqns['0'].get_steps() + 1) % training_params.print_freq == 0:
            print("Step:", dqns['0'].get_steps() + 1,
                  "\tTotal reward:", last_ep_rew,
                  "\tSucc rate:", "%0.3f" % curriculum.get_succ_rate(),
                  "\tNumber of episodes:", episode_count)

        # Testing
        if testing_params.test and (curriculum.get_current_step() % testing_params.test_freq == 0):
            tester.run_test(curriculum.get_current_step(), sess, _test_DQN, DQNs)

        # Restarting the environment (Game Over)
        if done:
            # Game over occurs for one of three reasons:
            # 1) DFA reached a terminal state,
            # 2) DFA reached a deadend, or
            # 3) The agent reached an environment deadend (e.g. a PIT)
            # Restarting
            env = Game(spec_params)
            obs_proxy = Obs_Proxy(env)
            agents = env.agents
            rew_batch[episode_count % 100] = training_reward
            episode_count += 1
            last_ep_rew = training_reward
            training_reward = 0

        # updating the hit rates
        curriculum.update_succ_rate(t, reward)

        # Uncomment if want to stop learning according to succ. rate
        # if curriculum.stop_spec(t):
        #     last_ep_rew = 0
        #     if show_print: print("STOP SPEC!!!")
        #     break

        # checking the steps time-out
        if curriculum.stop_learning():
            if show_print:
                print("STOP LEARNING!!!")
            break

    if show_print:
        print("Done! Last reward:", last_ep_rew)
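################################################################
# SKETCH: per-agent epsilon-greedy rule used in train_DQNs
# Each i-DQN agent above either samples a random action from the shared
# action_set with probability exploration.value(t), or queries its network
# for the greedy action.  The helper below restates that rule; the
# greedy_action_fn callable is an illustrative stand-in for
# dqn.get_best_action on the reshaped observation.
################################################################
import random

def epsilon_greedy(action_set, greedy_action_fn, epsilon):
    if random.random() < epsilon:
        return random.choice(action_set)
    return greedy_action_fn()

# Example with a dummy greedy policy that always prefers action 2.
_action = epsilon_greedy([0, 1, 2, 3], lambda: 2, epsilon=0.1)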
class CategoricalDQNAgent(BaseAgent): def __init__(self, game, sess, nb_actions, global_step): BaseAgent.__init__(self, game, sess, nb_actions, global_step) self.name = "CategoricalDQN_agent" self.model_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.algorithm) self.support = np.linspace(FLAGS.v_min, FLAGS.v_max, FLAGS.nb_atoms) self.delta_z = (FLAGS.v_max - FLAGS.v_min) / (FLAGS.nb_atoms - 1) self.episode_rewards = [] self.episode_lengths = [] self.episode_mean_values = [] self.episode_max_values = [] self.episode_min_values = [] self.episode_mean_returns = [] self.episode_max_returns = [] self.episode_min_returns = [] self.exploration = LinearSchedule(FLAGS.explore_steps, FLAGS.final_random_action_prob, FLAGS.initial_random_action_prob) self.summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir, FLAGS.algorithm)) self.summary = tf.Summary() self.q_net = CategoricalDQNetwork(nb_actions, 'orig') self.target_net = CategoricalDQNetwork(nb_actions, 'target') self.targetOps = self.update_target_graph('orig', 'target') self.probability_of_random_action = self.exploration.value(0) def train(self): minibatch = random.sample(self.episode_buffer, FLAGS.batch_size) rollout = np.array(minibatch) observations = rollout[:, 0] actions = rollout[:, 1] rewards = rollout[:, 2] next_observations = rollout[:, 3] done = rollout[:, 4] # Compute target distribution of Q(s_,a) target_probs_reprojected = self.get_target_distribution(rewards, done, next_observations, observations, actions) feed_dict = {self.q_net.target_q: target_probs_reprojected, self.q_net.inputs: np.stack(observations, axis=0), self.q_net.actions: actions} l, _, ms, img_summ, q, q_distrib = self.sess.run( [self.q_net.action_value_loss, self.q_net.train_op, self.q_net.merged_summary, self.q_net.image_summaries, self.q_net.action_value, self.q_net.action_values_soft], feed_dict=feed_dict) # self.updateTarget() return l / len(rollout), ms, img_summ def updateTarget(self): for op in self.targetOps: self.sess.run(op) def eval(self, saver): self.saver = saver total_steps = 0 episode_rewards = [] print("Starting eval agent") with self.sess.as_default(), self.graph.as_default(): while total_steps < FLAGS.test_episodes: episode_reward = 0 episode_step_count = 0 d = False s = self.env.get_initial_state() while not d: a = self.policy_evaluation_eval(s) s1, r, d, info = self.env.step(a) r = np.clip(r, -1, 1) episode_reward += r episode_step_count += 1 s = s1 print("Episode reward was {}".format(episode_reward)) episode_rewards.append(episode_reward) total_steps += 1 print("Mean reward is {}".format(np.mean(np.asarray(episode_rewards)))) def play(self, saver): self.saver = saver train_stats = None # self.episode_count = self.sess.run(self.global_episode) self.total_steps = self.sess.run(self.global_step) if self.total_steps == 0: self.updateTarget() print("Starting agent") _t = {'episode': Timer(), "step": Timer()} with self.sess.as_default(), self.graph.as_default(): while self.total_steps < FLAGS.max_total_steps: _t["episode"].tic() if self.total_steps % FLAGS.target_update_freq == 0: self.updateTarget() episode_reward = 0 episode_step_count = 0 q_values = [] d = False # self.probability_of_random_action = self.exploration.value(self.total_steps) s = self.env.get_initial_state() while not d: _t["step"].tic() a, max_action_values_evaled = self.policy_evaluation(s) if max_action_values_evaled is None: q_values.append(0) else: q_values.append(max_action_values_evaled) s1, r, d, info = self.env.step(a) r = np.clip(r, -1, 1) episode_reward 
+= r episode_step_count += 1 self.total_steps += 1 self.episode_buffer.append([s, a, r, s1, d]) s = s1 if len(self.episode_buffer) == FLAGS.memory_size: self.episode_buffer.popleft() if self.total_steps > FLAGS.observation_steps and len( self.episode_buffer) > FLAGS.observation_steps and self.total_steps % FLAGS.update_freq == 0: l, ms, img_summ = self.train() train_stats = l, ms, img_summ _t["step"].toc() self.sess.run(self.increment_global_step) self.add_summary(episode_reward, episode_step_count, q_values, train_stats) if self.total_steps % FLAGS.eval_interval == 0: self.evaluate_episode() # self.sess.run(self.increment_global_episode) _t["episode"].toc() print('Avg time per step is {:.3f}'.format(_t["step"].average_time())) print('Avg time per episode is {:.3f}'.format(_t["episode"].average_time())) # fps = self.total_steps / _t['Total'].duration # print('Average time per episod is {}'.format(_t['episode'].average_time)) def add_summary(self, episode_reward, episode_step_count, q_values, train_stats): self.episode_rewards.append(episode_reward) self.episode_lengths.append(episode_step_count) if len(q_values): self.episode_mean_values.append(np.mean(np.asarray(q_values))) self.episode_max_values.append(np.max(np.asarray(q_values))) self.episode_min_values.append(np.min(np.asarray(q_values))) if self.total_steps % FLAGS.summary_interval == 0 and self.total_steps != 0 and self.total_steps > FLAGS.observation_steps: if self.total_steps % FLAGS.checkpoint_interval == 0: self.save_model(self.saver, self.total_steps) l, ms, img_summ = train_stats # self.episode_mean_returns.append(np.mean(np.asarray(returns))) # self.episode_max_returns.append(np.max(np.asarray(returns))) # self.episode_min_returns.append(np.min(np.asarray(returns))) mean_reward = np.mean(self.episode_rewards[-FLAGS.summary_interval:]) mean_length = np.mean(self.episode_lengths[-FLAGS.summary_interval:]) mean_value = np.mean(self.episode_mean_values[-FLAGS.summary_interval:]) max_value = np.mean(self.episode_max_values[-FLAGS.summary_interval:]) min_value = np.mean(self.episode_min_values[-FLAGS.summary_interval:]) # if episode_count % FLAGS.test_performance_interval == 0: # won_games = self.episode_rewards[-FLAGS.test_performance_interval:].count(1) # self.summary.value.add(tag='Perf/Won Games/1000', simple_value=float(won_games)) self.summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward)) self.summary.value.add(tag='Perf/Length', simple_value=float(mean_length)) self.summary.value.add(tag='Perf/Value_Mean', simple_value=float(mean_value)) self.summary.value.add(tag='Perf/Value_Max', simple_value=float(max_value)) self.summary.value.add(tag='Perf/Value_Min', simple_value=float(min_value)) # self.summary.value.add(tag='Perf/Return_Mean', simple_value=float(mean_return)) # self.summary.value.add(tag='Perf/Return_Max', simple_value=float(max_return)) # self.summary.value.add(tag='Perf/Return_Min', simple_value=float(min_return)) self.summary.value.add(tag='Perf/Probability_random_action', simple_value=float(self.probability_of_random_action)) self.summary.value.add(tag='Losses/Loss', simple_value=float(l)) self.write_summary(ms, img_summ) def policy_evaluation(self, s): q = None self.probability_of_random_action = self.exploration.value(self.total_steps) if random.random() <= self.probability_of_random_action: a = np.random.choice(range(len(self.env.gym_actions))) else: feed_dict = {self.q_net.inputs: [s]} probs = self.sess.run(self.q_net.action_values_soft, feed_dict=feed_dict)[0] q = np.sum( np.multiply(probs, 
np.tile(np.expand_dims(self.support, 0), [self.nb_actions, 1])), 1) a = np.argmax(q) # a_one_hot = np.zeros(shape=(self.q_net.nb_actions, FLAGS.nb_atoms), dtype=np.int32) # a_one_hot[a] = 1 # p_a_star = np.sum(np.multiply(probs, a_one_hot), 0) # import matplotlib.pyplot as plt # ax = plt.subplot(111) # p1 = ax.step(self.support, p_a_star, color='blue') # # p2 = ax.step(skewed_support[0], p_a_star[0], color='magenta') # # p3 = ax.step(bellman[0], p_a_star[0], color='green') # # p4 = ax.step(self.support, m[0], color='red') # ax.autoscale(tight=True) # # plt.show() return a, np.max(q) def policy_evaluation_eval(self, s): feed_dict = {self.q_net.inputs: [s]} action_values_evaled = self.sess.run(self.q_net.action_values_soft, feed_dict=feed_dict)[0] action_values_q = np.sum( np.multiply(action_values_evaled, np.tile(np.expand_dims(self.support, 0), [self.nb_actions, 1])), 1) a = np.argmax(action_values_q) a_one_hot = np.zeros(shape=(self.q_net.nb_actions, FLAGS.nb_atoms), dtype=np.int32) a_one_hot[a] = 1 p_a_star = np.sum(np.multiply(action_values_evaled, a_one_hot), 0) # import matplotlib.pyplot as plt # ax = plt.subplot(111) # p1 = ax.step(self.support, p_a_star, color='blue') # # p2 = ax.step(skewed_support[0], p_a_star[0], color='magenta') # # p3 = ax.step(bellman[0], p_a_star[0], color='green') # # p4 = ax.step(self.support, m[0], color='red') # ax.autoscale(tight=True) # # plt.show() return a def get_target_distribution(self, rewards, done, next_observations, observations, actions): target_probs, probs = self.sess.run([self.target_net.action_values_soft, self.q_net.action_values_soft], feed_dict={ self.target_net.inputs: np.stack(next_observations, axis=0), self.q_net.inputs: np.stack(observations, axis=0) }) # a_one_hot = np.zeros(shape=(FLAGS.batch_size, self.q_net.nb_actions, FLAGS.nb_atoms), dtype=np.int32) # a_one_hot[np.arange(FLAGS.batch_size), np.asarray(actions, dtype=np.int32)] = 1 # pt_a_star = np.sum(np.multiply(action_values_evaled, a_one_hot), axis=1) target_q = np.sum( target_probs * np.tile(np.expand_dims(np.expand_dims(self.support, 0), 0), [FLAGS.batch_size, self.q_net.nb_actions, 1]), 2) target_a = np.argmax(target_q, axis=1) # target_a = np.tile(np.expand_dims(np.expand_dims(target_a, 1), 2), [1, 1, FLAGS.nb_atoms]) target_a_one_hot = np.zeros(shape=(FLAGS.batch_size, self.q_net.nb_actions, FLAGS.nb_atoms), dtype=np.int32) # target_a_one_hot[np.arange(FLAGS.batch_size), target_a] = 1 target_a_one_hot[np.arange(FLAGS.batch_size), target_a] = 1 # target_a_one_hot = np.tile(np.expand_dims(target_a_one_hot, 2), [1, 1, FLAGS.nb_atoms]) # a_one_hot = np.reshape(a_one_hot, (FLAGS.batch_size, self.q_net.nb_actions, FLAGS.nb_atoms)) # p_a_star = np.squeeze(np.take(target_probs, target_a), 1) p_a_star = np.sum(np.multiply(target_probs, target_a_one_hot), axis=1) rewards = np.tile(np.expand_dims(np.asarray(rewards, dtype=np.float32), 1), [1, FLAGS.nb_atoms]) gamma = np.tile(np.expand_dims(np.asarray(np.logical_not(done), dtype=np.int32) * FLAGS.gamma, 1), [1, FLAGS.nb_atoms]) # Compute projection of the application of the Bellman operator. 
skewed_support = gamma * np.tile(np.expand_dims(self.support, 0), [FLAGS.batch_size, 1]) bellman = rewards + skewed_support bellman = np.clip(bellman, FLAGS.v_min, FLAGS.v_max) # Compute categorical indices for distributing the probability m = np.zeros(shape=(FLAGS.batch_size, FLAGS.nb_atoms)) b = (bellman - FLAGS.v_min) / self.delta_z l = np.asarray(np.floor(b), dtype=np.int32) u = np.asarray(np.ceil(b), dtype=np.int32) # Distribute probability # for j in range(FLAGS.nb_atoms): # m[:, l[:, j]] += target_actionv_values_evaled_max[:, j] * (u[:, j] - b[:, j]) # m[:, u[:, j]] += target_actionv_values_evaled_max[:, j] * (b[:, j] - l[:, j]) for i in range(FLAGS.batch_size): for j in range(FLAGS.nb_atoms): uidx = u[i][j] lidx = l[i][j] m[i][lidx] = m[i][lidx] + p_a_star[i][j] * (uidx - b[i][j]) m[i][uidx] = m[i][uidx] + p_a_star[i][j] * (b[i][j] - lidx) # if self.total_steps > FLAGS.explore_steps: # import matplotlib.pyplot as plt # ax = plt.subplot(111) # # p1 = ax.step(self.support, p_a_star[0], color='blue') # # p2 = ax.step(skewed_support[0], p_a_star[0], color='magenta') # # p3 = ax.step(bellman[0], p_a_star[0], color='green') # # p4 = ax.step(self.support, m[0], color='red') # p4 = ax.step(self.support, pt_a_star[1], color='cyan') # ax.autoscale(tight=True) # # plt.show() return m def evaluate_episode(self): episode_reward = 0 episode_step_count = 0 d = False s = self.env.get_initial_state() while not d: a = self.policy_evaluation_eval(s) s1, r, d, info = self.env.step(a) r = np.clip(r, -1, 1) episode_reward += r episode_step_count += 1 s = s1 print("Episode reward was {}".format(episode_reward))
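# The double loop in get_target_distribution above splits each target atom's
# probability p_a_star[:, j] between the two support atoms that bracket the
# clipped Bellman value. The same projection can be written in vectorized
# NumPy -- a sketch mirroring that loop, with shapes (batch, nb_atoms) assumed
# and an illustrative function name that is not part of the original code:
import numpy as np

def project_distribution(p_a_star, rewards, gammas, support, v_min, v_max, delta_z):
    # Bellman-updated atom locations, clipped to the support range.
    bellman = np.clip(rewards[:, None] + gammas[:, None] * support[None, :],
                      v_min, v_max)
    b = (bellman - v_min) / delta_z          # fractional atom index
    l = np.floor(b).astype(np.int64)         # lower neighbouring atom
    u = np.ceil(b).astype(np.int64)          # upper neighbouring atom
    m = np.zeros_like(p_a_star)
    rows = np.arange(p_a_star.shape[0])[:, None]
    np.add.at(m, (rows, l), p_a_star * (u - b))   # mass given to the lower atom
    np.add.at(m, (rows, u), p_a_star * (b - l))   # mass given to the upper atom
    # Caveat shared with the loop version: when b lands exactly on an atom
    # (l == u), both contributions are zero and that atom's mass is dropped.
    return m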
def train(): start_time = time.time() if seed is not None: set_seed(seed) # setup env envs = EnvWrapper(num_processes, simulator, env, env_config, planner_config) # setup agent agent = createAgent() if load_model_pre: agent.loadModel(load_model_pre) agent.train() # logging simulator_str = copy.copy(simulator) if simulator == 'pybullet': simulator_str += ('_' + robot) log_dir = os.path.join(log_pre, '{}'.format(alg)) if note: log_dir += '_' log_dir += note logger = Logger(log_dir, env, 'train', num_processes, max_episode, log_sub) hyper_parameters['model_shape'] = agent.getModelStr() logger.saveParameters(hyper_parameters) if buffer_type == 'expert': replay_buffer = QLearningBufferExpert(buffer_size) else: replay_buffer = QLearningBuffer(buffer_size) exploration = LinearSchedule(schedule_timesteps=explore, initial_p=init_eps, final_p=final_eps) states, in_hands, obs = envs.reset() if load_sub: logger.loadCheckPoint(os.path.join(log_dir, load_sub, 'checkpoint'), envs, agent, replay_buffer) # pre train if load_buffer is not None and not load_sub: logger.loadBuffer(replay_buffer, load_buffer, load_n) if pre_train_step > 0: pbar = tqdm(total=pre_train_step) while len(logger.losses) < pre_train_step: t0 = time.time() train_step(agent, replay_buffer, logger) if logger.num_training_steps % 1000 == 0: logger.saveLossCurve(100) logger.saveTdErrorCurve(100) if not no_bar: pbar.set_description('loss: {:.3f}, time: {:.2f}'.format( float(logger.getCurrentLoss()), time.time() - t0)) pbar.update(len(logger.losses) - pbar.n) if (time.time() - start_time) / 3600 > time_limit: logger.saveCheckPoint(args, envs, agent, replay_buffer) exit(0) pbar.close() logger.saveModel(0, 'pretrain', agent) # agent.sl = sl if not no_bar: pbar = tqdm(total=max_episode) pbar.set_description( 'Episodes:0; Reward:0.0; Explore:0.0; Loss:0.0; Time:0.0') timer_start = time.time() obs = obs.permute(0, 3, 1, 2) in_hands = in_hands.permute(0, 3, 1, 2) while logger.num_episodes < max_episode: # add noise if perlin: addPerlinNoiseToObs(obs, perlin) addPerlinNoiseToInHand(in_hands, perlin) if fixed_eps: if logger.num_episodes < planner_episode: eps = 1 else: eps = final_eps else: eps = exploration.value(logger.num_episodes) if planner_episode > logger.num_episodes: if np.random.random() < eps: is_expert = 1 plan_actions = envs.getNextAction() actions_star_idx, actions_star = agent.getActionFromPlan( plan_actions) else: is_expert = 0 q_value_maps, actions_star_idx, actions_star = agent.getEGreedyActions( states, in_hands, obs, final_eps) else: is_expert = 0 q_value_maps, actions_star_idx, actions_star = agent.getEGreedyActions( states, in_hands, obs, eps) if alg.find('dagger') >= 0: plan_actions = envs.getNextAction() planner_actions_star_idx, planner_actions_star = agent.getActionFromPlan( plan_actions) buffer_obs = getCurrentObs(in_hands, obs) actions_star = torch.cat((actions_star, states.unsqueeze(1)), dim=1) envs.stepAsync(actions_star, auto_reset=False) if len(replay_buffer) >= training_offset: for training_iter in range(training_iters): train_step(agent, replay_buffer, logger) states_, in_hands_, obs_, rewards, dones = envs.stepWait() steps_lefts = envs.getStepLeft() obs_ = obs_.permute(0, 3, 1, 2) in_hands_ = in_hands_.permute(0, 3, 1, 2) done_idxes = torch.nonzero(dones).squeeze(1) if done_idxes.shape[0] != 0: reset_states_, reset_in_hands_, reset_obs_ = envs.reset_envs( done_idxes) reset_obs_ = reset_obs_.permute(0, 3, 1, 2) reset_in_hands_ = reset_in_hands_.permute(0, 3, 1, 2) for j, idx in enumerate(done_idxes): states_[idx] = 
reset_states_[j] in_hands_[idx] = reset_in_hands_[j] obs_[idx] = reset_obs_[j] buffer_obs_ = getCurrentObs(in_hands_, obs_) for i in range(num_processes): if alg.find('dagger') >= 0: replay_buffer.add( ExpertTransition(states[i], buffer_obs[i], planner_actions_star_idx[i], rewards[i], states_[i], buffer_obs_[i], dones[i], steps_lefts[i], torch.tensor(is_expert))) else: replay_buffer.add( ExpertTransition(states[i], buffer_obs[i], actions_star_idx[i], rewards[i], states_[i], buffer_obs_[i], dones[i], steps_lefts[i], torch.tensor(is_expert))) logger.stepBookkeeping(rewards.numpy(), steps_lefts.numpy(), dones.numpy()) states = copy.copy(states_) obs = copy.copy(obs_) in_hands = copy.copy(in_hands_) if (time.time() - start_time) / 3600 > time_limit: break if not no_bar: timer_final = time.time() description = 'Steps:{}; Reward:{:.03f}; Explore:{:.02f}; Loss:{:.03f}; Time:{:.03f}'.format( logger.num_steps, logger.getCurrentAvgReward(1000), eps, float(logger.getCurrentLoss()), timer_final - timer_start) pbar.set_description(description) timer_start = timer_final pbar.update(logger.num_episodes - pbar.n) logger.num_steps += num_processes if logger.num_steps % (num_processes * save_freq) == 0: saveModelAndInfo(logger, agent) saveModelAndInfo(logger, agent) logger.saveCheckPoint(args, envs, agent, replay_buffer) envs.close()
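# Each transition stored above carries nine fields, including the planner flag
# is_expert. A hypothetical definition that matches the positional arguments
# passed to replay_buffer.add() in the loop above (the project's real
# ExpertTransition may name or order its fields differently):
from collections import namedtuple

ExpertTransition = namedtuple(
    'ExpertTransition',
    ['state', 'obs', 'action_idx', 'reward',
     'next_state', 'next_obs', 'done', 'step_left', 'expert'])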
def _run_ILPOPL(sess, policy_banks, spec_params, tester, curriculum, show_print, render):
    # Initializing parameters
    training_params = tester.training_params
    testing_params = tester.testing_params

    # Initializing the game
    env = Game(spec_params)
    agents = env.agents
    action_set = env.get_actions(agents[0])

    # Initializing experience replay buffers
    replay_buffers = {}
    for agent in range(env.n_agents):
        replay_buffers[str(agent)] = ReplayBuffer(training_params.replay_size)

    # Initializing parameters
    num_features = len(env.get_observation(agents[0]))
    max_steps = training_params.max_timesteps_per_spec
    exploration = LinearSchedule(
        schedule_timesteps=int(training_params.exploration_frac * max_steps),
        initial_p=1.0,
        final_p=training_params.final_exploration)
    last_ep_rew = 0
    training_reward = 0
    episode_count = 0

    # Starting interaction with the environment
    if show_print:
        print("Executing", max_steps, "actions...")
    if render:
        env.show_map()

    # We start iterating with the environment
    for t in range(max_steps):
        # Getting the current state and ltl goal
        actions = []
        ltl_goal = env.get_LTL_goal()
        for agent, policy_bank in zip(agents.values(), policy_banks.values()):
            s1 = env.get_observation(agent)
            # Choosing an action to perform
            if random.random() < exploration.value(t):
                act = random.choice(action_set)
            else:
                act = Actions(policy_bank.get_best_action(ltl_goal, s1.reshape((1, num_features))))
            actions.append(act)

        # updating the curriculum
        curriculum.add_step()

        # Executing the actions
        reward = env.execute_actions(actions)
        training_reward += reward
        if render and episode_count % 30 == 0:
            time.sleep(0.01)
            clear_screen()
            env.show_map()

        true_props = []
        for agent in agents.values():
            true_props.append(env.get_true_propositions(agent))

        # Saving this transition
        for agent, policy_bank, replay_buffer, act in zip(
                agents.values(), policy_banks.values(), replay_buffers.values(), actions):
            s2 = env.get_observation(agent)
            next_goals = np.zeros((policy_bank.get_number_LTL_policies(),), dtype=np.float64)
            for ltl in policy_bank.get_LTL_policies():
                ltl_id = policy_bank.get_id(ltl)
                if env.env_game_over:
                    # env deadends are equivalent to reaching the 'False' formula
                    ltl_next_id = policy_bank.get_id("False")
                else:
                    for props in true_props:
                        ltl_next_id = policy_bank.get_id(
                            policy_bank.get_policy_next_LTL(ltl, props))
                next_goals[ltl_id - 2] = ltl_next_id
            replay_buffer.add(s1, act.value, s2, next_goals)

            # Learning
            if curriculum.get_current_step() > training_params.learning_starts \
                    and curriculum.get_current_step() % training_params.values_network_update_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled
                # from the replay buffer.
                S1, A, S2, Goal = replay_buffer.sample(training_params.batch_size)
                policy_bank.learn(S1, A, S2, Goal)

            # Updating the target network
            if curriculum.get_current_step() > training_params.learning_starts \
                    and curriculum.get_current_step() % training_params.target_network_update_freq == 0:
                # Update target network periodically.
                policy_bank.update_target_network()

        # Printing
        if show_print and (curriculum.get_current_step() + 1) % training_params.print_freq == 0:
            print("Step:", curriculum.get_current_step() + 1,
                  "\tLast episode reward:", last_ep_rew,
                  "\tSucc rate:", "%0.3f" % curriculum.get_succ_rate(),
                  "\tNumber of episodes:", episode_count)

        # Testing
        if testing_params.test and curriculum.get_current_step() % testing_params.test_freq == 0:
            tester.run_test(curriculum.get_current_step(), sess, _test_ILPOPL,
                            policy_banks, num_features)

        # Restarting the environment (Game Over)
        if env.ltl_game_over or env.env_game_over:
            # NOTE: Game over occurs for one of three reasons:
            # 1) the DFA reached a terminal state,
            # 2) the DFA reached a deadend, or
            # 3) the agent reached an environment deadend (e.g. a PIT)
            env = Game(spec_params)  # Restarting
            agents = env.agents
            episode_count += 1
            last_ep_rew = training_reward
            training_reward = 0

        # updating the hit rates
        curriculum.update_succ_rate(t, reward)
        # Uncomment to stop learning according to the success rate
        # if curriculum.stop_spec(t):
        #     last_ep_rew = 0
        #     if show_print: print("STOP SPEC!!!")
        #     break

        # checking the steps time-out
        if curriculum.stop_learning():
            if show_print:
                print("STOP LEARNING!!!")
            break

    if show_print:
        print("Done! Last reward:", last_ep_rew)
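# Both the value-network update and the target-network sync above are gated the
# same way: wait until learning_starts steps have elapsed, then fire every
# update_freq steps. A tiny helper making that pattern explicit (illustrative
# names, not part of the original code):
def should_update(step, learning_starts, update_freq):
    """True once training has warmed up and `step` falls on the update period."""
    return step > learning_starts and step % update_freq == 0

# e.g. with learning_starts=1000 and update_freq=100:
assert not should_update(900, 1000, 100)    # still warming up
assert should_update(1100, 1000, 100)       # on the period, after warm-up
assert not should_update(1150, 1000, 100)   # off the period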
def main(): L.configure('/home/metalabadmin/exp/freeway', format_strs=['stdout', 'csv', 'tensorboard']) env = gym.make('Freeway-v0') env = wrapper.wrap_deepmind(env, frame_stack=True, scale=True) optimizer = tf.train.AdamOptimizer(learning_rate=0.0001) network = Q_network(env.observation_space, env.action_space.n, optimizer, gamma=0.99, scope='freeway') m_controller = MetaController(network, env.action_space.n) # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(0.1 * 1e7), initial_p=1.0, final_p=0.02) replay = ReplayBuffer(50000) # get default tf_session sess = U.get_session() U.initialize() sess.run(m_controller.network.update_target_op) step = 0 episodes = 0 rewards = 0 mean_100ep_reward = 0 total_reward = [] saved_mean_reward = None ob = env.reset() while step <= 1e7: ep = exploration.value(step) ob_reshaped = np.reshape(ob, (1, ) + env.observation_space.shape) act = m_controller.sample_act(sess, ob_reshaped, update_eps=ep)[0] ob_tp1, reward_t, done_t, info = env.step(act) env.render() rewards += reward_t replay.add(ob, act, reward_t, ob_tp1, float(done_t)) ob = ob_tp1 # train every 4 steps if step >= 1000 and step % 4 == 0: obs, acts, rewards_t, obs_tp1, dones_t = replay.sample(64) weights, batch_idxes = np.ones_like(rewards_t), None # get q estimate for tp1 as 'supervised' obs_tp1_reshaped = np.reshape(obs_tp1, (64, ) + env.observation_space.shape) q_tp1 = m_controller.get_q(sess, obs_tp1_reshaped)[0] td_error = m_controller.train(sess, obs, acts, rewards_t, obs_tp1, dones_t, weights, q_tp1) step += 1 if step >= 1000 and step % 1000 == 0: sess.run(m_controller.network.update_target_op) if done_t: ob = env.reset() total_reward.append(rewards) episodes += 1 rewards = 0 print('step %d done %s, ep %.2f' % (step, str(done_t), ep)) mean_100ep_reward = round(np.mean(total_reward[-101:-1]), 1) if episodes % 10 == 0 and episodes != 0: print('date time %s' % str(datetime.now())) L.record_tabular("steps", step) L.record_tabular("episodes", episodes) L.record_tabular("mean 100 episode reward", mean_100ep_reward) L.dump_tabular() if step % 1000 == 0: if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: L.log("Saving model due to mean reward increase: {} -> {}". format(saved_mean_reward, mean_100ep_reward)) U.save_variables('./freewaymodel.ckpt') model_saved = True saved_mean_reward = mean_100ep_reward
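# The constants hard-coded in main() above imply a concrete epsilon decay:
# from 1.0 down to 0.02 over the first 10% of the 1e7-step budget, then flat.
# Using the LinearSchedule sketch from earlier as a stand-in:
exploration = LinearSchedule(schedule_timesteps=int(0.1 * 1e7),
                             initial_p=1.0, final_p=0.02)
print(round(exploration.value(0), 4))          # 1.0
print(round(exploration.value(500_000), 4))    # 0.51 -- halfway through the first 1M steps
print(round(exploration.value(2_000_000), 4))  # 0.02 -- floor for the remaining 9M steps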
class Agent_PER():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        # Create the prioritized replay buffer
        self.memory = PrioritizedReplayBuffer(BUFFER_SIZE, alpha=PRIORITIZED_REPLAY_ALPHA)
        prioritized_replay_beta_iters = PRIORITIZED_REPLAY_BETA_ITERS
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = N_EPISODES
        self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                            initial_p=PRIORITIZED_REPLAY_BETA0,
                                            final_p=1.0)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.t = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, float(done))
        self.t += 1

        # Learn every UPDATE_EVERY time steps; update the target network periodically
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        update_target_network_flag = self.t_step == 0

        if self.t > LEARNING_STARTS and (self.t % TRAIN_FREQ) == 0:
            # Minimize the error in Bellman's equation on a batch sampled from the replay buffer
            experiences = self.memory.sample(BATCH_SIZE, beta=self.beta_schedule.value(self.t))
            td_errors = self.learn(experiences, GAMMA, update_target_network_flag)
            (states, actions, rewards, next_states, dones, weights, batch_idxes) = experiences
            new_priorities = np.abs(td_errors) + PRIORITIZED_REPLAY_EPS
            self.memory.update_priorities(batch_idxes, new_priorities)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, update_target_network_flag):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple): batch of (s, a, r, s', done, weights, indices)
            gamma (float): discount factor
            update_target_network_flag (bool): whether to soft-update the target network
        """
        states, actions, rewards, next_states, dones, weights, batch_idxes = experiences
        states = torch.from_numpy(np.vstack([state for state in states])).float().to(device)
        actions = torch.from_numpy(np.vstack([action for action in actions])).long().to(device)
        rewards = torch.from_numpy(np.vstack([reward for reward in rewards])).float().to(device)
        next_states = torch.from_numpy(np.vstack([next_state for next_state in next_states])).float().to(device)
        dones = torch.from_numpy(np.vstack([done for done in dones])).float().to(device)
        weights = torch.from_numpy(np.vstack([weight for weight in weights])).float().to(device)

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute TD error (returned so the caller can update priorities)
        td_error = Q_expected - Q_targets
        td_error_ = td_error.detach().cpu().numpy()

        # Compute loss: per-sample squared TD error, weighted below by the
        # importance-sampling weights
        loss = td_error ** 2
        # loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        (weights * loss).mean().backward()
        self.optimizer.step()

        if update_target_network_flag:
            # ------------------- update target network ------------------- #
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

        return td_error_

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
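# Agent_PER above anneals beta from PRIORITIZED_REPLAY_BETA0 to 1.0 over
# N_EPISODES and refreshes priorities with |TD error| + PRIORITIZED_REPLAY_EPS.
# For reference, the standard proportional prioritized-replay quantities this
# corresponds to (Schaul et al., 2016) -- a sketch, not the buffer's internals:
import numpy as np

def sampling_probabilities(td_errors, alpha, eps):
    """P(i) proportional to (|delta_i| + eps)^alpha, normalised over the buffer."""
    priorities = (np.abs(td_errors) + eps) ** alpha
    return priorities / priorities.sum()

def importance_weights(probs, buffer_size, beta):
    """w_i = (N * P(i))^(-beta), normalised by max_i w_i for stability."""
    weights = (buffer_size * probs) ** (-beta)
    return weights / weights.max()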