def sample_uniform(self, sequence_size):
    """
    Uniformly samples a sequence of successive frames of size `sequence_size` or less
    (~off-policy rollout).

    Args:
        sequence_size:  maximum sample size.

    Returns:
        instance of Rollout of size <= sequence_size.
    """
    start_pos = np.random.randint(0, self._history_size - sequence_size - 1)

    # Shift by one if we hit a terminal frame:
    if self._frames[start_pos]['terminal']:
        start_pos += 1  # assuming that there are no successive terminal frames

    sampled_rollout = Rollout()

    for i in range(sequence_size):
        frame = self._frames[start_pos + i]
        sampled_rollout.add(frame)
        if frame['terminal']:
            # It's ok to return fewer than `sequence_size` frames if a terminal frame is encountered:
            break

    return sampled_rollout
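# NOTE (illustrative sketch): the uniform sampling above picks a random start index into the frame
# history and truncates at episode boundaries. The toy `_ToyRollout` container and `frames` history
# below are stand-ins for illustration only, not the library's Rollout/Memory API.
import numpy as np

class _ToyRollout(list):
    # Minimal stand-in for the Rollout container:
    def add(self, frame):
        self.append(frame)

def _toy_sample_uniform(frames, sequence_size):
    # Pick a start index that leaves room for a full sequence:
    start = np.random.randint(0, len(frames) - sequence_size - 1)
    if frames[start]['terminal']:
        start += 1  # skip a terminal start, assuming no two terminals in a row
    rollout = _ToyRollout()
    for frame in frames[start:start + sequence_size]:
        rollout.add(frame)
        if frame['terminal']:
            break  # truncate at episode boundary
    return rollout

# Toy history of 10 frames with a single terminal frame in the middle:
_frames = [{'id': i, 'terminal': i == 4} for i in range(10)]
print([f['id'] for f in _toy_sample_uniform(_frames, 3)])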
def env_runner(sess, env, policy, task, rollout_length, summary_writer, episode_summary_freq,
               env_render_freq, atari_test, ep_summary, memory_config):
    """
    The logic of the thread runner. In brief, it constantly runs the policy and, once the rollout
    reaches a certain length, the thread runner appends all the collected data to the queue.

    Args:
        env:                    environment instance
        policy:                 policy instance
        task:                   int
        rollout_length:         int
        episode_summary_freq:   int
        env_render_freq:        int
        atari_test:             bool, Atari or BTgym
        ep_summary:             dict of tf.summary op and placeholders
        memory_config:          replay memory configuration dictionary

    Yields:
        collected data as a dictionary of on_policy, off_policy rollouts and episode statistics.
    """
    if memory_config is not None:
        memory = memory_config['class_ref'](**memory_config['kwargs'])

    else:
        memory = _DummyMemory()

    # Pass sample config to environment:
    last_state = env.reset(**policy.get_sample_config())
    last_context = policy.get_initial_features(state=last_state)
    length = 0
    local_episode = 0
    reward_sum = 0
    last_action = np.zeros(env.action_space.n)
    last_action[0] = 1
    last_reward = 0.0
    last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

    # Summary averages accumulators:
    total_r = []
    cpu_time = []
    final_value = []
    total_steps = []
    total_steps_atari = []

    ep_stat = None
    test_ep_stat = None
    render_stat = None

    while True:
        terminal_end = False
        rollout = Rollout()

        action, value_, context = policy.act(last_state, last_context, last_action_reward)

        # argmax to convert from one-hot:
        state, reward, terminal, info = env.step(action.argmax())

        # Partially collect first experience of rollout:
        last_experience = {
            'position': {'episode': local_episode, 'step': length},
            'state': last_state,
            'action': action,
            'reward': reward,
            'value': value_,
            'terminal': terminal,
            'context': last_context,
            'last_action_reward': last_action_reward,
        }
        # Execute user-defined callbacks to policy, if any:
        for key, callback in policy.callback.items():
            last_experience[key] = callback(**locals())

        length += 1
        reward_sum += reward
        last_state = state
        last_context = context
        last_action = action
        last_reward = reward
        last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

        for roll_step in range(1, rollout_length):
            if not terminal:
                # Continue adding experiences to rollout:
                action, value_, context = policy.act(last_state, last_context, last_action_reward)

                # Argmax to convert from one-hot:
                state, reward, terminal, info = env.step(action.argmax())

                # Partially collect next experience:
                experience = {
                    'position': {'episode': local_episode, 'step': length},
                    'state': last_state,
                    'action': action,
                    'reward': reward,
                    'value': value_,
                    'terminal': terminal,
                    'context': last_context,
                    'last_action_reward': last_action_reward,
                }
                for key, callback in policy.callback.items():
                    experience[key] = callback(**locals())

                # Bootstrap to complete and push previous experience:
                last_experience['r'] = value_
                rollout.add(last_experience)
                memory.add(last_experience)

                # Housekeeping:
                length += 1
                reward_sum += reward
                last_state = state
                last_context = context
                last_action = action
                last_reward = reward
                last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)
                last_experience = experience

            if terminal:
                # Finished episode within last taken step.
                terminal_end = True

                # All environment-specific summaries are collected here, since only
                # the runner is allowed to interact with the environment.

                # Accumulate values for averaging:
                total_r += [reward_sum]
                total_steps_atari += [length]
                if not atari_test:
                    episode_stat = env.get_stat()  # get episode statistic
                    last_i = info[-1]  # pull most recent info
                    cpu_time += [episode_stat['runtime'].total_seconds()]
                    final_value += [last_i['broker_value']]
                    total_steps += [episode_stat['length']]

                # Episode statistics:
                try:
                    # Was it a test episode (`type` in metadata is non-zero)?
                    if not atari_test and state['metadata']['type']:
                        is_test_episode = True

                    else:
                        is_test_episode = False

                except KeyError:
                    is_test_episode = False

                if is_test_episode:
                    test_ep_stat = dict(
                        total_r=total_r[-1],
                        final_value=final_value[-1],
                        steps=total_steps[-1]
                    )
                else:
                    if local_episode % episode_summary_freq == 0:
                        if not atari_test:
                            # BTgym:
                            ep_stat = dict(
                                total_r=np.average(total_r),
                                cpu_time=np.average(cpu_time),
                                final_value=np.average(final_value),
                                steps=np.average(total_steps)
                            )
                        else:
                            # Atari:
                            ep_stat = dict(
                                total_r=np.average(total_r),
                                steps=np.average(total_steps_atari)
                            )
                        total_r = []
                        cpu_time = []
                        final_value = []
                        total_steps = []
                        total_steps_atari = []

                if task == 0 and local_episode % env_render_freq == 0:
                    if not atari_test:
                        # Render environment (chief worker only, and not in Atari test mode):
                        render_stat = {
                            mode: env.render(mode)[None, :] for mode in env.render_modes
                        }
                    else:
                        # Atari:
                        render_stat = dict(render_atari=state['external'][None, :] * 255)

                # New episode:
                last_state = env.reset(**policy.get_sample_config())
                last_context = policy.get_initial_features(state=last_state, context=last_context)
                length = 0
                reward_sum = 0
                last_action = np.zeros(env.action_space.n)
                last_action[0] = 1
                last_reward = 0.0
                last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

                # Increment global and local episode counts:
                sess.run(policy.inc_episode)
                local_episode += 1
                break

        # After rolling `rollout_length` steps or less (if `terminal` was encountered),
        # complete the final experience of the rollout:
        if not terminal_end:
            # Bootstrap:
            last_experience['r'] = np.asarray(
                [policy.get_value(last_state, last_context, last_action_reward)]
            )
        else:
            last_experience['r'] = np.asarray([0.0])

        rollout.add(last_experience)

        # Only training rollouts are added to replay memory:
        try:
            # Was it a test episode (`type` in metadata is non-zero)?
            if not atari_test and last_experience['state']['metadata']['type']:
                is_test = True

            else:
                is_test = False

        except KeyError:
            is_test = False

        if not is_test:
            memory.add(last_experience)

        # Once we have enough experience and the memory can be sampled, yield the collected data
        # and have the ThreadRunner place it on a queue:
        if memory.is_full():
            data = dict(
                on_policy=rollout,
                off_policy=memory.sample_uniform(sequence_size=rollout_length),
                off_policy_rp=memory.sample_priority(exact_size=True),
                ep_summary=ep_stat,
                test_ep_summary=test_ep_stat,
                render_summary=render_stat,
            )
            yield data

            ep_stat = None
            test_ep_stat = None
            render_stat = None
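# NOTE (illustrative sketch): the generator above is meant to be drained by a separate thread that
# puts each yielded data dict on a bounded queue for the trainer. The names `run_thread`,
# `rollout_queue` and `data_gen` below are assumptions for illustration, not the library's actual
# ThreadRunner API.
import queue
import threading

def run_thread(data_generator, rollout_queue):
    # Drain the generator onto a bounded queue; put() blocks when the trainer
    # falls behind, which throttles the runner thread.
    for data in data_generator:
        rollout_queue.put(data, block=True)

rollout_queue = queue.Queue(maxsize=5)
# data_gen = env_runner(sess, env, policy, task=0, ...)  # constructed elsewhere
# threading.Thread(target=run_thread, args=(data_gen, rollout_queue), daemon=True).start()
# data = rollout_queue.get()  # trainer side: dict with 'on_policy', 'off_policy', 'off_policy_rp', ...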
def env_runner(sess, env, policy, task, rollout_length, summary_writer, episode_summary_freq,
               env_render_freq, test, ep_summary):
    """
    The logic of the thread runner. In brief, it constantly runs the policy and, once the rollout
    reaches a certain length, the thread runner appends the rollout to the queue.

    Args:
        env:                    environment instance
        policy:                 policy instance
        task:                   int
        rollout_length:         int
        episode_summary_freq:   int
        env_render_freq:        int
        test:                   bool, Atari or BTgym
        ep_summary:             tf.summary

    Yields:
        rollout instance
    """
    last_state = env.reset()
    if not test:
        last_state = last_state['model_input']

    last_context = policy.get_initial_features()
    length = 0
    local_episode = 0
    rewards = 0
    last_action = np.zeros(env.action_space.n)
    last_action[0] = 1
    last_reward = 0.0
    last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

    # Summary averages accumulators:
    total_r = 0
    cpu_time = 0
    final_value = 0
    total_steps = 0
    total_steps_atari = 0

    while True:
        terminal_end = False
        rollout = Rollout()

        action, value_, context = policy.act(last_state, last_context, last_action_reward)

        # argmax to convert from one-hot:
        state, reward, terminal, info = env.step(action.argmax())

        # Partially collect first experience of rollout:
        last_experience = {
            'position': {'episode': local_episode, 'step': length},
            'state': last_state,
            'action': action,
            'reward': reward,
            'value': value_,
            'terminal': terminal,
            'context': last_context,
            'last_action_reward': last_action_reward,
        }
        # Execute user-defined callbacks to policy, if any:
        for key, callback in policy.callback.items():
            last_experience[key] = callback(**locals())

        length += 1
        rewards += reward
        last_state = state
        last_context = context
        last_action = action
        last_reward = reward
        last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

        for roll_step in range(1, rollout_length):
            if not terminal:
                # Continue adding experiences to rollout:
                action, value_, context = policy.act(last_state, last_context, last_action_reward)

                # Argmax to convert from one-hot:
                state, reward, terminal, info = env.step(action.argmax())

                # Partially collect next experience:
                experience = {
                    'position': {'episode': local_episode, 'step': length},
                    'state': last_state,
                    'action': action,
                    'reward': reward,
                    'value': value_,
                    'terminal': terminal,
                    'context': last_context,
                    'last_action_reward': last_action_reward,
                }
                for key, callback in policy.callback.items():
                    experience[key] = callback(**locals())

                # Bootstrap to complete and push previous experience:
                last_experience['r'] = value_
                rollout.add(last_experience)

                # Housekeeping:
                length += 1
                rewards += reward
                last_state = state
                last_context = context
                last_action = action
                last_reward = reward
                last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)
                last_experience = experience

            if terminal:
                # Finished episode within last taken step.
                terminal_end = True

                # All environment-specific summaries are collected here, since only
                # the runner is allowed to interact with the environment.

                # Accumulate values for averaging:
                total_r += rewards
                total_steps_atari += length
                if not test:
                    episode_stat = env.get_stat()  # get episode statistic
                    last_i = info[0]  # pull most recent info
                    cpu_time += episode_stat['runtime'].total_seconds()
                    final_value += last_i['broker_value']
                    total_steps += episode_stat['length']

                # Episode statistic:
                if local_episode % episode_summary_freq == 0:
                    if not test:
                        # BTgym:
                        fetched_episode_stat = sess.run(
                            ep_summary['stat_op'],
                            feed_dict={
                                ep_summary['total_r_pl']: total_r / episode_summary_freq,
                                ep_summary['cpu_time_pl']: cpu_time / episode_summary_freq,
                                ep_summary['final_value_pl']: final_value / episode_summary_freq,
                                ep_summary['steps_pl']: total_steps / episode_summary_freq
                            }
                        )
                    else:
                        # Atari:
                        fetched_episode_stat = sess.run(
                            ep_summary['test_stat_op'],
                            feed_dict={
                                ep_summary['total_r_pl']: total_r / episode_summary_freq,
                                ep_summary['steps_pl']: total_steps_atari / episode_summary_freq
                            }
                        )
                    summary_writer.add_summary(fetched_episode_stat, sess.run(policy.global_episode))
                    summary_writer.flush()
                    total_r = 0
                    cpu_time = 0
                    final_value = 0
                    total_steps = 0
                    total_steps_atari = 0

                if task == 0 and local_episode % env_render_freq == 0:
                    if not test:
                        # Render environment (chief worker only, and not in Atari test mode):
                        renderings = sess.run(
                            ep_summary['render_op'],
                            feed_dict={
                                ep_summary['render_human_pl']: env.render('human')[None, :],
                                ep_summary['render_model_input_pl']: env.render('model_input')[None, :],
                                ep_summary['render_episode_pl']: env.render('episode')[None, :],
                            }
                        )
                    else:
                        # Atari:
                        renderings = sess.run(
                            ep_summary['test_render_op'],
                            feed_dict={
                                ep_summary['render_atari_pl']: state[None, :] * 255
                            }
                        )
                    summary_writer.add_summary(renderings, sess.run(policy.global_episode))
                    summary_writer.flush()

                # New episode:
                last_state = env.reset()
                last_context = policy.get_initial_features()
                length = 0
                rewards = 0
                last_action = np.zeros(env.action_space.n)
                last_action[0] = 1
                last_reward = 0.0
                last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

                # Increment global and local episode counts:
                sess.run(policy.inc_episode)
                local_episode += 1
                break

        # After rolling `rollout_length` steps or less (if `terminal` was encountered),
        # complete the final experience of the rollout:
        if not terminal_end:
            # Bootstrap:
            last_experience['r'] = np.asarray(
                [policy.get_value(last_state, last_context, last_action_reward)]
            )
        else:
            last_experience['r'] = np.asarray([0.0])

        rollout.add(last_experience)

        # Once we have enough experience, yield it, and have the ThreadRunner place it on a queue:
        yield rollout
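# NOTE (illustrative): both runners repeatedly rebuild `last_action_reward` by concatenating a
# one-hot action with the scalar last reward; this is the extra policy input fed as a[-1], r[-1].
# A minimal shape check, assuming a discrete action space of size n_actions:
import numpy as np

n_actions = 4  # stand-in for env.action_space.n
_last_action = np.zeros(n_actions)
_last_action[0] = 1  # one-hot of the last taken (or initial) action
_last_reward = 0.0
_last_action_reward = np.concatenate([_last_action, np.asarray([_last_reward])], axis=-1)
assert _last_action_reward.shape == (n_actions + 1,)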
def process(self, sess):
    """
    Grabs an on_policy_rollout that's been produced by the thread runner,
    samples off_policy rollout[s] from replay memory and updates the parameters.
    The update is then sent to the parameter server.
    """
    sess.run(self.sync)  # copy weights from shared to local

    # Get and process on_policy_rollout for A3C train step:
    on_policy_rollout = self.pull_batch_from_queue()
    on_policy_batch = on_policy_rollout.process(gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

    # Feeder for on-policy A3C loss estimation graph; passes LSTM context:
    feed_dict = {
        pl: value for pl, value in zip(
            self.local_network.a3c_lstm_state_pl_flatten,
            flatten_nested(on_policy_batch.features)
        )
    }
    feed_dict.update({
        self.local_network.a3c_state_in: on_policy_batch.si,
        self.local_network.a3c_a_r_in: on_policy_batch.last_ar,
        self.a3c_act_target: on_policy_batch.a,
        self.a3c_adv_target: on_policy_batch.adv,
        self.a3c_r_target: on_policy_batch.r,
        self.local_network.train_phase: True,
    })

    if self.use_off_policy_a3c or self.use_pixel_control or self.use_value_replay:
        # Get sample from replay memory:
        if self.use_rebalanced_replay:
            off_policy_sample = self.memory.sample_priority(
                self.replay_rollout_length,
                skewness=self.rebalance_skewness,
                exact_size=False
            )
        else:
            off_policy_sample = self.memory.sample_uniform(self.replay_rollout_length)

        off_policy_rollout = Rollout()
        off_policy_rollout.add_memory_sample(off_policy_sample)
        off_policy_batch = off_policy_rollout.process(gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

        # Feeder for off-policy A3C loss estimation graph:
        off_policy_feeder = {
            pl: value for pl, value in zip(
                self.local_network.off_a3c_lstm_state_pl_flatten,
                flatten_nested(off_policy_batch.features)
            )
        }
        off_policy_feeder.update({
            self.local_network.off_a3c_state_in: off_policy_batch.si,
            self.local_network.off_a3c_a_r_in: off_policy_batch.last_ar,
            self.off_policy_act_target: off_policy_batch.a,
            self.off_policy_adv_target: off_policy_batch.adv,
            self.off_policy_r_target: off_policy_batch.r,
        })
        feed_dict.update(off_policy_feeder)

    # Update with reward prediction subgraph:
    if self.use_reward_prediction:
        # Rebalanced 50/50 sample for RP:
        rp_sample = self.memory.sample_priority(self.rp_sequence_size, skewness=2, exact_size=True)
        feed_dict.update(self.process_rp(rp_sample))

    # Pixel control:
    if self.use_pixel_control:
        feed_dict.update(self.process_pc(off_policy_batch))

    # Value replay:
    if self.use_value_replay:
        feed_dict.update(self.process_vr(off_policy_batch))

    if self.use_memory:
        # Save on_policy_rollout to replay memory:
        self.memory.add_rollout(on_policy_rollout)

    # Every worker writes model summaries:
    should_compute_summary = self.local_steps % self.model_summary_freq == 0

    if should_compute_summary:
        fetches = [self.model_summary_op, self.train_op, self.global_step]
    else:
        fetches = [self.train_op, self.global_step]

    # And finally...
    fetched = sess.run(fetches, feed_dict=feed_dict)

    if should_compute_summary:
        self.summary_writer.add_summary(tf.Summary.FromString(fetched[0]), fetched[-1])
        self.summary_writer.flush()

    self.local_steps += 1
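# NOTE (illustrative sketch): `Rollout.process(gamma, gae_lambda)` above is expected to turn a rollout
# into a batch with advantage and return targets. The helper below sketches standard Generalized
# Advantage Estimation under that assumption; the library's actual implementation may differ in detail.
import numpy as np

def gae_advantages(rewards, values, value_next, gamma=0.99, gae_lambda=1.0):
    # Append bootstrap value V(s_T) and compute TD residuals:
    values_ext = np.append(values, value_next)
    deltas = rewards + gamma * values_ext[1:] - values_ext[:-1]
    # Discounted, lambda-weighted sum of residuals, computed backwards in time:
    advantages = np.zeros_like(deltas)
    gae = 0.0
    for t in reversed(range(len(deltas))):
        gae = deltas[t] + gamma * gae_lambda * gae
        advantages[t] = gae
    returns = advantages + values  # value targets ('r' in the batch)
    return advantages, returns

# Example: 3-step rollout with a single terminal reward and zero bootstrap value:
adv, ret = gae_advantages(np.array([0.0, 0.0, 1.0]), np.array([0.5, 0.6, 0.7]), value_next=0.0)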
def env_runner(sess, env, policy, task, num_local_steps, summary_writer, episode_summary_freq,
               env_render_freq, test, ep_summary):
    """
    The logic of the thread runner. In brief, it constantly runs the policy and, once the rollout
    reaches a certain length, the thread runner appends the rollout to the queue.
    """
    last_state = env.reset()
    if not test:
        last_state = last_state['model_input']

    last_features = policy.get_a3c_initial_features()
    length = 0
    local_episode = 0
    rewards = 0
    last_action = np.zeros(env.action_space.n)
    last_action[0] = 1
    last_reward = 0.0
    last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

    # Summary averages accumulators:
    total_r = 0
    cpu_time = 0
    final_value = 0
    total_steps = 0
    total_steps_atari = 0

    while True:
        terminal_end = False
        rollout = Rollout()

        # Partially collect first experience of rollout:
        action, value_, features = policy.a3c_act(last_state, last_features, last_action_reward)

        # argmax to convert from one-hot:
        state, reward, terminal, info = env.step(action.argmax())
        if not test:
            state = state['model_input']

        # Estimate `pixel_change`:
        pixel_change = policy.get_pc_target(state, last_state)

        # Collect the experience:
        frame_position = {'episode': local_episode, 'step': length}
        last_experience = dict(
            position=frame_position,
            state=last_state,
            action=action,
            reward=reward,
            value=value_,
            terminal=terminal,
            features=last_features,
            pixel_change=pixel_change,
            last_action_reward=last_action_reward,  # as a[-1]
        )
        length += 1
        rewards += reward
        last_state = state
        last_features = features
        last_action = action
        last_reward = reward
        last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

        for roll_step in range(1, num_local_steps):
            if not terminal:
                # Continue adding experiences to rollout:
                action, value_, features = policy.a3c_act(last_state, last_features, last_action_reward)

                # argmax to convert from one-hot:
                state, reward, terminal, info = env.step(action.argmax())
                if not test:
                    state = state['model_input']

                pixel_change = policy.get_pc_target(state, last_state)

                # Partially collect next experience:
                frame_position = {'episode': local_episode, 'step': length}
                experience = dict(
                    position=frame_position,
                    state=last_state,
                    action=action,
                    reward=reward,
                    value=value_,
                    terminal=terminal,
                    features=last_features,
                    pixel_change=pixel_change,
                    last_action_reward=last_action_reward,
                )
                # Complete and push previous experience:
                last_experience['value_next'] = value_
                rollout.add(**last_experience)

                # Housekeeping; note that last_action_reward is not refreshed here,
                # unlike in the newer runner above:
                length += 1
                rewards += reward
                last_state = state
                last_features = features
                last_action = action
                last_reward = reward
                last_experience = experience

            if terminal:
                # Finished episode within last taken step.
                terminal_end = True

                # All environment-related summaries are collected here, since only
                # the runner is allowed to interact with the environment.

                # Accumulate values for averaging:
                total_r += rewards
                total_steps_atari += length
                if not test:
                    episode_stat = env.get_stat()  # get episode statistic
                    last_i = info[0]  # pull most recent info
                    cpu_time += episode_stat['runtime'].total_seconds()
                    final_value += last_i['broker_value']
                    total_steps += episode_stat['length']

                # Episode statistic:
                if local_episode % episode_summary_freq == 0:
                    if not test:
                        # BTgym:
                        fetched_episode_stat = sess.run(
                            ep_summary['stat_op'],
                            feed_dict={
                                ep_summary['total_r_pl']: total_r / episode_summary_freq,
                                ep_summary['cpu_time_pl']: cpu_time / episode_summary_freq,
                                ep_summary['final_value_pl']: final_value / episode_summary_freq,
                                ep_summary['steps_pl']: total_steps / episode_summary_freq
                            }
                        )
                    else:
                        # Atari:
                        fetched_episode_stat = sess.run(
                            ep_summary['test_stat_op'],
                            feed_dict={
                                ep_summary['total_r_pl']: total_r / episode_summary_freq,
                                ep_summary['steps_pl']: total_steps_atari / episode_summary_freq
                            }
                        )
                    summary_writer.add_summary(fetched_episode_stat, sess.run(policy.global_episode))
                    summary_writer.flush()
                    total_r = 0
                    cpu_time = 0
                    final_value = 0
                    total_steps = 0
                    total_steps_atari = 0

                if task == 0 and local_episode % env_render_freq == 0:
                    if not test:
                        # Render environment (chief worker only, and not in Atari test mode):
                        renderings = sess.run(
                            ep_summary['render_op'],
                            feed_dict={
                                ep_summary['render_human_pl']: env.render('human')[None, :],
                                ep_summary['render_model_input_pl']: env.render('model_input')[None, :],
                                ep_summary['render_episode_pl']: env.render('episode')[None, :],
                            }
                        )
                    else:
                        # Atari:
                        renderings = sess.run(
                            ep_summary['test_render_op'],
                            feed_dict={
                                ep_summary['render_atari_pl']: state[None, :] * 255
                            }
                        )
                    summary_writer.add_summary(renderings, sess.run(policy.global_episode))
                    summary_writer.flush()

                # New episode:
                last_state = env.reset()
                if not test:
                    last_state = last_state['model_input']

                last_features = policy.get_a3c_initial_features()
                length = 0
                rewards = 0
                last_action = np.zeros(env.action_space.n)
                last_action[0] = 1
                last_reward = 0.0

                # Increment global and local episode counts:
                sess.run(policy.inc_episode)
                local_episode += 1
                break

        # After rolling `num_local_steps` or less (if `terminal` was encountered),
        # complete the final experience of the rollout:
        if not terminal_end:
            last_experience['value_next'] = np.asarray(
                [policy.get_a3c_value(last_state, last_features, last_action_reward)]
            )
        else:
            last_experience['value_next'] = np.asarray([0.0])

        rollout.add(**last_experience)

        # Once we have enough experience, yield it, and have the ThreadRunner place it on a queue:
        yield rollout
def process(self, sess):
    """
    Grabs an on_policy_rollout that's been produced by the thread runner,
    samples off_policy rollout[s] from replay memory and updates the parameters.
    The update is then sent to the parameter server.
    """
    # Copy weights from shared to local new_policy:
    sess.run(self.sync)

    # Get and process rollout for on-policy train step:
    on_policy_rollout = self.pull_batch_from_queue()
    on_policy_batch = on_policy_rollout.process(gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

    # Feeder for on-policy AAC loss estimation graph:
    feed_dict = {
        pl: value for pl, value in zip(
            self.local_network.on_lstm_state_pl_flatten,
            flatten_nested(on_policy_batch['context'])
        )
    }
    feed_dict.update({
        self.local_network.on_state_in: on_policy_batch['state'],
        self.local_network.on_a_r_in: on_policy_batch['last_action_reward'],
        self.on_pi_act_target: on_policy_batch['action'],
        self.on_pi_adv_target: on_policy_batch['advantage'],
        self.on_pi_r_target: on_policy_batch['r'],
        self.local_network.train_phase: True,
    })

    if self.use_off_policy_aac or self.use_pixel_control or self.use_value_replay:
        # Get sample from replay memory:
        if self.use_rebalanced_replay:
            off_policy_sample = self.memory.sample_priority(
                self.replay_rollout_length,
                skewness=self.rebalance_skewness,
                exact_size=False
            )
        else:
            off_policy_sample = self.memory.sample_uniform(self.replay_rollout_length)

        off_policy_rollout = Rollout()
        off_policy_rollout.add_memory_sample(off_policy_sample)
        off_policy_batch = off_policy_rollout.process(gamma=self.model_gamma, gae_lambda=self.model_gae_lambda)

        # Feeder for off-policy AAC loss estimation graph:
        off_policy_feeder = {
            pl: value for pl, value in zip(
                self.local_network.off_lstm_state_pl_flatten,
                flatten_nested(off_policy_batch['context'])
            )
        }
        off_policy_feeder.update({
            self.local_network.off_state_in: off_policy_batch['state'],
            self.local_network.off_a_r_in: off_policy_batch['last_action_reward'],
            self.off_pi_act_target: off_policy_batch['action'],
            self.off_pi_adv_target: off_policy_batch['advantage'],
            self.off_pi_r_target: off_policy_batch['r'],
        })
        feed_dict.update(off_policy_feeder)

    # Update with reward prediction subgraph:
    if self.use_reward_prediction:
        # Rebalanced 50/50 sample for RP:
        rp_sample = self.memory.sample_priority(self.rp_sequence_size, skewness=2, exact_size=True)
        feed_dict.update(self.process_rp(rp_sample))

    # Pixel control:
    if self.use_pixel_control:
        feed_dict.update(self.process_pc(off_policy_batch))

    # Value replay:
    if self.use_value_replay:
        feed_dict.update(self.process_vr(off_policy_batch))

    if self.use_memory:
        # Save on_policy_rollout to replay memory:
        self.memory.add_rollout(on_policy_rollout)

    # Every worker writes model summaries:
    should_compute_summary = self.local_steps % self.model_summary_freq == 0

    if should_compute_summary:
        fetches = [self.train_op, self.model_summary_op, self.inc_step]
    else:
        fetches = [self.train_op, self.inc_step]

    fetched = sess.run(fetches, feed_dict=feed_dict)

    if should_compute_summary:
        self.summary_writer.add_summary(tf.Summary.FromString(fetched[-2]), fetched[-1])
        self.summary_writer.flush()

    self.local_steps += 1
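# NOTE (illustrative sketch): the feeders above zip a flat list of LSTM state placeholders with
# `flatten_nested(batch['context'])`. A depth-first flattening helper along the lines sketched
# below would produce a matching flat list of leaves; this is an assumption about its behaviour,
# not the library's actual implementation.
def _flatten_nested_sketch(struct):
    # Recursively flatten nested dicts/lists/tuples into a flat list of leaf values:
    if isinstance(struct, dict):
        items = struct.values()
    elif isinstance(struct, (list, tuple)):
        items = struct
    else:
        return [struct]
    flat = []
    for item in items:
        flat.extend(_flatten_nested_sketch(item))
    return flat

# E.g. an LSTM context of (c, h) pairs per layer flattens to [c0, h0, c1, h1]:
assert _flatten_nested_sketch(((1, 2), (3, 4))) == [1, 2, 3, 4]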
def _sample_priority(self, size=None, exact_size=False, skewness=2, sample_attempts=100):
    """
    Implements rebalanced replay. Samples a sequence of successive frames from a distribution
    skewed by the reward of the last sampled frame.

    Args:
        size:               sample size, must be <= self.max_sample_size;
        exact_size:         whether to accept a sample of size less than `size`, or re-sample to get
                            a sample of exact size (used for the reward prediction task);
        skewness:           int >= 1, sampling probability denominator, such that the probability of
                            sampling a sequence whose last frame has non-zero reward is
                            p[non_zero] = 1 / skewness;
        sample_attempts:    if exact_size=True, sets the number of re-sampling attempts to get a sample
                            of continuous experiences (no `terminal` frames inside except the last one);
                            if this number is reached, the sample is returned 'as is'.

    Returns:
        instance of Rollout().
    """
    if size is None:
        size = self.priority_sample_size

    if size > self.max_sample_size:
        size = self.max_sample_size

    # Toss skewed coin:
    if np.random.randint(int(skewness)) == 0:
        from_zero = False

    else:
        from_zero = True

    if len(self._zero_reward_indices) == 0:
        # Zero-reward container was empty:
        from_zero = False

    elif len(self._non_zero_reward_indices) == 0:
        # Non-zero-reward container was empty:
        from_zero = True

    # Try to sample a sequence of the given length from one episode.
    # Take a maximum of `sample_attempts`; if no luck (e.g. too short episodes and/or
    # too big a sampling size) -> return an inconsistent sample and issue a warning.
    check_sequence = True
    for attempt in range(sample_attempts):
        if from_zero:
            index = np.random.randint(len(self._zero_reward_indices))
            end_frame_index = self._zero_reward_indices[index]

        else:
            index = np.random.randint(len(self._non_zero_reward_indices))
            end_frame_index = self._non_zero_reward_indices[index]

        start_frame_index = end_frame_index - size + 1
        raw_start_frame_index = start_frame_index - self._top_frame_index

        sampled_rollout = Rollout()
        is_full = True

        if attempt == sample_attempts - 1:
            check_sequence = False
            self.log.warning(
                'Memory_{}: failed to sample {} successive frames, sampled as is.'.format(self.task, size)
            )

        for i in range(size - 1):
            frame = self._frames[raw_start_frame_index + i]
            sampled_rollout.add(frame)
            if check_sequence:
                if frame['terminal']:
                    if exact_size:
                        is_full = False
                    break

        # Last frame can be terminal anyway:
        frame = self._frames[raw_start_frame_index + size - 1]
        sampled_rollout.add(frame)

        if is_full:
            break

    return sampled_rollout
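# NOTE (illustrative): the 'skewed coin' above selects a sequence ending in a non-zero-reward frame
# with probability 1/skewness and a zero-reward ending otherwise. A quick empirical check of that
# skew; names below are for illustration only:
import numpy as np

def _toss_skewed_coin(skewness=2):
    # True -> sample a sequence ending with zero reward (the `from_zero` branch):
    return np.random.randint(int(skewness)) != 0

_skewness = 4
_draws = [_toss_skewed_coin(_skewness) for _ in range(100000)]
# Fraction of non-zero-reward endings should be close to 1/skewness:
print('p(non_zero) ~ {:.3f}, expected {:.3f}'.format(1.0 - np.mean(_draws), 1.0 / _skewness))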