def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
    """
    Responsible for collecting experiences and training a behavioral cloning model.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded.
    :param seed: The seed the model will be initialized with
    :param run_id: The identifier of the current run
    """
    super(BCTrainer, self).__init__(brain, trainer_parameters, training, run_id)
    self.policy = BCPolicy(seed, brain, trainer_parameters, load)
    self.n_sequences = 1
    self.cumulative_rewards = {}
    self.episode_steps = {}
    self.stats = {'Losses/Cloning Loss': [], 'Environment/Episode Length': [],
                  'Environment/Cumulative Reward': []}
    self.summary_path = trainer_parameters['summary_path']
    self.batches_per_epoch = trainer_parameters['batches_per_epoch']
    if not os.path.exists(self.summary_path):
        os.makedirs(self.summary_path)
    self.demonstration_buffer = Buffer()
    self.evaluation_buffer = Buffer()
    self.summary_writer = tf.summary.FileWriter(self.summary_path)
def __init__(self, sess, brain, reward_buff_cap, trainer_parameters, training, seed, run_id):
    """
    Responsible for collecting experiences and training a PPO model.
    :param sess: Tensorflow session.
    :param reward_buff_cap: Max reward history to track in the reward buffer
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param training: Whether the trainer is set for training.
    :param seed: The seed the model will be initialized with
    :param run_id: The identifier of the current run
    """
    super(PPOTrainer, self).__init__(sess, brain.brain_name, trainer_parameters, training, run_id)
    self.param_keys = ['batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units',
                       'lambd', 'learning_rate', 'max_steps', 'normalize', 'num_epoch', 'num_layers',
                       'time_horizon', 'sequence_length', 'summary_freq', 'use_recurrent',
                       'graph_scope', 'summary_path', 'memory_size', 'use_curiosity',
                       'curiosity_strength', 'curiosity_enc_size']
    for k in self.param_keys:
        if k not in trainer_parameters:
            raise UnityTrainerException(
                "The hyperparameter {0} could not be found for the PPO trainer of "
                "brain {1}.".format(k, brain.brain_name))
    self.use_curiosity = bool(trainer_parameters['use_curiosity'])
    self.step = 0
    self.policy = PPOPolicy(seed, brain, trainer_parameters, sess, self.is_training)
    stats = {'cumulative_reward': [], 'episode_length': [], 'value_estimate': [],
             'entropy': [], 'value_loss': [], 'policy_loss': [], 'learning_rate': []}
    if self.use_curiosity:
        stats['forward_loss'] = []
        stats['inverse_loss'] = []
        stats['intrinsic_reward'] = []
        self.intrinsic_rewards = {}
    self.stats = stats
    self.training_buffer = Buffer()
    self.cumulative_rewards = {}
    self._reward_buffer = deque(maxlen=reward_buff_cap)
    self.episode_steps = {}
    self.summary_path = trainer_parameters['summary_path']
    if not os.path.exists(self.summary_path):
        os.makedirs(self.summary_path)
    self.summary_writer = tf.summary.FileWriter(self.summary_path)
def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
    """
    Responsible for collecting experiences and training a behavioral cloning model.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded.
    :param seed: The seed the model will be initialized with
    :param run_id: The identifier of the current run
    """
    super(BCTrainer, self).__init__(brain, trainer_parameters, training, run_id)
    self.policy = BCPolicy(seed, brain, trainer_parameters, load)
    self.n_sequences = 1
    self.cumulative_rewards = {}
    self.episode_steps = {}
    self.stats = {
        "Losses/Cloning Loss": [],
        "Environment/Episode Length": [],
        "Environment/Cumulative Reward": [],
    }
    self.batches_per_epoch = trainer_parameters["batches_per_epoch"]
    self.demonstration_buffer = Buffer()
    self.evaluation_buffer = Buffer()
def make_demo_buffer(
    brain_infos: List[BrainInfo], brain_params: BrainParameters, sequence_length: int
) -> Buffer:
    # Create and populate buffer using experiences
    demo_buffer = Buffer()
    for idx, experience in enumerate(brain_infos):
        if idx > len(brain_infos) - 2:
            break
        current_brain_info = brain_infos[idx]
        next_brain_info = brain_infos[idx + 1]
        demo_buffer[0].last_brain_info = current_brain_info
        demo_buffer[0]["done"].append(next_brain_info.local_done[0])
        demo_buffer[0]["rewards"].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            demo_buffer[0]["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0]
            )
        if brain_params.vector_observation_space_size > 0:
            demo_buffer[0]["vector_obs"].append(
                current_brain_info.vector_observations[0]
            )
        demo_buffer[0]["actions"].append(next_brain_info.previous_vector_actions[0])
        demo_buffer[0]["prev_action"].append(
            current_brain_info.previous_vector_actions[0]
        )
        if next_brain_info.local_done[0]:
            demo_buffer.append_update_buffer(
                0, batch_size=None, training_length=sequence_length
            )
            demo_buffer.reset_local_buffers()
    demo_buffer.append_update_buffer(
        0, batch_size=None, training_length=sequence_length
    )
    return demo_buffer
def __init__(self, sess, brain, trainer_parameters, training, seed, run_id):
    """
    Responsible for collecting experiences and training an imitation (behavioral cloning) model.
    :param sess: Tensorflow session.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param training: Whether the trainer is set for training.
    :param seed: The seed the model will be initialized with
    :param run_id: The identifier of the current run
    """
    super(BehavioralCloningTrainer, self).__init__(sess, brain, trainer_parameters, training, run_id)
    self.param_keys = ['brain_to_imitate', 'batch_size', 'time_horizon', 'graph_scope',
                       'summary_freq', 'max_steps', 'batches_per_epoch', 'use_recurrent',
                       'hidden_units', 'learning_rate', 'num_layers', 'sequence_length',
                       'memory_size']
    for k in self.param_keys:
        if k not in trainer_parameters:
            raise UnityTrainerException(
                "The hyperparameter {0} could not be found for the Imitation trainer of "
                "brain {1}.".format(k, brain.brain_name))
    self.policy = BCPolicy(seed, brain, trainer_parameters, sess)
    self.brain_name = brain.brain_name
    self.brain_to_imitate = trainer_parameters['brain_to_imitate']
    self.batches_per_epoch = trainer_parameters['batches_per_epoch']
    self.n_sequences = max(int(trainer_parameters['batch_size'] / self.policy.sequence_length), 1)
    self.cumulative_rewards = {}
    self.episode_steps = {}
    self.stats = {'losses': [], 'episode_length': [], 'cumulative_reward': []}
    self.training_buffer = Buffer()
    self.summary_path = trainer_parameters['summary_path']
    if not os.path.exists(self.summary_path):
        os.makedirs(self.summary_path)
    self.summary_writer = tf.summary.FileWriter(self.summary_path)
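# Hypothetical usage sketch (not part of the original source): a minimal
# trainer_parameters dictionary covering the keys that the imitation trainer
# above validates, plus 'summary_path', which the constructor also reads.
# The values are illustrative placeholders, not recommended hyperparameters.
bc_trainer_parameters = {
    'brain_to_imitate': 'TeacherBrain',
    'batch_size': 64,
    'time_horizon': 64,
    'graph_scope': '',
    'summary_freq': 1000,
    'max_steps': 5.0e4,
    'batches_per_epoch': 10,
    'use_recurrent': False,
    'hidden_units': 128,
    'learning_rate': 3.0e-4,
    'num_layers': 2,
    'sequence_length': 32,
    'memory_size': 256,
    'summary_path': './summaries/run_id',
}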
def test_buffer():
    b = Buffer()
    for fake_agent_id in range(4):
        for step in range(9):
            b[fake_agent_id]['vector_observation'].append(
                [100 * fake_agent_id + 10 * step + 1,
                 100 * fake_agent_id + 10 * step + 2,
                 100 * fake_agent_id + 10 * step + 3]
            )
            b[fake_agent_id]['action'].append(
                [100 * fake_agent_id + 10 * step + 4,
                 100 * fake_agent_id + 10 * step + 5]
            )
    a = b[1]['vector_observation'].get_batch(batch_size=2, training_length=1, sequential=True)
    assert_array(a, np.array([[171, 172, 173], [181, 182, 183]]))
    a = b[2]['vector_observation'].get_batch(batch_size=2, training_length=3, sequential=True)
    assert_array(a, np.array([
        [[231, 232, 233], [241, 242, 243], [251, 252, 253]],
        [[261, 262, 263], [271, 272, 273], [281, 282, 283]]
    ]))
    a = b[2]['vector_observation'].get_batch(batch_size=2, training_length=3, sequential=False)
    assert_array(a, np.array([
        [[251, 252, 253], [261, 262, 263], [271, 272, 273]],
        [[261, 262, 263], [271, 272, 273], [281, 282, 283]]
    ]))
    b[4].reset_agent()
    assert len(b[4]) == 0
    b.append_update_buffer(3, batch_size=None, training_length=2)
    b.append_update_buffer(2, batch_size=None, training_length=2)
    assert len(b.update_buffer['action']) == 10
    assert np.array(b.update_buffer['action']).shape == (10, 2, 2)
    c = b.update_buffer.make_mini_batch(start=0, end=1)
    assert c.keys() == b.update_buffer.keys()
    assert c['action'].shape == (1, 2, 2)
def make_demo_buffer(brain_infos, brain_params, sequence_length):
    # Create and populate buffer using experiences
    demo_buffer = Buffer()
    for idx, experience in enumerate(brain_infos):
        if idx > len(brain_infos) - 2:
            break
        current_brain_info = brain_infos[idx]
        next_brain_info = brain_infos[idx + 1]
        demo_buffer[0].last_brain_info = current_brain_info
        demo_buffer[0]['done'].append(next_brain_info.local_done[0])
        demo_buffer[0]['rewards'].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            demo_buffer[0]['visual_obs%d' % i] \
                .append(current_brain_info.visual_observations[i][0])
        if brain_params.vector_observation_space_size > 0:
            demo_buffer[0]['vector_obs'] \
                .append(current_brain_info.vector_observations[0])
        demo_buffer[0]['actions'].append(next_brain_info.previous_vector_actions[0])
        if next_brain_info.local_done[0]:
            demo_buffer.append_update_buffer(0, batch_size=None,
                                             training_length=sequence_length)
            demo_buffer.reset_local_buffers()
    demo_buffer.append_update_buffer(0, batch_size=None,
                                     training_length=sequence_length)
    return demo_buffer
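# Hypothetical usage sketch (not part of the original source): once
# make_demo_buffer has flattened the recorded episodes into the update
# buffer, mini-batches can be drawn with the same Buffer API exercised in
# test_buffer above. The batch size of 32 is an illustrative placeholder.
demo_buffer = make_demo_buffer(brain_infos, brain_params, sequence_length=1)
n_sequences = 32
num_batches = len(demo_buffer.update_buffer['actions']) // n_sequences
for i in range(num_batches):
    mini_batch = demo_buffer.update_buffer.make_mini_batch(
        start=i * n_sequences, end=(i + 1) * n_sequences)
    # mini_batch['vector_obs'] / mini_batch['actions'] feed the cloning update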
def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id):
    """
    Responsible for collecting experiences and training a PPO model.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param reward_buff_cap: Max reward history to track in the reward buffer
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded.
    :param seed: The seed the model will be initialized with
    :param run_id: The identifier of the current run
    """
    super().__init__(brain, trainer_parameters, training, run_id, reward_buff_cap)
    self.param_keys = [
        "batch_size", "beta", "buffer_size", "epsilon", "hidden_units", "lambd",
        "learning_rate", "max_steps", "normalize", "num_epoch", "num_layers",
        "time_horizon", "sequence_length", "summary_freq", "use_recurrent",
        "summary_path", "memory_size", "model_path", "reward_signals",
    ]
    self.check_param_keys()

    # Make sure we have at least one reward_signal
    if not self.trainer_parameters["reward_signals"]:
        raise UnityTrainerException(
            "No reward signals were defined. At least one must be used with {}."
            .format(self.__class__.__name__))

    self.step = 0
    self.policy = PPOPolicy(seed, brain, trainer_parameters, self.is_training, load)
    stats = defaultdict(list)
    # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
    # used for reporting only. We always want to report the environment reward to Tensorboard, regardless
    # of what reward signals are actually present.
    self.collected_rewards = {"environment": {}}
    for _reward_signal in self.policy.reward_signals.keys():
        self.collected_rewards[_reward_signal] = {}
    self.stats = stats
    self.training_buffer = Buffer()
    self.episode_steps = {}
def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id):
    """
    Responsible for collecting experiences and training a PPO model.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param reward_buff_cap: Max reward history to track in the reward buffer
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded.
    :param seed: The seed the model will be initialized with
    :param run_id: The identifier of the current run
    """
    super(PPOTrainer, self).__init__(brain, trainer_parameters, training, run_id)
    self.param_keys = [
        'batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units',
        'lambd', 'learning_rate', 'max_steps', 'normalize', 'num_epoch', 'num_layers',
        'time_horizon', 'sequence_length', 'summary_freq', 'use_recurrent',
        'summary_path', 'memory_size', 'use_curiosity', 'curiosity_strength',
        'curiosity_enc_size', 'model_path'
    ]
    self.check_param_keys()
    self.use_curiosity = bool(trainer_parameters['use_curiosity'])
    self.step = 0
    self.policy = PPOPolicy(seed, brain, trainer_parameters, self.is_training, load)
    stats = {
        'Environment/Cumulative Reward': [],
        'Environment/Episode Length': [],
        'Policy/Value Estimate': [],
        'Policy/Entropy': [],
        'Losses/Value Loss': [],
        'Losses/Policy Loss': [],
        'Policy/Learning Rate': []
    }
    if self.use_curiosity:
        stats['Losses/Forward Loss'] = []
        stats['Losses/Inverse Loss'] = []
        stats['Policy/Curiosity Reward'] = []
        self.intrinsic_rewards = {}
    self.stats = stats
    self.training_buffer = Buffer()
    self.cumulative_rewards = {}
    self._reward_buffer = deque(maxlen=reward_buff_cap)
    self.episode_steps = {}
    self.summary_path = trainer_parameters['summary_path']
    if not os.path.exists(self.summary_path):
        os.makedirs(self.summary_path)
    self.summary_writer = tf.summary.FileWriter(self.summary_path)
def __init__(self, *args, **kwargs):
    super(RLTrainer, self).__init__(*args, **kwargs)
    # Make sure we have at least one reward_signal
    if not self.trainer_parameters["reward_signals"]:
        raise UnityTrainerException(
            "No reward signals were defined. At least one must be used with {}."
            .format(self.__class__.__name__))
    # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
    # used for reporting only. We always want to report the environment reward to Tensorboard, regardless
    # of what reward signals are actually present.
    self.collected_rewards = {"environment": {}}
    self.training_buffer = Buffer()
    self.episode_steps = {}
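# Hypothetical sketch (not part of the original source): the reward_signals
# check above expects trainer_parameters to define at least one signal. A
# typical ML-Agents-style nested configuration looks like the following;
# the strength/gamma values are illustrative placeholders.
trainer_parameters = {
    # ... other hyperparameters ...
    "reward_signals": {
        "extrinsic": {"strength": 1.0, "gamma": 0.99},
        "curiosity": {"strength": 0.01, "gamma": 0.99, "encoding_size": 128},
    },
}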
def construct_fake_buffer():
    # Values encode their origin: hundreds digit = agent id, tens digit = step,
    # ones digit = element index within the observation/action.
    b = Buffer()
    for fake_agent_id in range(4):
        for step in range(9):
            b[fake_agent_id]["vector_observation"].append([
                100 * fake_agent_id + 10 * step + 1,
                100 * fake_agent_id + 10 * step + 2,
                100 * fake_agent_id + 10 * step + 3,
            ])
            b[fake_agent_id]["action"].append([
                100 * fake_agent_id + 10 * step + 4,
                100 * fake_agent_id + 10 * step + 5,
            ])
    return b
def create_buffer(brain_infos, brain_params, sequence_length, memory_size=8):
    buffer = Buffer()
    # Make a buffer
    for idx, experience in enumerate(brain_infos):
        if idx > len(brain_infos) - 2:
            break
        current_brain_info = brain_infos[idx]
        next_brain_info = brain_infos[idx + 1]
        buffer[0].last_brain_info = current_brain_info
        buffer[0]["done"].append(next_brain_info.local_done[0])
        buffer[0]["rewards"].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            buffer[0]["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0])
            buffer[0]["next_visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0])
        if brain_params.vector_observation_space_size > 0:
            buffer[0]["vector_obs"].append(
                current_brain_info.vector_observations[0])
            buffer[0]["next_vector_in"].append(
                current_brain_info.vector_observations[0])
        fake_action_size = len(brain_params.vector_action_space_size)
        if brain_params.vector_action_space_type == "continuous":
            fake_action_size = brain_params.vector_action_space_size[0]
        buffer[0]["actions"].append(np.zeros(fake_action_size))
        buffer[0]["prev_action"].append(np.zeros(fake_action_size))
        buffer[0]["masks"].append(1.0)
        buffer[0]["advantages"].append(1.0)
        if brain_params.vector_action_space_type == "discrete":
            buffer[0]["action_probs"].append(
                np.ones(sum(brain_params.vector_action_space_size)))
        else:
            buffer[0]["action_probs"].append(
                np.ones(buffer[0]["actions"][0].shape))
        buffer[0]["actions_pre"].append(np.ones(buffer[0]["actions"][0].shape))
        buffer[0]["random_normal_epsilon"].append(
            np.ones(buffer[0]["actions"][0].shape))
        buffer[0]["action_mask"].append(
            np.ones(np.sum(brain_params.vector_action_space_size)))
        buffer[0]["memory"].append(np.ones(memory_size))
    buffer.append_update_buffer(0, batch_size=None, training_length=sequence_length)
    return buffer
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    brain_params: BrainParameters,
    sequence_length: int,
) -> Buffer:
    # Create and populate buffer using experiences
    demo_buffer = Buffer()
    for idx, experience in enumerate(pair_infos):
        if idx > len(pair_infos) - 2:
            break
        current_pair_info = pair_infos[idx]
        next_pair_info = pair_infos[idx + 1]
        current_brain_info = BrainInfo.from_agent_proto(
            0, [current_pair_info.agent_info], brain_params)
        next_brain_info = BrainInfo.from_agent_proto(
            0, [next_pair_info.agent_info], brain_params)
        previous_action = np.array(pair_infos[idx].action_info.vector_actions) * 0
        if idx > 0:
            previous_action = np.array(pair_infos[idx - 1].action_info.vector_actions)
        demo_buffer[0].last_brain_info = current_brain_info
        demo_buffer[0]["done"].append(next_brain_info.local_done[0])
        demo_buffer[0]["rewards"].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            demo_buffer[0]["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0])
        if brain_params.vector_observation_space_size > 0:
            demo_buffer[0]["vector_obs"].append(
                current_brain_info.vector_observations[0])
        demo_buffer[0]["actions"].append(current_pair_info.action_info.vector_actions)
        demo_buffer[0]["prev_action"].append(previous_action)
        if next_brain_info.local_done[0]:
            demo_buffer.append_update_buffer(
                0, batch_size=None, training_length=sequence_length)
            demo_buffer.reset_local_buffers()
    demo_buffer.append_update_buffer(
        0, batch_size=None, training_length=sequence_length)
    return demo_buffer
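# Hypothetical wiring sketch (not part of the original source): this proto
# variant of make_demo_buffer is typically fed by a demonstration loader such
# as the load_demonstration call referenced (commented out) in demo_to_buffer
# below. load_demonstration is assumed to return the brain parameters, the
# recorded AgentInfoActionPairProto list, and the expected pair count.
def demo_to_buffer_from_protos(
    file_path: str, sequence_length: int
) -> Tuple[BrainParameters, Buffer]:
    brain_params, pair_infos, _ = load_demonstration(file_path)
    demo_buffer = make_demo_buffer(pair_infos, brain_params, sequence_length)
    return brain_params, demo_buffer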
def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id):
    """
    Responsible for collecting experiences and training a PPO model.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param reward_buff_cap: Max reward history to track in the reward buffer
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded.
    :param seed: The seed the model will be initialized with
    :param run_id: The identifier of the current run
    """
    super(PPOTrainer, self).__init__(brain, trainer_parameters, training, run_id)
    self.param_keys = [
        "batch_size", "beta", "buffer_size", "epsilon", "gamma", "hidden_units",
        "lambd", "learning_rate", "max_steps", "normalize", "num_epoch", "num_layers",
        "time_horizon", "sequence_length", "summary_freq", "use_recurrent",
        "summary_path", "memory_size", "use_curiosity", "curiosity_strength",
        "curiosity_enc_size", "model_path",
    ]
    self.check_param_keys()
    self.use_curiosity = bool(trainer_parameters["use_curiosity"])
    self.step = 0
    self.policy = PPOPolicy(seed, brain, trainer_parameters, self.is_training, load)
    stats = {
        "Environment/Cumulative Reward": [],
        "Environment/Episode Length": [],
        "Policy/Value Estimate": [],
        "Policy/Entropy": [],
        "Losses/Value Loss": [],
        "Losses/Policy Loss": [],
        "Policy/Learning Rate": [],
    }
    if self.use_curiosity:
        stats["Losses/Forward Loss"] = []
        stats["Losses/Inverse Loss"] = []
        stats["Policy/Curiosity Reward"] = []
        self.intrinsic_rewards = {}
    self.stats = stats
    self.training_buffer = Buffer()
    self.cumulative_rewards = {}
    self._reward_buffer = deque(maxlen=reward_buff_cap)
    self.episode_steps = {}
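# Hypothetical usage sketch (not part of the original source): a minimal
# trainer_parameters dictionary covering every key that check_param_keys
# validates for the PPOTrainer above. Values are illustrative placeholders,
# not recommended hyperparameters.
ppo_trainer_parameters = {
    "batch_size": 1024,
    "beta": 5.0e-3,
    "buffer_size": 10240,
    "epsilon": 0.2,
    "gamma": 0.99,
    "hidden_units": 128,
    "lambd": 0.95,
    "learning_rate": 3.0e-4,
    "max_steps": 5.0e5,
    "normalize": False,
    "num_epoch": 3,
    "num_layers": 2,
    "time_horizon": 64,
    "sequence_length": 64,
    "summary_freq": 1000,
    "use_recurrent": False,
    "summary_path": "./summaries/run_id",
    "memory_size": 256,
    "use_curiosity": False,
    "curiosity_strength": 0.01,
    "curiosity_enc_size": 128,
    "model_path": "./models/run_id",
}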
def demo_to_buffer(file_path: str, sequence_length: int) -> Tuple[BrainParameters, Buffer]:
    """
    Loads demonstration file and uses it to fill training buffer.
    :param file_path: Location of demonstration file (.demo).
    :param sequence_length: Length of trajectories to fill buffer.
    :return: The brain parameters and the populated demonstration buffer.
    """
    # Early exit if in inference mode:
    # export EVALUATION_STAGE='testing'
    EVALUATION_STAGE = os.getenv('EVALUATION_STAGE', '')
    if EVALUATION_STAGE == 'testing':
        demo_buffer = Buffer()
        brain_params = MineRLToMLAgentWrapper.get_brain_params(file_path)
        return brain_params, demo_buffer

    # The dataset is available in the data/ directory from the repository root.
    # MINERL_DATA_ROOT = os.getenv('MINERL_DATA_ROOT', 'data/')
    logger.info("Building data pipeline for {}".format(file_path))
    data = minerl.data.make(file_path)

    report_trajs = []
    trajs = [
        'v1_other_pomegranite_orc-12_24007-29518',
        'v1_right_mushroom_fire-breathing_dragon_41653-47509',
        'v1_juvenile_apple_angel-7_205561-212353',
        'v1_juvenile_apple_angel-6_221-11831',
        'v1_equal_olive_chimera-7_10379-19453',
        'v1_unselfish_blood_orange_savage-18_19656-23843',
        'v1_other_pomegranite_orc-12_31579-36826',
        'v1_svelte_cherry_devil-17_314-11959',
        'v1_agonizing_kale_tree_nymph-7_133235-141843',
        'v1_unselfish_blood_orange_savage-18_14639-19416',
        'v1_unselfish_blood_orange_savage-18_399-10066',
        'v1_courageous_rutabaga_nessie-1_3069-13764',
        'v1_agonizing_kale_tree_nymph-20_289-7919',
        'v1_right_mushroom_fire-breathing_dragon_88565-95177',
        'v1_last_prune_swamp_monster-2_2208-8442',
        'v1_excellent_mango_beast-6_43472-48953',
        'v1_bogus_guava_djinn-17_23146-31716',
        'v1_splendid_brussels_sprout_pegasus-5_45696-54118',
        'v1_agonizing_kale_tree_nymph-7_106750-114380',
        'v1_right_mushroom_fire-breathing_dragon_7211-17977',
        'v1_agonizing_kale_tree_nymph-20_7989-16044',
        'v1_excellent_mango_beast-6_20909-29943',
        'v1_villainous_black_eyed_peas_loch_ness_monster-1_82621-93105',
        'v1_subtle_iceberg_lettuce_nymph-4_16111-20545',
        'v1_agonizing_kale_tree_nymph-7_74962-82761',
        'v1_juvenile_apple_angel-5_4254-15273',
        'v1_conscious_tangerine_rain_bird-23_48769-59333',
        'v1_absolute_grape_changeling-6_37339-46767',
        'v1_equal_olive_chimera-9_14563-24740',
        'v1_juvenile_apple_angel-7_158092-167444',
        'v1_bogus_guava_djinn-2_19159-30071',
        'v1_other_pomegranite_orc-12_16800-22992'
    ]
    # trajs = data.get_trajectory_names()

    all_demo = dict()
    brain_infos = []
    brain_params = MineRLToMLAgentWrapper.get_brain_params(file_path)
    agent_id = 'fake_id'
    import gc
    # stream_name = random.choice(trajs)
    for stream_name in trajs:
        demo = Object()
        logger.info("Loading data for {}...".format(stream_name))
        demo.data_frames = list(data.load_data(stream_name, include_metadata=True))
        demo.meta = demo.data_frames[0][-1]
        cum_rewards = np.cumsum([x[2] for x in demo.data_frames])
        demo.file_len = len(demo.data_frames)
        logger.info("Data loading complete for {}!".format(stream_name))
        logger.info("META DATA: {}".format(demo.meta))
        demo.height, demo.width = data.observation_space.spaces['pov'].shape[:2]
        # all_demo[stream_name] = demo
        if not demo.meta['success']:
            logger.info("SKIP as success=False")
            continue
        if int(demo.meta['duration_steps']) > 12000:
            logger.info("****HACK**** SKIP as > 12k steps")
            continue
        if int(demo.meta['total_reward']) < 1024:
            logger.info("ERROR: score must be >= 1024 because a diamond = 1024 points")
            continue
        logger.info("*** PASSED CHECKS ****")
        report_trajs.append(stream_name)

        running_reward = 0
        for i, frame in enumerate(demo.data_frames):
            # frame layout: (obs, action, reward, next_obs, done, metadata)
            ob = frame[0]
            action = frame[1]
            # action = np.hstack([v for v in action.values()])
            reward = float(frame[2])
            ob = frame[3]  # pair the post-step observation with reward/done
            done = frame[4]
            meta_data = frame[5]
            running_reward += reward
            info = {
                'stream_name': meta_data['stream_name'],
                'duration_steps': meta_data['duration_steps'],
                'total_reward': meta_data['total_reward'],
                'success': meta_data['success'],
                'step': i,
                'running_reward': running_reward
            }
            max_reached = i + 1 == meta_data['duration_steps']
            brain_info = MineRLToMLAgentWrapper.create_brain_info(
                ob=ob, agent_id=agent_id, brain_params=brain_params,
                reward=reward, done=done, info=info, action=action,
                max_reached=max_reached)
            brain_info = MineRLToMLAgentWrapper.process_brain_info_through_wrapped_envs(
                file_path, brain_info)
            brain_infos.append(brain_info)
            del frame[3]  # drop the stored observation to free memory
        del demo.data_frames
        del demo
        gc.collect()

    # brain_params, brain_infos, _ = load_demonstration(file_path)
    demo_buffer = make_demo_buffer(brain_infos, brain_params, sequence_length)
    del brain_infos
    gc.collect()
    logger.info("report_trajs = " + str([str(i) for i in report_trajs]))
    return brain_params, demo_buffer
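# Hypothetical usage sketch (not part of the original source): exercising the
# EVALUATION_STAGE early exit of demo_to_buffer above. With the variable set,
# the function returns the brain parameters and an empty Buffer without
# touching the MineRL dataset. The environment name below is an illustrative
# placeholder.
import os

os.environ['EVALUATION_STAGE'] = 'testing'
brain_params, demo_buffer = demo_to_buffer('MineRLObtainDiamond-v0', sequence_length=1)
assert len(demo_buffer.update_buffer) == 0  # nothing was loaded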