Example #1
    def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
        """
        Responsible for collecting experiences and training a behavioral cloning (BC) model.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        :param load: Whether the model should be loaded.
        :param seed: The seed the model will be initialized with.
        :param run_id: The identifier of the current run.
        """
        super(BCTrainer, self).__init__(brain, trainer_parameters, training, run_id)
        self.policy = BCPolicy(seed, brain, trainer_parameters, load)
        self.n_sequences = 1
        self.cumulative_rewards = {}
        self.episode_steps = {}
        self.stats = {'Losses/Cloning Loss': [], 'Environment/Episode Length': [],
                      'Environment/Cumulative Reward': []}

        self.summary_path = trainer_parameters['summary_path']
        self.batches_per_epoch = trainer_parameters['batches_per_epoch']
        if not os.path.exists(self.summary_path):
            os.makedirs(self.summary_path)

        self.demonstration_buffer = Buffer()
        self.evaluation_buffer = Buffer()
        self.summary_writer = tf.summary.FileWriter(self.summary_path)
Example #2
    def __init__(self, sess, brain, reward_buff_cap, trainer_parameters,
                 training, seed, run_id):
        """
        Responsible for collecting experiences and training PPO model.
        :param sess: Tensorflow session.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        """
        super(PPOTrainer, self).__init__(sess, brain.brain_name,
                                         trainer_parameters, training, run_id)

        self.param_keys = [
            'batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma',
            'hidden_units', 'lambd', 'learning_rate', 'max_steps', 'normalize',
            'num_epoch', 'num_layers', 'time_horizon', 'sequence_length',
            'summary_freq', 'use_recurrent', 'graph_scope', 'summary_path',
            'memory_size', 'use_curiosity', 'curiosity_strength',
            'curiosity_enc_size'
        ]

        for k in self.param_keys:
            if k not in trainer_parameters:
                raise UnityTrainerException(
                    "The hyperparameter {0} could not be found for the PPO trainer of "
                    "brain {1}.".format(k, brain.brain_name))

        self.use_curiosity = bool(trainer_parameters['use_curiosity'])

        self.step = 0

        self.policy = PPOPolicy(seed, brain, trainer_parameters, sess,
                                self.is_training)

        stats = {
            'cumulative_reward': [],
            'episode_length': [],
            'value_estimate': [],
            'entropy': [],
            'value_loss': [],
            'policy_loss': [],
            'learning_rate': []
        }
        if self.use_curiosity:
            stats['forward_loss'] = []
            stats['inverse_loss'] = []
            stats['intrinsic_reward'] = []
            self.intrinsic_rewards = {}
        self.stats = stats

        self.training_buffer = Buffer()
        self.training_buffer2 = Buffer()

        self.cumulative_rewards = {}
        self._reward_buffer = deque(maxlen=reward_buff_cap)
        self.episode_steps = {}
        self.summary_path = trainer_parameters['summary_path']
        if not os.path.exists(self.summary_path):
            os.makedirs(self.summary_path)

        self.summary_writer = tf.summary.FileWriter(self.summary_path)
Example #3
    def __init__(self, brain, trainer_parameters, training, load, seed,
                 run_id):
        """
        Responsible for collecting experiences and training a behavioral cloning (BC) model.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        :param load: Whether the model should be loaded.
        :param seed: The seed the model will be initialized with.
        :param run_id: The identifier of the current run.
        """
        super(BCTrainer, self).__init__(brain, trainer_parameters, training,
                                        run_id)
        self.policy = BCPolicy(seed, brain, trainer_parameters, load)
        self.n_sequences = 1
        self.cumulative_rewards = {}
        self.episode_steps = {}
        self.stats = {
            "Losses/Cloning Loss": [],
            "Environment/Episode Length": [],
            "Environment/Cumulative Reward": [],
        }

        self.batches_per_epoch = trainer_parameters["batches_per_epoch"]

        self.demonstration_buffer = Buffer()
        self.evaluation_buffer = Buffer()
Example #4
def make_demo_buffer(
    brain_infos: List[BrainInfo], brain_params: BrainParameters, sequence_length: int
) -> Buffer:
    # Create and populate buffer using experiences
    demo_buffer = Buffer()
    for idx, experience in enumerate(brain_infos):
        if idx > len(brain_infos) - 2:
            break
        current_brain_info = brain_infos[idx]
        next_brain_info = brain_infos[idx + 1]
        demo_buffer[0].last_brain_info = current_brain_info
        demo_buffer[0]["done"].append(next_brain_info.local_done[0])
        demo_buffer[0]["rewards"].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            demo_buffer[0]["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0]
            )
        if brain_params.vector_observation_space_size > 0:
            demo_buffer[0]["vector_obs"].append(
                current_brain_info.vector_observations[0]
            )
        demo_buffer[0]["actions"].append(next_brain_info.previous_vector_actions[0])
        demo_buffer[0]["prev_action"].append(
            current_brain_info.previous_vector_actions[0]
        )
        if next_brain_info.local_done[0]:
            demo_buffer.append_update_buffer(
                0, batch_size=None, training_length=sequence_length
            )
            demo_buffer.reset_local_buffers()
    demo_buffer.append_update_buffer(
        0, batch_size=None, training_length=sequence_length
    )
    return demo_buffer
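
A short usage sketch (not part of the original source): assuming a list of recorded BrainInfo objects named recorded_infos, plus the brain_params and a sequence length of 64 as illustrative values, the returned buffer's update_buffer can then be sampled with make_mini_batch, as the test further below does.

# Hypothetical names: `recorded_infos`, `brain_params`, and sequence_length=64
# are assumptions for illustration only.
demo_buffer = make_demo_buffer(recorded_infos, brain_params, sequence_length=64)

# The update buffer now holds whole sequences; take the first one as a mini-batch.
mini_batch = demo_buffer.update_buffer.make_mini_batch(start=0, end=1)
print(len(demo_buffer.update_buffer["actions"]), mini_batch.keys())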
Example #5
    def __init__(self, sess, brain, trainer_parameters, training, seed, run_id):
        """
        Responsible for collecting experiences and training a behavioral cloning model.
        :param sess: Tensorflow session.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        """
        super(BehavioralCloningTrainer, self).__init__(sess, brain, trainer_parameters, training, run_id)

        self.param_keys = ['brain_to_imitate', 'batch_size', 'time_horizon',
                           'graph_scope', 'summary_freq', 'max_steps',
                           'batches_per_epoch', 'use_recurrent',
                           'hidden_units', 'learning_rate', 'num_layers',
                           'sequence_length', 'memory_size']

        for k in self.param_keys:
            if k not in trainer_parameters:
                raise UnityTrainerException("The hyperparameter {0} could not be found for the Imitation trainer of "
                                            "brain {1}.".format(k, brain.brain_name))

        self.policy = BCPolicy(seed, brain, trainer_parameters, sess)
        self.brain_name = brain.brain_name
        self.brain_to_imitate = trainer_parameters['brain_to_imitate']
        self.batches_per_epoch = trainer_parameters['batches_per_epoch']
        self.n_sequences = max(int(trainer_parameters['batch_size'] / self.policy.sequence_length), 1)
        self.cumulative_rewards = {}
        self.episode_steps = {}
        self.stats = {'losses': [], 'episode_length': [], 'cumulative_reward': []}

        self.training_buffer = Buffer()
        self.summary_path = trainer_parameters['summary_path']
        if not os.path.exists(self.summary_path):
            os.makedirs(self.summary_path)

        self.summary_writer = tf.summary.FileWriter(self.summary_path)
Example #6
def test_buffer():
    b = Buffer()
    for fake_agent_id in range(4):
        for step in range(9):
            b[fake_agent_id]['vector_observation'].append(
                [100 * fake_agent_id + 10 * step + 1,
                 100 * fake_agent_id + 10 * step + 2,
                 100 * fake_agent_id + 10 * step + 3]
            )
            b[fake_agent_id]['action'].append([100 * fake_agent_id + 10 * step + 4,
                                               100 * fake_agent_id + 10 * step + 5])
    a = b[1]['vector_observation'].get_batch(batch_size=2, training_length=1, sequential=True)
    assert_array(a, np.array([[171, 172, 173], [181, 182, 183]]))
    a = b[2]['vector_observation'].get_batch(batch_size=2, training_length=3, sequential=True)
    assert_array(a, np.array([
        [[231, 232, 233], [241, 242, 243], [251, 252, 253]],
        [[261, 262, 263], [271, 272, 273], [281, 282, 283]]
    ]))
    a = b[2]['vector_observation'].get_batch(batch_size=2, training_length=3, sequential=False)
    assert_array(a, np.array([
        [[251, 252, 253], [261, 262, 263], [271, 272, 273]],
        [[261, 262, 263], [271, 272, 273], [281, 282, 283]]
    ]))
    b[4].reset_agent()
    assert len(b[4]) == 0
    b.append_update_buffer(3, batch_size=None, training_length=2)
    b.append_update_buffer(2, batch_size=None, training_length=2)
    assert len(b.update_buffer['action']) == 10
    assert np.array(b.update_buffer['action']).shape == (10, 2, 2)

    c = b.update_buffer.make_mini_batch(start=0, end=1)
    assert c.keys() == b.update_buffer.keys()
    assert c['action'].shape == (1, 2, 2)
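
Judging from the assertions above, sequential=True appears to return the last batch_size non-overlapping windows of training_length consecutive entries, while sequential=False returns windows shifted by a single step so that they overlap. A minimal restatement, reusing the buffer b built in the test:

# Non-overlapping windows: the last two length-3 runs of agent 2's history.
disjoint = b[2]['vector_observation'].get_batch(
    batch_size=2, training_length=3, sequential=True)

# Overlapping windows ending at the last step, shifted by one entry each.
overlapping = b[2]['vector_observation'].get_batch(
    batch_size=2, training_length=3, sequential=False)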
Example #7
def make_demo_buffer(brain_infos, brain_params, sequence_length):
    # Create and populate buffer using experiences
    demo_buffer = Buffer()
    for idx, experience in enumerate(brain_infos):
        if idx > len(brain_infos) - 2:
            break
        current_brain_info = brain_infos[idx]
        next_brain_info = brain_infos[idx + 1]
        demo_buffer[0].last_brain_info = current_brain_info
        demo_buffer[0]['done'].append(next_brain_info.local_done[0])
        demo_buffer[0]['rewards'].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            demo_buffer[0]['visual_obs%d' % i] \
                .append(current_brain_info.visual_observations[i][0])
        if brain_params.vector_observation_space_size > 0:
            demo_buffer[0]['vector_obs'] \
                .append(current_brain_info.vector_observations[0])
        demo_buffer[0]['actions'].append(next_brain_info.previous_vector_actions[0])
        if next_brain_info.local_done[0]:
            demo_buffer.append_update_buffer(0, batch_size=None,
                                             training_length=sequence_length)
            demo_buffer.reset_local_buffers()
    demo_buffer.append_update_buffer(0, batch_size=None,
                                     training_length=sequence_length)
    return demo_buffer
Example #8
    def __init__(self, brain, reward_buff_cap, trainer_parameters, training,
                 load, seed, run_id):
        """
        Responsible for collecting experiences and training PPO model.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param reward_buff_cap: Max reward history to track in the reward buffer
        :param training: Whether the trainer is set for training.
        :param load: Whether the model should be loaded.
        :param seed: The seed the model will be initialized with
        :param run_id: The identifier of the current run
        """
        super().__init__(brain, trainer_parameters, training, run_id,
                         reward_buff_cap)
        self.param_keys = [
            "batch_size",
            "beta",
            "buffer_size",
            "epsilon",
            "hidden_units",
            "lambd",
            "learning_rate",
            "max_steps",
            "normalize",
            "num_epoch",
            "num_layers",
            "time_horizon",
            "sequence_length",
            "summary_freq",
            "use_recurrent",
            "summary_path",
            "memory_size",
            "model_path",
            "reward_signals",
        ]
        self.check_param_keys()

        # Make sure we have at least one reward_signal
        if not self.trainer_parameters["reward_signals"]:
            raise UnityTrainerException(
                "No reward signals were defined. At least one must be used with {}."
                .format(self.__class__.__name__))

        self.step = 0
        self.policy = PPOPolicy(seed, brain, trainer_parameters,
                                self.is_training, load)

        stats = defaultdict(list)
        # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
        # used for reporting only. We always want to report the environment reward to Tensorboard, regardless
        # of what reward signals are actually present.
        self.collected_rewards = {"environment": {}}
        for _reward_signal in self.policy.reward_signals.keys():
            self.collected_rewards[_reward_signal] = {}

        self.stats = stats

        self.training_buffer = Buffer()
        self.episode_steps = {}
Example #9
    def __init__(self, brain, reward_buff_cap, trainer_parameters, training,
                 load, seed, run_id):
        """
        Responsible for collecting experiences and training PPO model.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        :param load: Whether the model should be loaded.
        :param seed: The seed the model will be initialized with
        :param run_id: The identifier of the current run
        """
        super(PPOTrainer, self).__init__(brain, trainer_parameters, training,
                                         run_id)
        self.param_keys = [
            'batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma',
            'hidden_units', 'lambd', 'learning_rate', 'max_steps', 'normalize',
            'num_epoch', 'num_layers', 'time_horizon', 'sequence_length',
            'summary_freq', 'use_recurrent', 'summary_path', 'memory_size',
            'use_curiosity', 'curiosity_strength', 'curiosity_enc_size',
            'model_path'
        ]

        self.check_param_keys()
        self.use_curiosity = bool(trainer_parameters['use_curiosity'])
        self.step = 0
        self.policy = PPOPolicy(seed, brain, trainer_parameters,
                                self.is_training, load)

        stats = {
            'Environment/Cumulative Reward': [],
            'Environment/Episode Length': [],
            'Policy/Value Estimate': [],
            'Policy/Entropy': [],
            'Losses/Value Loss': [],
            'Losses/Policy Loss': [],
            'Policy/Learning Rate': []
        }
        if self.use_curiosity:
            stats['Losses/Forward Loss'] = []
            stats['Losses/Inverse Loss'] = []
            stats['Policy/Curiosity Reward'] = []
            self.intrinsic_rewards = {}
        self.stats = stats

        self.training_buffer = Buffer()
        self.cumulative_rewards = {}
        self._reward_buffer = deque(maxlen=reward_buff_cap)
        self.episode_steps = {}
        self.summary_path = trainer_parameters['summary_path']
        if not os.path.exists(self.summary_path):
            os.makedirs(self.summary_path)

        self.summary_writer = tf.summary.FileWriter(self.summary_path)
Example #10
    def __init__(self, *args, **kwargs):
        super(RLTrainer, self).__init__(*args, **kwargs)
        # Make sure we have at least one reward_signal
        if not self.trainer_parameters["reward_signals"]:
            raise UnityTrainerException(
                "No reward signals were defined. At least one must be used with {}."
                .format(self.__class__.__name__))
        # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
        # used for reporting only. We always want to report the environment reward to Tensorboard, regardless
        # of what reward signals are actually present.
        self.collected_rewards = {"environment": {}}
        self.training_buffer = Buffer()
        self.episode_steps = {}
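
A hedged sketch of how the collected_rewards structure above can be used: one dictionary of agent_id to cumulative reward per reward-signal name, with "environment" always present so the environment reward is reported to Tensorboard. The "curiosity" signal name, the agent id, and the per-step reward values are assumptions for illustration only.

# Assumed signal name ("curiosity"), agent id, and reward values, for illustration.
collected_rewards = {"environment": {}, "curiosity": {}}
agent_id = "agent-0"
step_rewards = {"environment": 1.0, "curiosity": 0.05}
for name, reward in step_rewards.items():
    # Accumulate each signal's reward separately per agent.
    collected_rewards[name][agent_id] = collected_rewards[name].get(agent_id, 0.0) + reward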
Example #11
def construct_fake_buffer():
    b = Buffer()
    for fake_agent_id in range(4):
        for step in range(9):
            b[fake_agent_id]["vector_observation"].append([
                100 * fake_agent_id + 10 * step + 1,
                100 * fake_agent_id + 10 * step + 2,
                100 * fake_agent_id + 10 * step + 3,
            ])
            b[fake_agent_id]["action"].append([
                100 * fake_agent_id + 10 * step + 4,
                100 * fake_agent_id + 10 * step + 5,
            ])
    return b
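
A brief sketch, mirroring the calls made in test_buffer above, of how this fake buffer is typically flushed into the shared update buffer and then sampled:

b = construct_fake_buffer()
# Move two agents' local histories into the update buffer as sequences of length 2.
b.append_update_buffer(2, batch_size=None, training_length=2)
b.append_update_buffer(3, batch_size=None, training_length=2)
mini_batch = b.update_buffer.make_mini_batch(start=0, end=1)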
Example #12
def create_buffer(brain_infos, brain_params, sequence_length, memory_size=8):
    buffer = Buffer()
    # Make a buffer
    for idx, experience in enumerate(brain_infos):
        if idx > len(brain_infos) - 2:
            break
        current_brain_info = brain_infos[idx]
        next_brain_info = brain_infos[idx + 1]
        buffer[0].last_brain_info = current_brain_info
        buffer[0]["done"].append(next_brain_info.local_done[0])
        buffer[0]["rewards"].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            buffer[0]["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0])
            buffer[0]["next_visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0])
        if brain_params.vector_observation_space_size > 0:
            buffer[0]["vector_obs"].append(
                current_brain_info.vector_observations[0])
            buffer[0]["next_vector_in"].append(
                current_brain_info.vector_observations[0])
        fake_action_size = len(brain_params.vector_action_space_size)
        if brain_params.vector_action_space_type == "continuous":
            fake_action_size = brain_params.vector_action_space_size[0]
        buffer[0]["actions"].append(np.zeros(fake_action_size))
        buffer[0]["prev_action"].append(np.zeros(fake_action_size))
        buffer[0]["masks"].append(1.0)
        buffer[0]["advantages"].append(1.0)
        if brain_params.vector_action_space_type == "discrete":
            buffer[0]["action_probs"].append(
                np.ones(sum(brain_params.vector_action_space_size)))
        else:
            buffer[0]["action_probs"].append(
                np.ones(buffer[0]["actions"][0].shape))
        buffer[0]["actions_pre"].append(np.ones(buffer[0]["actions"][0].shape))
        buffer[0]["random_normal_epsilon"].append(
            np.ones(buffer[0]["actions"][0].shape))
        buffer[0]["action_mask"].append(
            np.ones(np.sum(brain_params.vector_action_space_size)))
        buffer[0]["memory"].append(np.ones(memory_size))

    buffer.append_update_buffer(0,
                                batch_size=None,
                                training_length=sequence_length)
    return buffer
Example #13
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    brain_params: BrainParameters,
    sequence_length: int,
) -> Buffer:
    # Create and populate buffer using experiences
    demo_buffer = Buffer()
    for idx, experience in enumerate(pair_infos):
        if idx > len(pair_infos) - 2:
            break
        current_pair_info = pair_infos[idx]
        next_pair_info = pair_infos[idx + 1]
        current_brain_info = BrainInfo.from_agent_proto(
            0, [current_pair_info.agent_info], brain_params)
        next_brain_info = BrainInfo.from_agent_proto(
            0, [next_pair_info.agent_info], brain_params)
        previous_action = np.array(
            pair_infos[idx].action_info.vector_actions) * 0
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions)
        demo_buffer[0].last_brain_info = current_brain_info
        demo_buffer[0]["done"].append(next_brain_info.local_done[0])
        demo_buffer[0]["rewards"].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            demo_buffer[0]["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0])
        if brain_params.vector_observation_space_size > 0:
            demo_buffer[0]["vector_obs"].append(
                current_brain_info.vector_observations[0])
        demo_buffer[0]["actions"].append(
            current_pair_info.action_info.vector_actions)
        demo_buffer[0]["prev_action"].append(previous_action)
        if next_brain_info.local_done[0]:
            demo_buffer.append_update_buffer(0,
                                             batch_size=None,
                                             training_length=sequence_length)
            demo_buffer.reset_local_buffers()
    demo_buffer.append_update_buffer(0,
                                     batch_size=None,
                                     training_length=sequence_length)
    return demo_buffer
Example #14
    def __init__(self, brain, reward_buff_cap, trainer_parameters, training,
                 load, seed, run_id):
        """
        Responsible for collecting experiences and training PPO model.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        :param load: Whether the model should be loaded.
        :param seed: The seed the model will be initialized with
        :param run_id: The identifier of the current run
        """
        super(PPOTrainer, self).__init__(brain, trainer_parameters, training,
                                         run_id)
        self.param_keys = [
            "batch_size",
            "beta",
            "buffer_size",
            "epsilon",
            "gamma",
            "hidden_units",
            "lambd",
            "learning_rate",
            "max_steps",
            "normalize",
            "num_epoch",
            "num_layers",
            "time_horizon",
            "sequence_length",
            "summary_freq",
            "use_recurrent",
            "summary_path",
            "memory_size",
            "use_curiosity",
            "curiosity_strength",
            "curiosity_enc_size",
            "model_path",
        ]

        self.check_param_keys()
        self.use_curiosity = bool(trainer_parameters["use_curiosity"])
        self.step = 0
        self.policy = PPOPolicy(seed, brain, trainer_parameters,
                                self.is_training, load)

        stats = {
            "Environment/Cumulative Reward": [],
            "Environment/Episode Length": [],
            "Policy/Value Estimate": [],
            "Policy/Entropy": [],
            "Losses/Value Loss": [],
            "Losses/Policy Loss": [],
            "Policy/Learning Rate": [],
        }
        if self.use_curiosity:
            stats["Losses/Forward Loss"] = []
            stats["Losses/Inverse Loss"] = []
            stats["Policy/Curiosity Reward"] = []
            self.intrinsic_rewards = {}
        self.stats = stats

        self.training_buffer = Buffer()
        self.cumulative_rewards = {}
        self._reward_buffer = deque(maxlen=reward_buff_cap)
        self.episode_steps = {}
Example #15
def demo_to_buffer(file_path: str,
                   sequence_length: int) -> Tuple[BrainParameters, Buffer]:
    """
    Loads demonstration file and uses it to fill training buffer.
    :param file_path: Location of demonstration file (.demo).
    :param sequence_length: Length of trajectories to fill buffer.
    :return:
    """

    # early exit if inference mode
    # export EVALUATION_STAGE='testing'
    EVALUATION_STAGE = os.getenv('EVALUATION_STAGE', '')
    if EVALUATION_STAGE == 'testing':
        demo_buffer = Buffer()
        brain_params = MineRLToMLAgentWrapper.get_brain_params(file_path)
        return brain_params, demo_buffer

    # # The dataset is available in data/ directory from repository root.
    # MINERL_DATA_ROOT = os.getenv('MINERL_DATA_ROOT', 'data/')

    logger.info("Building data pipeline for {}".format(file_path))
    data = minerl.data.make(file_path)

    report_trajs = []

    trajs = [
        'v1_other_pomegranite_orc-12_24007-29518',
        'v1_right_mushroom_fire-breathing_dragon_41653-47509',
        'v1_juvenile_apple_angel-7_205561-212353',
        'v1_juvenile_apple_angel-6_221-11831',
        'v1_equal_olive_chimera-7_10379-19453',
        'v1_unselfish_blood_orange_savage-18_19656-23843',
        'v1_other_pomegranite_orc-12_31579-36826',
        'v1_svelte_cherry_devil-17_314-11959',
        'v1_agonizing_kale_tree_nymph-7_133235-141843',
        'v1_unselfish_blood_orange_savage-18_14639-19416',
        'v1_unselfish_blood_orange_savage-18_399-10066',
        'v1_courageous_rutabaga_nessie-1_3069-13764',
        'v1_agonizing_kale_tree_nymph-20_289-7919',
        'v1_right_mushroom_fire-breathing_dragon_88565-95177',
        'v1_last_prune_swamp_monster-2_2208-8442',
        'v1_excellent_mango_beast-6_43472-48953',
        'v1_bogus_guava_djinn-17_23146-31716',
        'v1_splendid_brussels_sprout_pegasus-5_45696-54118',
        'v1_agonizing_kale_tree_nymph-7_106750-114380',
        'v1_right_mushroom_fire-breathing_dragon_7211-17977',
        'v1_agonizing_kale_tree_nymph-20_7989-16044',
        'v1_excellent_mango_beast-6_20909-29943',
        'v1_villainous_black_eyed_peas_loch_ness_monster-1_82621-93105',
        'v1_subtle_iceberg_lettuce_nymph-4_16111-20545',
        'v1_agonizing_kale_tree_nymph-7_74962-82761',
        'v1_juvenile_apple_angel-5_4254-15273',
        'v1_conscious_tangerine_rain_bird-23_48769-59333',
        'v1_absolute_grape_changeling-6_37339-46767',
        'v1_equal_olive_chimera-9_14563-24740',
        'v1_juvenile_apple_angel-7_158092-167444',
        'v1_bogus_guava_djinn-2_19159-30071',
        'v1_other_pomegranite_orc-12_16800-22992'
    ]
    # trajs = data.get_trajectory_names()

    all_demo = dict()
    brain_infos = []
    brain_params = MineRLToMLAgentWrapper.get_brain_params(file_path)
    agent_id = 'fake_id'
    # stream_name = random.choice(trajs)
    for stream_name in trajs:
        demo = Object()
        logger.info("Loading data for {}...".format(stream_name))
        demo.data_frames = list(
            data.load_data(stream_name, include_metadata=True))
        demo.meta = demo.data_frames[0][-1]
        cum_rewards = np.cumsum([x[2] for x in demo.data_frames])
        demo.file_len = len(demo.data_frames)
        logger.info("Data loading complete!".format(stream_name))
        logger.info("META DATA: {}".format(demo.meta))
        demo.height, demo.width = data.observation_space.spaces[
            'pov'].shape[:2]
        # all_demo[stream_name]=demo

        if not demo.meta['success']:
            logger.info("SKIP as success=False")
            continue
        if int(demo.meta['duration_steps']) > 12000:
            logger.info("****HACK**** SKIP as > 12k steps")
            continue
        if int(demo.meta['total_reward']) < 1024:
            logger.info(
                "ERROR score must be > 1024 because of dimond = 1024 points")
            continue

        logger.info("*** PASSED CHECKS ****")
        report_trajs.append(stream_name)

        running_reward = 0
        for i, frame in enumerate(demo.data_frames):
            ob = frame[0]
            action = frame[1]
            # action=np.hstack([v for v in action.values()])
            reward = float(frame[2])
            ob = frame[3]
            done = frame[4]
            meta_data = frame[5]
            running_reward += reward
            info = {
                'stream_name': meta_data['stream_name'],
                'duration_steps': meta_data['duration_steps'],
                'total_reward': meta_data['total_reward'],
                'success': meta_data['success'],
                'step': i,
                'running_reward': running_reward
            }
            max_reached = i + 1 == meta_data['duration_steps']
            brain_info = MineRLToMLAgentWrapper.create_brain_info(
                ob=ob,
                agent_id=agent_id,
                brain_params=brain_params,
                reward=reward,
                done=done,
                info=info,
                action=action,
                max_reached=max_reached)
            brain_info = MineRLToMLAgentWrapper.process_brain_info_through_wrapped_envs(
                file_path, brain_info)
            brain_infos.append(brain_info)

            del frame[3]  # obs, free for memory
        del demo.data_frames
        del demo
        import gc
        gc.collect()

    # brain_params, brain_infos, _ = load_demonstration(file_path)
    demo_buffer = make_demo_buffer(brain_infos, brain_params, sequence_length)

    del brain_infos
    import gc
    gc.collect()

    logger.info("report_trajs = " + str([str(i) for i in report_trajs]))

    return brain_params, demo_buffer
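
A hypothetical call, assuming a MineRL dataset name like the one handed to minerl.data.make() above and a policy sequence length of 64; both values are assumptions for illustration, and the MineRLToMLAgentWrapper setup above must already be in place.

# Assumed dataset name and sequence length, for illustration only.
brain_params, demo_buffer = demo_to_buffer("MineRLObtainDiamond-v0", sequence_length=64)
print(len(demo_buffer.update_buffer["actions"]), "demonstration steps loaded")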