Example #1
    def construct(variable_scope: str, devices: List[str], *args, **kwargs) -> 'GeneralTensorFlowNetwork':
        """
        Construct a network class using the provided variable scope and on requested devices
        :param variable_scope: string specifying variable scope under which to create network variables
        :param devices: list of devices (can be list of Device objects, or string for TF distributed)
        :param args: all other arguments for class initializer
        :param kwargs: all other keyword arguments for class initializer
        :return: a GeneralTensorFlowNetwork object
        """
        if len(devices) > 1:
            screen.warning("Tensorflow implementation only support a single device. Using {}".format(devices[0]))

        def construct_on_device():
            with tf.device(GeneralTensorFlowNetwork._tf_device(devices[0])):
                return GeneralTensorFlowNetwork(*args, **kwargs)

        # If variable_scope is in our dictionary, then this is not the first time that this variable_scope
        # is being used with construct(). So to avoid TF adding an incrementing number to the end of the
        # variable_scope to uniquify it, we have to both pass the previous variable_scope object to the new
        # variable_scope() call and also recover the name scope using name_scope
        if variable_scope in GeneralTensorFlowNetwork.variable_scopes_dict:
            variable_scope = GeneralTensorFlowNetwork.variable_scopes_dict[variable_scope]
            with tf.variable_scope(variable_scope, auxiliary_name_scope=False) as vs:
                with tf.name_scope(vs.original_name_scope):
                    return construct_on_device()
        else:
            with tf.variable_scope(variable_scope, auxiliary_name_scope=True) as vs:
                # Add variable_scope object to dictionary for next call to construct
                GeneralTensorFlowNetwork.variable_scopes_dict[variable_scope] = vs
                return construct_on_device()
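
The scope bookkeeping above can be reproduced in isolation. A minimal sketch, assuming the TensorFlow 1.x graph API (tf.compat.v1 under TF 2): by re-entering the stored VariableScope object and recovering its original name scope, repeated construction calls keep the original 'net/' prefix instead of being uniquified to 'net_1/', 'net_2/', and so on.

import tensorflow as tf  # TF 1.x graph API assumed (tf.compat.v1 under TF 2)

scopes = {}  # scope name -> VariableScope object, mirroring variable_scopes_dict above

def build_op(scope_name):
    # Create an op under `scope_name`; repeated calls keep the same scope prefix.
    if scope_name in scopes:
        # Re-enter the stored scope object and recover its original name scope,
        # so TF does not uniquify the prefix on subsequent calls.
        with tf.variable_scope(scopes[scope_name], auxiliary_name_scope=False) as vs:
            with tf.name_scope(vs.original_name_scope):
                return tf.add(1.0, 2.0, name='sum')
    with tf.variable_scope(scope_name) as vs:
        scopes[scope_name] = vs
        return tf.add(1.0, 2.0, name='sum')

print(build_op('net').name)  # net/sum:0
print(build_op('net').name)  # net/sum_1:0 -- the op name is uniquified, but the 'net/' prefix is preserved
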
Example #2
    def run_graph_manager(self, graph_manager: 'GraphManager',
                          args: argparse.Namespace):
        if args.distributed_coach and not graph_manager.agent_params.algorithm.distributed_coach_synchronization_type:
            screen.error(
                "{} algorithm is not supported using distributed Coach.".
                format(graph_manager.agent_params.algorithm))

        if args.distributed_coach and args.checkpoint_save_secs and graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == DistributedCoachSynchronizationType.SYNC:
            screen.warning(
                "The --checkpoint_save_secs or -s argument will be ignored as SYNC distributed coach sync type is used. Checkpoint will be saved every training iteration."
            )

        if args.distributed_coach and not args.checkpoint_save_secs and graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == DistributedCoachSynchronizationType.ASYNC:
            screen.error(
                "Distributed coach with ASYNC distributed coach sync type requires --checkpoint_save_secs or -s."
            )

        # Intel optimized TF seems to run significantly faster when limiting to a single OMP thread.
        # This will not affect GPU runs.
        os.environ["OMP_NUM_THREADS"] = "1"

        # turn TF debug prints off
        if args.framework == Frameworks.tensorflow:
            os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_verbosity)

        # register the experiment summary to run at exit, unless disabled or running distributed Coach
        if not args.no_summary and not args.distributed_coach:
            atexit.register(logger.summarize_experiment)
            screen.change_terminal_title(args.experiment_name)

        task_parameters = TaskParameters(
            framework_type=args.framework,
            evaluate_only=args.evaluate,
            experiment_path=args.experiment_path,
            seed=args.seed,
            use_cpu=args.use_cpu,
            checkpoint_save_secs=args.checkpoint_save_secs,
            checkpoint_restore_dir=args.checkpoint_restore_dir,
            checkpoint_save_dir=args.checkpoint_save_dir,
            export_onnx_graph=args.export_onnx_graph,
            apply_stop_condition=args.apply_stop_condition)

        # open dashboard
        if args.open_dashboard:
            open_dashboard(args.experiment_path)

        if args.distributed_coach and args.distributed_coach_run_type != RunType.ORCHESTRATOR:
            handle_distributed_coach_tasks(graph_manager, args,
                                           task_parameters)
            return

        if args.distributed_coach and args.distributed_coach_run_type == RunType.ORCHESTRATOR:
            handle_distributed_coach_orchestrator(args)
            return

        # Single-threaded runs
        if args.num_workers == 1:
            self.start_single_threaded(task_parameters, graph_manager, args)
        else:
            self.start_multi_threaded(graph_manager, args)
Example #3
    def restore_checkpoint(self):
        self.verify_graph_was_created()

        # TODO: find better way to load checkpoints that were saved with a global network into the online network
        if self.task_parameters.checkpoint_restore_dir:
            if self.task_parameters.framework_type == Frameworks.tensorflow and\
                    'checkpoint' in os.listdir(self.task_parameters.checkpoint_restore_dir):
                # TODO-fixme checkpointing
                # MonitoredTrainingSession manages save/restore checkpoints autonomously. In doing so,
                # it creates its own names for the saved checkpoints, which do not match the "{}_Step-{}.ckpt"
                # filename pattern. The names used are maintained in a CheckpointState protobuf file named
                # 'checkpoint'. Using Coach's '.coach_checkpoint' protobuf file results in an error when trying
                # to restore the model, as the checkpoint names defined there do not match the actual checkpoint names.
                checkpoint = self._get_checkpoint_state_tf()
            else:
                checkpoint = get_checkpoint_state(
                    self.task_parameters.checkpoint_restore_dir)

            if checkpoint is None:
                screen.warning("No checkpoint to restore in: {}".format(
                    self.task_parameters.checkpoint_restore_dir))
            else:
                screen.log_title("Loading checkpoint: {}".format(
                    checkpoint.model_checkpoint_path))
                self.checkpoint_saver.restore(self.sess,
                                              checkpoint.model_checkpoint_path)

            for manager in self.level_managers:
                manager.restore_checkpoint(
                    self.task_parameters.checkpoint_restore_dir)
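
For the TensorFlow branch above, the 'checkpoint' file in the restore directory is a CheckpointState protobuf that TensorFlow itself can parse. A minimal sketch of inspecting it directly, assuming TF 1.x; the directory path is hypothetical:

import tensorflow as tf  # TF 1.x assumed

restore_dir = '/tmp/experiment/checkpoint'  # hypothetical checkpoint_restore_dir

# Parses the CheckpointState protobuf file named 'checkpoint' written by TF savers
# (e.g. by MonitoredTrainingSession), as opposed to Coach's '.coach_checkpoint' file.
ckpt = tf.train.get_checkpoint_state(restore_dir)
if ckpt is None:
    print("No checkpoint to restore in: {}".format(restore_dir))
else:
    print("Latest checkpoint:", ckpt.model_checkpoint_path)
    print("All checkpoints:", list(ckpt.all_model_checkpoint_paths))
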
Example #4
    def learn_from_batch(self, batch):
        # batch contains a list of episodes to learn from
        network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()

        # get the values for the current states

        result = self.networks['main'].online_network.predict(batch.states(network_keys))
        current_state_values = result[0]

        self.state_values.add_sample(current_state_values)

        # the targets for the state value estimator
        num_transitions = batch.size
        state_value_head_targets = np.zeros((num_transitions, 1))

        # estimate the advantage function
        action_advantages = np.zeros((num_transitions, 1))

        if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
            if batch.game_overs()[-1]:
                R = 0
            else:
                R = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))[0]

            for i in reversed(range(num_transitions)):
                R = batch.rewards()[i] + self.ap.algorithm.discount * R
                state_value_head_targets[i] = R
                action_advantages[i] = R - current_state_values[i]

        elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
            # get bootstraps
            bootstrapped_value = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))[0]
            values = np.append(current_state_values, bootstrapped_value)
            if batch.game_overs()[-1]:
                values[-1] = 0

            # get general discounted returns table
            gae_values, state_value_head_targets = self.get_general_advantage_estimation_values(batch.rewards(), values)
            action_advantages = np.vstack(gae_values)
        else:
            screen.warning("WARNING: The requested policy gradient rescaler is not available")

        action_advantages = action_advantages.squeeze(axis=-1)
        actions = batch.actions()
        if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) < 2:
            actions = np.expand_dims(actions, -1)

        # train
        result = self.networks['main'].online_network.accumulate_gradients({**batch.states(network_keys),
                                                                            'output_1_0': actions},
                                                                       [state_value_head_targets, action_advantages])

        # logging
        total_loss, losses, unclipped_grads = result[:3]
        self.action_advantages.add_sample(action_advantages)
        self.unclipped_grads.add_sample(unclipped_grads)
        self.value_loss.add_sample(losses[0])
        self.policy_loss.add_sample(losses[1])

        return total_loss, losses, unclipped_grads
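
The A_VALUE branch above boils down to a backward discounted-return recursion; a standalone numpy sketch with made-up numbers (the value-head targets are the returns, and the advantages are returns minus the predicted state values):

import numpy as np

rewards = np.array([1.0, 0.0, 2.0])        # r_t for a toy rollout
state_values = np.array([0.5, 1.0, 1.5])   # V(s_t) predicted by the value head
discount = 0.99
R = 0.0  # bootstrap value of the state after the last transition; 0 because the episode ended

# Backward recursion R_t = r_t + discount * R_{t+1}, as in the A_VALUE branch above.
returns = np.zeros_like(rewards)
for i in reversed(range(len(rewards))):
    R = rewards[i] + discount * R
    returns[i] = R

advantages = returns - state_values
# returns    ~ [2.96, 1.98, 2.0]  -> targets for the state value head
# advantages ~ [2.46, 0.98, 0.5]  -> rescaler for the policy head
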
Example #5
    def load_pickled(self, file_path: str) -> None:
        """
        Restore the replay buffer contents from a pickle file.
        The pickle file is assumed to include a list of episodes.
        :param file_path: The path to a pickle file to restore
        """
        self.assert_not_frozen()

        with open(file_path, 'rb') as file:
            episodes = pickle.load(file)
            num_transitions = sum([len(e.transitions) for e in episodes])
            if num_transitions > self.max_size[1]:
                screen.warning(
                    "Warning! The number of transition to load into the replay buffer ({}) is "
                    "bigger than the max size of the replay buffer ({}). The excessive transitions will "
                    "not be stored.".format(num_transitions, self.max_size[1]))

            progress_bar = ProgressBar(len(episodes))
            for episode_idx, episode in enumerate(episodes):
                self.store_episode(episode)

                # print progress
                progress_bar.update(episode_idx)

            progress_bar.close()
Example #6
    def _create_graph(self, task_parameters: TaskParameters) -> Tuple[List[LevelManager], List[Environment]]:
        if self.env_params:
            # environment loading
            self.env_params.seed = task_parameters.seed
            self.env_params.experiment_path = task_parameters.experiment_path
            env = short_dynamic_import(self.env_params.path)(**self.env_params.__dict__,
                                                             visualization_parameters=self.visualization_parameters)
        else:
            env = None

        # Only DQN variants and NEC are supported at this point.
        assert(isinstance(self.agent_params, DQNAgentParameters) or isinstance(self.agent_params, NECAgentParameters))
        # Only Episodic memories are supported,
        # for evaluating the sequential doubly robust estimator
        assert(isinstance(self.agent_params.memory, EpisodicExperienceReplayParameters))

        # agent loading
        self.agent_params.task_parameters = task_parameters  # TODO: this should probably be passed in a different way
        self.agent_params.name = "agent"
        self.agent_params.is_batch_rl_training = True
        self.agent_params.network_wrappers['main'].should_get_softmax_probabilities = True

        if 'reward_model' not in self.agent_params.network_wrappers:
            # the user hasn't defined params for the reward model, so we will use the same params as the
            # 'main' network.
            self.agent_params.network_wrappers['reward_model'] = deepcopy(self.agent_params.network_wrappers['main'])

        self.agent = short_dynamic_import(self.agent_params.path)(self.agent_params)
        agents = {'agent': self.agent}

        if not self.is_collecting_random_dataset:
            self.experience_generating_agent_params.visualization.dump_csv = False
            self.experience_generating_agent_params.task_parameters = task_parameters
            self.experience_generating_agent_params.name = "experience_gen_agent"
            self.experience_generating_agent_params.network_wrappers['main'].should_get_softmax_probabilities = True

            # we need to set these manually as these are usually being set for us only for the default agent
            self.experience_generating_agent_params.input_filter = self.agent_params.input_filter
            self.experience_generating_agent_params.output_filter = self.agent_params.output_filter

            self.experience_generating_agent = short_dynamic_import(
                self.experience_generating_agent_params.path)(self.experience_generating_agent_params)

            agents['experience_generating_agent'] = self.experience_generating_agent

        if not env and not self.agent_params.memory.load_memory_from_file_path:
            screen.warning("A BatchRLGraph requires setting a dataset to load into the agent's memory or alternatively "
                           "using an environment to create a (random) dataset from. This agent should only be used for "
                           "inference. ")
        # set level manager
        # - although we will be using each agent separately, both agents have to be initialized together with the
        #   LevelManager, so that they are both properly initialized
        level_manager = LevelManager(agents=agents,
                                     environment=env, name="main_level",
                                     spaces_definition=self.spaces_definition)
        if env:
            return [level_manager], [env]
        else:
            return [level_manager], []
Example #7
    def load_csv(self, csv_dataset: CsvDataset) -> None:
        """
        Restore the replay buffer contents from a csv file.
        The csv file is assumed to include a list of transitions.
        :param csv_dataset: A construct which holds the dataset parameters
        """
        self.assert_not_frozen()

        df = pd.read_csv(csv_dataset.filepath)
        if len(df) > self.max_size[1]:
            screen.warning(
                "Warning! The number of transitions to load into the replay buffer ({}) is "
                "bigger than the max size of the replay buffer ({}). The excessive transitions will "
                "not be stored.".format(len(df), self.max_size[1]))

        episode_ids = df['episode_id'].unique()
        progress_bar = ProgressBar(len(episode_ids))
        state_columns = [
            col for col in df.columns if col.startswith('state_feature')
        ]

        for e_id in episode_ids:
            progress_bar.update(e_id)
            df_episode_transitions = df[df['episode_id'] == e_id]
            episode = Episode()
            for (_, current_transition), (_, next_transition) in zip(
                    df_episode_transitions[:-1].iterrows(),
                    df_episode_transitions[1:].iterrows()):
                state = np.array(
                    [current_transition[col] for col in state_columns])
                next_state = np.array(
                    [next_transition[col] for col in state_columns])

                episode.insert(
                    Transition(
                        state={'observation': state},
                        action=current_transition['action'],
                        reward=current_transition['reward'],
                        next_state={'observation': next_state},
                        game_over=False,
                        info={
                            'all_action_probabilities':
                            ast.literal_eval(
                                current_transition['all_action_probabilities'])
                        }))

            # Set the last transition to end the episode
            if csv_dataset.is_episodic:
                episode.get_last_transition().game_over = True

            self.store_episode(episode)

        # close the progress bar
        progress_bar.update(len(episode_ids))
        progress_bar.close()

        self.shuffle_episodes()
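
The loader above implies a specific CSV layout; a minimal sketch of producing a compatible file with pandas, with the column names inferred from load_csv (episode_id, state_feature_*, action, reward, all_action_probabilities) and purely illustrative values. Note that within each episode the last row only contributes the next state of the preceding transition.

import pandas as pd

# Two tiny episodes; the schema is inferred from the reader above, the values are made up.
df = pd.DataFrame({
    'episode_id':      [0, 0, 0, 1, 1],
    'state_feature_0': [0.1, 0.2, 0.3, 0.5, 0.6],
    'state_feature_1': [1.0, 1.1, 1.2, 2.0, 2.1],
    'action':          [0, 1, 0, 1, 0],
    'reward':          [0.0, 1.0, 0.0, 0.0, 1.0],
    # stored as a string and parsed back with ast.literal_eval by the loader
    'all_action_probabilities': ['[0.5, 0.5]'] * 5,
})
df.to_csv('dataset.csv', index=False)  # hypothetical path; wrap it in a CsvDataset to feed load_csv
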
Example #8
    def learn_from_batch_off_policy(self, batch):
        # batch contains a list of episodes to learn from
        network_keys = self.ap.network_wrappers[
            'main'].input_embedders_parameters.keys()

        # get the values for the current states
        result = self.networks['main'].online_network.predict(
            batch.states(network_keys))
        current_state_values = result[0]
        self.state_values.add_sample(current_state_values)

        # the targets for the state value estimator are max(R, V) which is the same as clipping the error to > 0
        num_transitions = batch.size
        state_value_head_targets = np.maximum(
            batch.total_returns(expand_dims=True), current_state_values)

        # estimate the advantage function
        action_advantages = np.zeros((num_transitions, 1))

        if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
            action_advantages = batch.total_returns(
            ) - current_state_values.squeeze()
            # clip negative advantages to get the SIL rescaler (R - V)+
            action_advantages = np.clip(action_advantages, 0, np.inf)
        else:
            screen.warning(
                "WARNING: The requested policy gradient rescaler is not available"
            )

        # extract action indices
        actions = batch.actions()
        if not isinstance(self.spaces.action,
                          DiscreteActionSpace) and len(actions.shape) < 2:
            actions = np.expand_dims(actions, -1)

        # update errors in prioritized replay buffer
        importance_weights = self.update_transition_priorities_and_get_weights(
            action_advantages, batch)

        # train
        result = self.networks['main'].train_and_sync_networks(
            {
                **batch.states(network_keys), 'output_1_0': actions
            }, [state_value_head_targets, action_advantages],
            importance_weights=importance_weights)

        # logging
        total_loss, losses, unclipped_grads = result[:3]
        self.action_advantages.add_sample(action_advantages)
        self.unclipped_grads.add_sample(unclipped_grads)
        self.value_loss.add_sample(losses[0])
        self.policy_loss.add_sample(losses[1])

        return total_loss, losses, unclipped_grads
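
The comment above about max(R, V) being equivalent to clipping the value error at zero can be checked directly; a tiny numpy sketch with made-up numbers:

import numpy as np

returns = np.array([3.0, 1.0, 2.0])       # total returns R
state_values = np.array([2.5, 1.5, 2.0])  # predicted V

targets = np.maximum(returns, state_values)              # [3.0, 1.5, 2.0]
advantages = np.clip(returns - state_values, 0, np.inf)  # [0.5, 0.0, 0.0]

# targets - state_values == advantages, i.e. the value error is the clipped (R - V)+ term.
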
Example #9
    def fill_advantages(self, batch):
        network_keys = self.ap.network_wrappers[
            'main'].input_embedders_parameters.keys()

        current_state_values = self.networks['main'].online_network.predict(
            batch.states(network_keys))[0]
        current_state_values = current_state_values.squeeze()
        self.state_values.add_sample(current_state_values)

        # calculate advantages
        advantages = []
        value_targets = []
        total_returns = batch.n_step_discounted_rewards()

        if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
            advantages = total_returns - current_state_values
        elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
            # get bootstraps
            episode_start_idx = 0
            advantages = np.array([])
            value_targets = np.array([])
            for idx, game_over in enumerate(batch.game_overs()):
                if game_over:
                    # get advantages for the rollout
                    value_bootstrapping = np.zeros((1, ))
                    rollout_state_values = np.append(
                        current_state_values[episode_start_idx:idx + 1],
                        value_bootstrapping)

                    rollout_advantages, gae_based_value_targets = \
                        self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx+1],
                                                                     rollout_state_values)
                    episode_start_idx = idx + 1
                    advantages = np.append(advantages, rollout_advantages)
                    value_targets = np.append(value_targets,
                                              gae_based_value_targets)
        else:
            screen.warning(
                "WARNING: The requested policy gradient rescaler is not available"
            )

        # standardize
        advantages = (advantages - np.mean(advantages)) / np.std(advantages)

        for transition, advantage, value_target in zip(batch.transitions,
                                                       advantages,
                                                       value_targets):
            transition.info['advantage'] = advantage
            transition.info['gae_based_value_target'] = value_target

        self.action_advantages.add_sample(advantages)
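
The per-rollout work delegated to get_general_advantage_estimation_values above follows the standard GAE estimator. A minimal numpy sketch of that recursion, with made-up numbers and the usual gamma/lambda parameters (this is not Coach's actual implementation):

import numpy as np

def gae(rewards, values, gamma=0.99, lam=0.95):
    # `values` has one more entry than `rewards`: the bootstrap value of the state
    # after the last transition (zero at episode end).
    deltas = rewards + gamma * values[1:] - values[:-1]   # TD residuals
    advantages = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    value_targets = advantages + values[:-1]              # GAE-based value targets
    return advantages, value_targets

adv, targets = gae(np.array([1.0, 0.0, 1.0]), np.array([0.5, 0.4, 0.3, 0.0]))
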
Example #10
 def default_preset_name(self):
     """
     Sub-classes will typically return a single hard-coded string.
     """
     try:
         #TODO: remove this after converting all samples.
         default_preset = self.DEFAULT_PRESET
         screen.warning("Deprecated configuration of default preset.  Please implement default_preset_name()")
         return default_preset
     except AttributeError:
         pass
     raise NotImplementedError("Sub-classes must specify the name of the default preset "+
                               "for this RL problem.  This will be the name of a python "+
                               "file (without .py) that defines a graph_manager variable")
Example #11
 def __init__(self,
              framework_type: Frameworks = Frameworks.tensorflow,
              evaluate_only: int = None,
              use_cpu: bool = False,
              experiment_path='/tmp',
              seed=None,
              checkpoint_save_secs=None,
              checkpoint_restore_dir=None,
              checkpoint_restore_path=None,
              checkpoint_save_dir=None,
              export_onnx_graph: bool = False,
              apply_stop_condition: bool = False,
              num_gpu: int = 1):
     """
     :param framework_type: deep learning framework type. currently only tensorflow is supported
     :param evaluate_only: if not None, the task will be used only for evaluating the model for the given number of steps.
                             A value of 0 means that task will be evaluated for an infinite number of steps.
     :param use_cpu: use the cpu for this task
     :param experiment_path: the path to the directory which will store all the experiment outputs
     :param seed: a seed to use for the random numbers generator
     :param checkpoint_save_secs: the number of seconds between each checkpoint saving
     :param checkpoint_restore_dir:
             [DEPRECATED - will be removed in one of the next releases - switch to checkpoint_restore_path]
             the dir to restore the checkpoints from
     :param checkpoint_restore_path: the path to restore the checkpoints from
     :param checkpoint_save_dir: the directory to store the checkpoints in
     :param export_onnx_graph: If set to True, this will export an onnx graph each time a checkpoint is saved
     :param apply_stop_condition: If set to True, this will apply the stop condition defined by reaching a target success rate
     :param num_gpu: number of GPUs to use
     """
     self.framework_type = framework_type
     self.task_index = 0  # TODO: not really needed
     self.evaluate_only = evaluate_only
     self.use_cpu = use_cpu
     self.experiment_path = experiment_path
     self.checkpoint_save_secs = checkpoint_save_secs
     if checkpoint_restore_dir:
         screen.warning(
             'TaskParameters.checkpoint_restore_dir is DEPRECATED and will be removed in one of the next '
             'releases. Please switch to using TaskParameters.checkpoint_restore_path, with your '
             'directory path. ')
         self.checkpoint_restore_path = checkpoint_restore_dir
     else:
         self.checkpoint_restore_path = checkpoint_restore_path
     self.checkpoint_save_dir = checkpoint_save_dir
     self.seed = seed
     self.export_onnx_graph = export_onnx_graph
     self.apply_stop_condition = apply_stop_condition
     self.num_gpu = num_gpu
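
Given the deprecation handling above, new code should pass checkpoint_restore_path rather than checkpoint_restore_dir. A minimal usage sketch; the import path is the usual rl_coach location, and the paths and intervals are hypothetical:

from rl_coach.base_parameters import TaskParameters, Frameworks  # assumed import path

task_parameters = TaskParameters(
    framework_type=Frameworks.tensorflow,
    evaluate_only=None,                                       # train; 0 would mean evaluate indefinitely
    experiment_path='/tmp/my_experiment',                     # hypothetical path
    checkpoint_save_secs=600,
    checkpoint_restore_path='/tmp/my_experiment/checkpoint',  # replaces the deprecated checkpoint_restore_dir
    checkpoint_save_dir='/tmp/my_experiment/checkpoint',
    export_onnx_graph=False,
    apply_stop_condition=False)
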
Example #12
    def _create_graph(
        self, task_parameters: TaskParameters
    ) -> Tuple[List[LevelManager], List[Environment]]:
        if self.env_params:
            # environment loading
            self.env_params.seed = task_parameters.seed
            self.env_params.experiment_path = task_parameters.experiment_path
            env = short_dynamic_import(self.env_params.path)(
                **self.env_params.__dict__,
                visualization_parameters=self.visualization_parameters)
        else:
            env = None

        # Only DQN variants and NEC are supported at this point.
        assert (isinstance(self.agent_params, DQNAgentParameters)
                or isinstance(self.agent_params, NECAgentParameters))
        # Only Episodic memories are supported,
        # for evaluating the sequential doubly robust estimator
        assert (isinstance(self.agent_params.memory,
                           EpisodicExperienceReplayParameters))

        # agent loading
        self.agent_params.task_parameters = task_parameters  # TODO: this should probably be passed in a different way
        self.agent_params.name = "agent"
        self.agent_params.is_batch_rl_training = True

        if 'reward_model' not in self.agent_params.network_wrappers:
            # the user hasn't defined params for the reward model, so we will use the same params as the
            # 'main' network.
            self.agent_params.network_wrappers['reward_model'] = deepcopy(
                self.agent_params.network_wrappers['main'])

        agent = short_dynamic_import(self.agent_params.path)(self.agent_params)

        if not env and not self.agent_params.memory.load_memory_from_file_path:
            screen.warning(
                "A BatchRLGraph requires setting a dataset to load into the agent's memory or alternatively "
                "using an environment to create a (random) dataset from. This agent should only be used for "
                "inference. ")
        # set level manager
        level_manager = LevelManager(agents=agent,
                                     environment=env,
                                     name="main_level",
                                     spaces_definition=self.spaces_definition)

        if env:
            return [level_manager], [env]
        else:
            return [level_manager], []
Example #13
 def _save_onnx_model(self):
     ckpt_dir = '/opt/ml/output/data/checkpoint'
     model_dir = '/opt/ml/model'
     # find latest onnx file
     # currently done by name; expected to change in a future release of Coach.
     glob_pattern = os.path.join(ckpt_dir, '*.onnx')
     onnx_files = [file for file in glob.iglob(glob_pattern, recursive=True)]
     if len(onnx_files) > 0:
         extract_step = lambda string: int(re.search(r'/(\d*)_Step.*', string, re.IGNORECASE).group(1))
         onnx_files.sort(key=extract_step)
         latest_onnx_file = onnx_files[-1]
         # move to model directory
         filepath_from = os.path.abspath(latest_onnx_file)
         filepath_to = os.path.join(model_dir, "model.onnx")
         shutil.move(filepath_from, filepath_to)
     else:
         screen.warning("No ONNX files found in {}".format(ckpt_dir))
Example #14
    def fill_advantages(self, batch):
        batch = Batch(batch)
        network_keys = self.ap.network_wrappers[
            'critic'].input_embedders_parameters.keys()

        # * Found not to have any impact *
        # current_states_with_timestep = self.concat_state_and_timestep(batch)

        current_state_values = self.networks['critic'].online_network.predict(
            batch.states(network_keys)).squeeze()
        total_returns = batch.n_step_discounted_rewards()
        # calculate advantages
        advantages = []
        if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
            advantages = total_returns - current_state_values
        elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
            # get bootstraps
            episode_start_idx = 0
            advantages = np.array([])
            # current_state_values[batch.game_overs()] = 0
            for idx, game_over in enumerate(batch.game_overs()):
                if game_over:
                    # get advantages for the rollout
                    value_bootstrapping = np.zeros((1, ))
                    rollout_state_values = np.append(
                        current_state_values[episode_start_idx:idx + 1],
                        value_bootstrapping)

                    rollout_advantages, _ = \
                        self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx+1],
                                                                     rollout_state_values)
                    episode_start_idx = idx + 1
                    advantages = np.append(advantages, rollout_advantages)
        else:
            screen.warning(
                "WARNING: The requested policy gradient rescaler is not available"
            )

        # standardize
        advantages = (advantages - np.mean(advantages)) / np.std(advantages)

        # TODO: this will be problematic with a shared memory
        for transition, advantage in zip(self.memory.transitions, advantages):
            transition.info['advantage'] = advantage

        self.action_advantages.add_sample(advantages)
Example #15
    def learn_from_batch(self, batch):
        # batch contains a list of episodes to learn from
        network_keys = self.ap.network_wrappers[
            'main'].input_embedders_parameters.keys()

        total_returns = batch.total_returns()
        for i in reversed(range(batch.size)):
            if self.policy_gradient_rescaler == PolicyGradientRescaler.TOTAL_RETURN:
                total_returns[i] = total_returns[0]
            elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN:
                # just take the total return as it is
                pass
            elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE:
                # we can get a single transition episode while playing Doom Basic, causing the std to be 0
                if self.std_discounted_return != 0:
                    total_returns[i] = (total_returns[i] -
                                        self.mean_discounted_return
                                        ) / self.std_discounted_return
                else:
                    total_returns[i] = 0
            elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP:
                total_returns[i] -= self.mean_return_over_multiple_episodes[i]
            else:
                screen.warning(
                    "WARNING: The requested policy gradient rescaler is not available"
                )

        targets = total_returns
        actions = batch.actions()
        if type(self.spaces.action) != DiscreteActionSpace and len(
                actions.shape) < 2:
            actions = np.expand_dims(actions, -1)

        self.returns_mean.add_sample(np.mean(total_returns))
        self.returns_variance.add_sample(np.std(total_returns))

        result = self.networks['main'].online_network.accumulate_gradients(
            {
                **batch.states(network_keys), 'output_0_0': actions
            }, targets)
        total_loss, losses, unclipped_grads = result[:3]

        return total_loss, losses, unclipped_grads
Example #16
 def _save_onnx_model(self):
     from .onnx_utils import fix_onnx_model
     ckpt_dir = '/opt/ml/output/data/checkpoint'
     model_dir = '/opt/ml/model'
     # find latest onnx file
     # currently done by name; expected to change in a future release of Coach.
     glob_pattern = os.path.join(ckpt_dir, '*.onnx')
     onnx_files = [file for file in glob.iglob(glob_pattern, recursive=True)]
     if len(onnx_files) > 0:
         extract_step = lambda string: int(re.search(r'/(\d*)_Step.*', string, re.IGNORECASE).group(1))
         onnx_files.sort(key=extract_step)
         latest_onnx_file = onnx_files[-1]
         # move to model directory
         filepath_from = os.path.abspath(latest_onnx_file)
         filepath_to = os.path.join(model_dir, "model.onnx")
         shutil.move(filepath_from, filepath_to)
         fix_onnx_model(filepath_to)
     else:
         screen.warning("No ONNX files found in {}".format(ckpt_dir))
Example #17
    def load_pickled(self, file_path: str) -> None:
        """
        Restore the replay buffer contents from a pickle file.
        The pickle file is assumed to include a list of transitions.
        :param file_path: The path to a pickle file to restore
        """
        with open(file_path, 'rb') as file:
            transitions = pickle.load(file)
            num_transitions = len(transitions)
            if num_transitions > self.max_size[1]:
                screen.warning("Warning! The number of transition to load into the replay buffer ({}) is "
                               "bigger than the max size of the replay buffer ({}). The excessive transitions will "
                               "not be stored.".format(num_transitions, self.max_size[1]))

            progress_bar = ProgressBar(num_transitions)
            for transition_idx, transition in enumerate(transitions):
                self.store(transition)

                # print progress
                if transition_idx % 100 == 0:
                    progress_bar.update(transition_idx)

            progress_bar.close()
Example #18
    def get_config_args(self,
                        parser: argparse.ArgumentParser) -> argparse.Namespace:
        """
        Returns a Namespace object with all the user-specified configuration options needed to launch.
        This implementation uses argparse to take arguments from the CLI, but it can be overridden by
        another method that gets its configuration from elsewhere. An equivalent method, however, must
        return an identically structured Namespace object, which conforms to the structure defined by
        get_argument_parser.

        This method parses the arguments that the user entered, performs some basic validation, and
        expands short-form user-specified values into their explicit forms.

        :param parser: a parser object which implicitly defines the format of the Namespace that
                       is expected to be returned.
        :return: the parsed arguments as a Namespace
        """
        args = parser.parse_args()

        if args.nocolor:
            screen.set_use_colors(False)

        # if no arg is given
        if len(sys.argv) == 1:
            parser.print_help()
            sys.exit(1)

        # list available presets
        if args.list:
            self.display_all_presets_and_exit()

        # Read args from config file for distributed Coach.
        if args.distributed_coach and args.distributed_coach_run_type == RunType.ORCHESTRATOR:
            coach_config = ConfigParser({
                'image': '',
                'memory_backend': 'redispubsub',
                'data_store': 's3',
                's3_end_point': 's3.amazonaws.com',
                's3_bucket_name': '',
                's3_creds_file': ''
            })
            try:
                coach_config.read(args.distributed_coach_config_path)
                args.image = coach_config.get('coach', 'image')
                args.memory_backend = coach_config.get('coach',
                                                       'memory_backend')
                args.data_store = coach_config.get('coach', 'data_store')
                if args.data_store == 's3':
                    args.s3_end_point = coach_config.get(
                        'coach', 's3_end_point')
                    args.s3_bucket_name = coach_config.get(
                        'coach', 's3_bucket_name')
                    args.s3_creds_file = coach_config.get(
                        'coach', 's3_creds_file')
            except Error as e:
                screen.error(
                    "Error when reading distributed Coach config file: {}".
                    format(e))

            if args.image == '':
                screen.error("Image cannot be empty.")

            data_store_choices = ['s3', 'nfs']
            if args.data_store not in data_store_choices:
                screen.warning("{} data store is unsupported.".format(
                    args.data_store))
                screen.error(
                    "Supported data stores are {}.".format(data_store_choices))

            memory_backend_choices = ['redispubsub']
            if args.memory_backend not in memory_backend_choices:
                screen.warning("{} memory backend is not supported.".format(
                    args.memory_backend))
                screen.error("Supported memory backends are {}.".format(
                    memory_backend_choices))

            if args.data_store == 's3':
                if args.s3_bucket_name == '':
                    screen.error("S3 bucket name cannot be empty.")
                if args.s3_creds_file == '':
                    args.s3_creds_file = None

        if args.play and args.distributed_coach:
            screen.error("Playing is not supported in distributed Coach.")

        # replace a short preset name with the full path
        if args.preset is not None:
            args.preset = self.expand_preset(args.preset)

        # validate the checkpoints args
        if args.checkpoint_restore_dir is not None and not os.path.exists(
                args.checkpoint_restore_dir):
            screen.error(
                "The requested checkpoint folder to load from does not exist.")

        # validate the checkpoints args
        if args.checkpoint_restore_file is not None and not glob(
                args.checkpoint_restore_file + '*'):
            screen.error(
                "The requested checkpoint file to load from does not exist.")

        # no preset was given. check if the user requested to play some environment on its own
        if args.preset is None and args.play and not args.environment_type:
            screen.error(
                'When no preset is given for Coach to run, and the user requests human control over '
                'the environment, the user is expected to input the desired environment_type and level.'
                '\nAt least one of these parameters was not given.')
        elif args.preset and args.play:
            screen.error(
                "Both the --preset and the --play flags were set. These flags can not be used together. "
                "For human control, please use the --play flag together with the environment type flag (-et)"
            )
        elif args.preset is None and not args.play:
            screen.error(
                "Please choose a preset using the -p flag or use the --play flag together with choosing an "
                "environment type (-et) in order to play the game.")

        # get experiment name and path
        args.experiment_name = logger.get_experiment_name(args.experiment_name)
        args.experiment_path = logger.get_experiment_path(args.experiment_name)

        if args.play and args.num_workers > 1:
            screen.warning(
                "Playing the game as a human is only available with a single worker. "
                "The number of workers will be reduced to 1")
            args.num_workers = 1

        args.framework = Frameworks[args.framework.lower()]

        # checkpoints
        args.checkpoint_save_dir = os.path.join(
            args.experiment_path,
            'checkpoint') if args.checkpoint_save_secs is not None else None

        if args.export_onnx_graph and not args.checkpoint_save_secs:
            screen.warning(
                "Exporting ONNX graphs requires setting the --checkpoint_save_secs flag. "
                "The --export_onnx_graph will have no effect.")

        return args
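
The ORCHESTRATOR branch above expects an INI-style file with a [coach] section; a minimal sketch of parsing such a config with the same defaults, using an in-memory string with hypothetical values:

from configparser import ConfigParser

coach_config = ConfigParser({
    'image': '',
    'memory_backend': 'redispubsub',
    'data_store': 's3',
    's3_end_point': 's3.amazonaws.com',
    's3_bucket_name': '',
    's3_creds_file': ''
})
coach_config.read_string("""
[coach]
image = my-registry/coach:latest
data_store = s3
s3_bucket_name = my-coach-bucket
""")
print(coach_config.get('coach', 'image'))           # my-registry/coach:latest
print(coach_config.get('coach', 'memory_backend'))  # redispubsub (falls back to the default)
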
Example #19
from rl_coach.graph_managers.graph_manager import HumanPlayScheduleParameters, GraphManager
from rl_coach.utils import list_all_presets, short_dynamic_import, get_open_port, SharedMemoryScratchPad, get_base_dir
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.environments.environment import SingleLevelSelection
from rl_coach.memories.backend.redis import RedisPubSubMemoryBackendParameters
from rl_coach.memories.backend.memory_impl import construct_memory_params
from rl_coach.data_stores.data_store import DataStoreParameters
from rl_coach.data_stores.s3_data_store import S3DataStoreParameters
from rl_coach.data_stores.nfs_data_store import NFSDataStoreParameters
from rl_coach.data_stores.data_store_impl import get_data_store, construct_data_store_params
from rl_coach.training_worker import training_worker
from rl_coach.rollout_worker import rollout_worker

if len(set(failed_imports)) > 0:
    screen.warning(
        "Warning: failed to import the following packages - {}".format(
            ', '.join(set(failed_imports))))


def add_items_to_dict(target_dict, source_dict):
    updated_task_parameters = copy.copy(source_dict)
    updated_task_parameters.update(target_dict)
    return updated_task_parameters


def open_dashboard(experiment_path):
    """
    open X11 based dashboard in a new process (nonblocking)
    """
    dashboard_path = 'python {}/dashboard.py'.format(get_base_dir())
    cmd = "{} --experiment_dir {}".format(dashboard_path, experiment_path)
Example #20
    def __init__(self,
                 level: LevelSelection,
                 frame_skip: int,
                 visualization_parameters: VisualizationParameters,
                 target_success_rate: float = 1.0,
                 additional_simulator_parameters: Dict[str, Any] = {},
                 seed: Union[None, int] = None,
                 human_control: bool = False,
                 custom_reward_threshold: Union[int, float] = None,
                 random_initialization_steps: int = 1,
                 max_over_num_frames: int = 1,
                 observation_space_type: ObservationSpaceType = None,
                 **kwargs):
        """
        :param level: (str)
            A string representing the gym level to run. This can also be a LevelSelection object.
            For example, BreakoutDeterministic-v0

        :param frame_skip: (int)
            The number of frames to skip between any two actions given by the agent. The action will be repeated
            for all the skipped frames.

        :param visualization_parameters: (VisualizationParameters)
            The parameters used for visualizing the environment, such as the render flag, storing videos etc.

        :param additional_simulator_parameters: (Dict[str, Any])
            Any additional parameters that the user can pass to the Gym environment. These parameters should be
            accepted by the __init__ function of the implemented Gym environment.

        :param seed: (int)
            A seed to use for the random number generator when running the environment.

        :param human_control: (bool)
            A flag that allows controlling the environment using the keyboard keys.

        :param custom_reward_threshold: (float)
            Allows defining a custom reward threshold that will be used to decide when the agent has succeeded in the environment.
            If not set, this value will be taken from the Gym environment definition.

        :param random_initialization_steps: (int)
            The number of random steps that will be taken in the environment after each reset.
            This is a feature presented in the DQN paper, which improves the variability of the episodes the agent sees.

        :param max_over_num_frames: (int)
            This value will be used for merging multiple frames into a single frame by taking the maximum value for each
            of the pixels in the frame. This is particularly used in Atari games, where the frames flicker, and objects
            can be seen in one frame but disappear in the next.

        :param observation_space_type:
            This value will be used for generating observation space. Allows a custom space. Should be one of
            ObservationSpaceType. If not specified, observation space is inferred from the number of dimensions
            of the observation: 1D: Vector space, 3D: Image space if 1 or 3 channels, PlanarMaps space otherwise.
        """
        super().__init__(level, seed, frame_skip, human_control,
                         custom_reward_threshold, visualization_parameters,
                         target_success_rate)

        self.random_initialization_steps = random_initialization_steps
        self.max_over_num_frames = max_over_num_frames
        self.additional_simulator_parameters = additional_simulator_parameters

        # hide warnings
        gym.logger.set_level(40)
        """
        load and initialize environment
        environment ids can be defined in 3 ways:
        1. Native gym environments like BreakoutDeterministic-v0 for example
        2. Custom gym environments written and installed as python packages.
           These environments should have a python module with a class inheriting gym.Env, implementing the
           relevant functions (_reset, _step, _render) and defining the observation and action space
           For example: my_environment_package:MyEnvironmentClass will run an environment defined in the
           MyEnvironmentClass class
        3. Custom gym environments written as an independent module which is not installed.
           These environments should have a python module with a class inheriting gym.Env, implementing the
           relevant functions (_reset, _step, _render) and defining the observation and action space.
           For example: path_to_my_environment.sub_directory.my_module:MyEnvironmentClass will run an
           environment defined in the MyEnvironmentClass class which is located in the module in the relative path
           path_to_my_environment.sub_directory.my_module
        """
        if ':' in self.env_id:
            # custom environments
            if '/' in self.env_id or '.' in self.env_id:
                # environment in an absolute path module written as a unix path or in a relative path module
                # written as a python import path
                env_class = short_dynamic_import(self.env_id)
            else:
                # environment in a python package
                env_class = gym.envs.registration.load(self.env_id)

            # instantiate the environment
            try:
                self.env = env_class(**self.additional_simulator_parameters)
            except:
                screen.error(
                    "Failed to instantiate Gym environment class %s with arguments %s"
                    % (env_class, self.additional_simulator_parameters),
                    crash=False)
                raise
        else:
            self.env = gym.make(self.env_id)

        # for classic control we want to use the native renderer because otherwise we will get 2 renderer windows
        environment_to_always_use_with_native_rendering = [
            'classic_control', 'mujoco', 'robotics'
        ]
        self.native_rendering = self.native_rendering or \
                                any([env in str(self.env.unwrapped.__class__)
                                     for env in environment_to_always_use_with_native_rendering])
        if self.native_rendering:
            if hasattr(self, 'renderer'):
                self.renderer.close()

        # seed
        if self.seed is not None:
            self.env.seed(self.seed)
            np.random.seed(self.seed)
            random.seed(self.seed)

        # frame skip and max between consecutive frames
        self.is_mujoco_env = 'mujoco' in str(self.env.unwrapped.__class__)
        self.is_roboschool_env = 'roboschool' in str(
            self.env.unwrapped.__class__)
        self.is_atari_env = 'Atari' in str(self.env.unwrapped.__class__)
        if self.is_atari_env:
            self.env.unwrapped.frameskip = 1  # this accesses the atari env that is wrapped with a timelimit wrapper env
            if self.env_id == "SpaceInvadersDeterministic-v4" and self.frame_skip == 4:
                screen.warning(
                    "Warning: The frame-skip for Space Invaders was automatically updated from 4 to 3. "
                    "This is following the DQN paper where it was noticed that a frame-skip of 3 makes the "
                    "laser rays disappear. To force frame-skip of 4, please use SpaceInvadersNoFrameskip-v4."
                )
                self.frame_skip = 3
            self.env = MaxOverFramesAndFrameskipEnvWrapper(
                self.env,
                frameskip=self.frame_skip,
                max_over_num_frames=self.max_over_num_frames)
        else:
            self.env.unwrapped.frameskip = self.frame_skip

        self.state_space = StateSpace({})

        # observations
        if not isinstance(self.env.observation_space, gym.spaces.dict.Dict):
            state_space = {'observation': self.env.observation_space}
        else:
            state_space = self.env.observation_space.spaces

        for observation_space_name, observation_space in state_space.items():
            if observation_space_type == ObservationSpaceType.Tensor:
                # we consider an arbitrary input tensor which does not necessarily represent images
                self.state_space[
                    observation_space_name] = TensorObservationSpace(
                        shape=np.array(observation_space.shape),
                        low=observation_space.low,
                        high=observation_space.high)
            elif observation_space_type == ObservationSpaceType.Image or len(
                    observation_space.shape) == 3:
                # we assume gym has image observations (with arbitrary number of channels) where their values are
                # within 0-255, and where the channel dimension is the last dimension
                if observation_space.shape[-1] in [1, 3]:
                    self.state_space[
                        observation_space_name] = ImageObservationSpace(
                            shape=np.array(observation_space.shape),
                            high=255,
                            channels_axis=-1)
                else:
                    # For any number of channels other than 1 or 3, use the generic PlanarMaps space
                    self.state_space[
                        observation_space_name] = PlanarMapsObservationSpace(
                            shape=np.array(observation_space.shape),
                            low=0,
                            high=255,
                            channels_axis=-1)
            elif observation_space_type == ObservationSpaceType.Vector or len(
                    observation_space.shape) == 1:
                self.state_space[
                    observation_space_name] = VectorObservationSpace(
                        shape=observation_space.shape[0],
                        low=observation_space.low,
                        high=observation_space.high)
            else:
                raise screen.error(
                    "Failed to instantiate Gym environment class %s with observation space type %s"
                    % (env_class, observation_space_type),
                    crash=True)

        if 'desired_goal' in state_space.keys():
            self.goal_space = self.state_space['desired_goal']

        # actions
        if type(self.env.action_space) == gym.spaces.box.Box:
            self.action_space = BoxActionSpace(
                shape=self.env.action_space.shape,
                low=self.env.action_space.low,
                high=self.env.action_space.high)
        elif type(self.env.action_space) == gym.spaces.discrete.Discrete:
            actions_description = []
            if hasattr(self.env.unwrapped, 'get_action_meanings'):
                actions_description = self.env.unwrapped.get_action_meanings()
            self.action_space = DiscreteActionSpace(
                num_actions=self.env.action_space.n,
                descriptions=actions_description)
        else:
            raise screen.error((
                "Failed to instantiate gym environment class {} due to unsupported "
                "action space {}. Expected BoxActionSpace or DiscreteActionSpace."
            ).format(env_class, self.env.action_space),
                               crash=True)

        if self.human_control:
            # TODO: add this to the action space
            # map keyboard keys to actions
            self.key_to_action = {}
            if hasattr(self.env.unwrapped, 'get_keys_to_action'):
                self.key_to_action = self.env.unwrapped.get_keys_to_action()
            else:
                screen.error(
                    "Error: Environment {} does not support human control.".
                    format(self.env),
                    crash=True)

        # initialize the state by getting a new state from the environment
        self.reset_internal_state(True)

        # render
        if self.is_rendered:
            image = self.get_rendered_image()
            scale = 1
            if self.human_control:
                scale = 2
            if not self.native_rendering:
                self.renderer.create_screen(image.shape[1] * scale,
                                            image.shape[0] * scale)

        # the info is only updated after the first step
        self.state = self.step(self.action_space.default_action).next_state
        self.state_space['measurements'] = VectorObservationSpace(
            shape=len(self.info.keys()))

        if self.env.spec and custom_reward_threshold is None:
            self.reward_success_threshold = self.env.spec.reward_threshold
            self.reward_space = RewardSpace(
                1, reward_success_threshold=self.reward_success_threshold)

        self.target_success_rate = target_success_rate
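
When no explicit observation_space_type is given, the branching above infers the Coach space from the gym observation shape alone. A condensed sketch of just that decision rule, using plain shapes instead of gym spaces (the helper name is made up):

def infer_space_kind(shape):
    # Mirrors the inference above: 3D -> Image (1 or 3 channels) or PlanarMaps,
    # 1D -> Vector; anything else is unsupported by this constructor.
    if len(shape) == 3:
        return 'ImageObservationSpace' if shape[-1] in (1, 3) else 'PlanarMapsObservationSpace'
    if len(shape) == 1:
        return 'VectorObservationSpace'
    raise ValueError("unsupported observation shape: {}".format(shape))

print(infer_space_kind((84, 84, 3)))   # ImageObservationSpace
print(infer_space_kind((84, 84, 12)))  # PlanarMapsObservationSpace
print(infer_space_kind((17,)))         # VectorObservationSpace
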
Example #21
 def __init__(self):
     super().__init__()
     screen.warning(
         "DEPRECATION WARNING: Please switch to SageMakerCoachPresetLauncher"
     )
Example #22
 def __init__(self):
     super().__init__()
     screen.warning("DEPRECATION WARNING: Please switch to SageMakerCoachPresetLauncher")
Example #23
    def __init__(self,
                 level: LevelSelection,
                 frame_skip: int,
                 visualization_parameters: VisualizationParameters,
                 additional_simulator_parameters: Dict[str, Any] = None,
                 seed: Union[None, int] = None,
                 human_control: bool = False,
                 custom_reward_threshold: Union[int, float] = None,
                 random_initialization_steps: int = 1,
                 max_over_num_frames: int = 1,
                 **kwargs):
        super().__init__(level, seed, frame_skip, human_control,
                         custom_reward_threshold, visualization_parameters)

        self.random_initialization_steps = random_initialization_steps
        self.max_over_num_frames = max_over_num_frames
        self.additional_simulator_parameters = additional_simulator_parameters

        # hide warnings
        gym.logger.set_level(40)
        """
        load and initialize environment
        environment ids can be defined in 3 ways:
        1. Native gym environments like BreakoutDeterministic-v0 for example
        2. Custom gym environments written and installed as python packages.
           These environments should have a python module with a class inheriting gym.Env, implementing the
           relevant functions (_reset, _step, _render) and defining the observation and action space
           For example: my_environment_package:MyEnvironmentClass will run an environment defined in the
           MyEnvironmentClass class
        3. Custom gym environments written as an independent module which is not installed.
           This environments should have a python module with a class inheriting gym.Env, implementing the
           relevant functions (_reset, _step, _render) and defining the observation and action space.
           For example: path_to_my_environment.sub_directory.my_module:MyEnvironmentClass will run an
           environment defined in the MyEnvironmentClass class which is located in the module in the relative path
           path_to_my_environment.sub_directory.my_module
        """
        if ':' in self.env_id:
            # custom environments
            if '/' in self.env_id or '.' in self.env_id:
                # environment in an absolute path module written as a unix path, or in a relative path module
                # written as a python import path
                env_class = short_dynamic_import(self.env_id)
            else:
                # environment in a python package
                env_class = gym.envs.registration.load(self.env_id)

            # instantiate the environment
            if self.additional_simulator_parameters:
                self.env = env_class(**self.additional_simulator_parameters)
            else:
                self.env = env_class()
        else:
            self.env = gym.make(self.env_id)

        # for classic control we want to use the native renderer because otherwise we will get 2 renderer windows
        environment_to_always_use_with_native_rendering = [
            'classic_control', 'mujoco', 'robotics'
        ]
        self.native_rendering = self.native_rendering or \
                                any([env in str(self.env.unwrapped.__class__)
                                     for env in environment_to_always_use_with_native_rendering])
        if self.native_rendering:
            if hasattr(self, 'renderer'):
                self.renderer.close()

        # seed
        if self.seed is not None:
            self.env.seed(self.seed)
            np.random.seed(self.seed)
            random.seed(self.seed)

        # frame skip and max between consecutive frames
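        # for Atari, frame skipping is disabled inside the wrapped ALE environment (frameskip=1) and
        # re-applied by MaxOverFramesAndFrameskipEnvWrapper, so that a max over the last
        # max_over_num_frames frames can be taken before skipping; other environments keep their
        # native frameskip attribute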
        self.is_robotics_env = 'robotics' in str(self.env.unwrapped.__class__)
        self.is_mujoco_env = 'mujoco' in str(self.env.unwrapped.__class__)
        self.is_atari_env = 'Atari' in str(self.env.unwrapped.__class__)
        self.timelimit_env_wrapper = self.env
        if self.is_atari_env:
            self.env.unwrapped.frameskip = 1  # access the underlying Atari env beneath the TimeLimit wrapper
            if self.env_id == "SpaceInvadersDeterministic-v4" and self.frame_skip == 4:
                screen.warning(
                    "Warning: The frame-skip for Space Invaders was automatically updated from 4 to 3. "
                    "This is following the DQN paper where it was noticed that a frame-skip of 3 makes the "
                    "laser rays disappear. To force frame-skip of 4, please use SpaceInvadersNoFrameskip-v4."
                )
                self.frame_skip = 3
            self.env = MaxOverFramesAndFrameskipEnvWrapper(
                self.env,
                frameskip=self.frame_skip,
                max_over_num_frames=self.max_over_num_frames)
        else:
            self.env.unwrapped.frameskip = self.frame_skip

        self.state_space = StateSpace({})

        # observations
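        # gym exposes either a single flat observation space or a Dict of named sub-spaces
        # (e.g. the goal-based robotics environments); normalize both cases into a
        # {name: space} mapping before converting to Coach observation spaces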
        if not isinstance(self.env.observation_space,
                          gym.spaces.dict_space.Dict):
            state_space = {'observation': self.env.observation_space}
        else:
            state_space = self.env.observation_space.spaces

        for observation_space_name, observation_space in state_space.items():
            if len(observation_space.shape) == 3 and observation_space.shape[-1] == 3:
                # we assume gym image observations are RGB with values in the range 0-255
                self.state_space[observation_space_name] = ImageObservationSpace(
                    shape=np.array(observation_space.shape),
                    high=255,
                    channels_axis=-1)
            else:
                self.state_space[observation_space_name] = VectorObservationSpace(
                    shape=observation_space.shape[0],
                    low=observation_space.low,
                    high=observation_space.high)
        if 'desired_goal' in state_space.keys():
            self.goal_space = self.state_space['desired_goal']

        # actions
        if type(self.env.action_space) == gym.spaces.box.Box:
            self.action_space = BoxActionSpace(
                shape=self.env.action_space.shape,
                low=self.env.action_space.low,
                high=self.env.action_space.high)
        elif type(self.env.action_space) == gym.spaces.discrete.Discrete:
            actions_description = []
            if hasattr(self.env.unwrapped, 'get_action_meanings'):
                actions_description = self.env.unwrapped.get_action_meanings()
            self.action_space = DiscreteActionSpace(
                num_actions=self.env.action_space.n,
                descriptions=actions_description)

        if self.human_control:
            # TODO: add this to the action space
            # map keyboard keys to actions
            self.key_to_action = {}
            if hasattr(self.env.unwrapped, 'get_keys_to_action'):
                self.key_to_action = self.env.unwrapped.get_keys_to_action()

        # initialize the state by getting a new state from the environment
        self.reset_internal_state(True)

        # render
        if self.is_rendered:
            image = self.get_rendered_image()
            scale = 1
            if self.human_control:
                scale = 2
            if not self.native_rendering:
                self.renderer.create_screen(image.shape[1] * scale,
                                            image.shape[0] * scale)

        # measurements
        if self.env.spec is not None:
            self.timestep_limit = self.env.spec.timestep_limit
        else:
            self.timestep_limit = None

        # the info is only updated after the first step
        self.state = self.step(self.action_space.default_action).next_state
        self.state_space['measurements'] = VectorObservationSpace(
            shape=len(self.info.keys()))

        if self.env.spec and custom_reward_threshold is None:
            self.reward_success_threshold = self.env.spec.reward_threshold
            self.reward_space = RewardSpace(
                1, reward_success_threshold=self.reward_success_threshold)
Example #24
0
    def load_csv(self, csv_dataset: CsvDataset,
                 input_filter: InputFilter) -> None:
        """
        Restore the replay buffer contents from a csv file.
        The csv file is assumed to include a list of transitions.
        :param csv_dataset: A construct which holds the dataset parameters
        :param input_filter: A filter used to filter the CSV data before feeding it to the memory.
        """
        self.assert_not_frozen()

        df = pd.read_csv(csv_dataset.filepath)
        if len(df) > self.max_size[1]:
            screen.warning(
                "Warning! The number of transitions to load into the replay buffer ({}) is "
                "bigger than the max size of the replay buffer ({}). The excessive transitions will "
                "not be stored.".format(len(df), self.max_size[1]))

        episode_ids = df["episode_id"].unique()
        progress_bar = ProgressBar(len(episode_ids))
        state_columns = [
            col for col in df.columns if col.startswith("state_feature")
        ]

        for e_id in episode_ids:
            progress_bar.update(e_id)
            df_episode_transitions = df[df["episode_id"] == e_id]
            input_filter.reset()

            if len(df_episode_transitions) < 2:
                # we have to have at least 2 rows in each episode for creating a transition
                continue

            episode = Episode()
            transitions = []
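            # pair each row with the following row in the same episode: the current row supplies the
            # state, action and reward, and the next row supplies next_state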
            for (_, current_transition), (_, next_transition) in zip(
                    df_episode_transitions[:-1].iterrows(),
                    df_episode_transitions[1:].iterrows()):
                state = np.array(
                    [current_transition[col] for col in state_columns])
                next_state = np.array(
                    [next_transition[col] for col in state_columns])

                transitions.append(
                    Transition(
                        state={"observation": state},
                        action=int(current_transition["action"]),
                        reward=current_transition["reward"],
                        next_state={"observation": next_state},
                        game_over=False,
                        info={
                            "all_action_probabilities":
                            ast.literal_eval(
                                current_transition["all_action_probabilities"])
                        },
                    ), )

            transitions = input_filter.filter(transitions, deep_copy=False)
            for t in transitions:
                episode.insert(t)

            # Set the last transition to end the episode
            if csv_dataset.is_episodic:
                episode.get_last_transition().game_over = True

            self.store_episode(episode)

        # close the progress bar
        progress_bar.update(len(episode_ids))
        progress_bar.close()
Example #25
0
def parse_arguments(parser: argparse.ArgumentParser) -> argparse.Namespace:
    """
    Parse the arguments that the user entered
    :param parser: the argparse command line parser
    :return: the parsed arguments
    """
    args = parser.parse_args()

    # if no arg is given
    if len(sys.argv) == 1:
        parser.print_help()
        exit(0)

    # list available presets
    preset_names = list_all_presets()
    if args.list:
        screen.log_title("Available Presets:")
        for preset in sorted(preset_names):
            print(preset)
        sys.exit(0)

    # replace a short preset name with the full path
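    # e.g. a short name such as 'CartPole_DQN' (illustrative) expands to
    # '<get_base_dir()>/presets/CartPole_DQN.py:graph_manager', while a path like 'my_presets/my_preset.py'
    # only gets the default ':graph_manager' suffix appended if no variable was specified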
    if args.preset is not None:
        if args.preset.lower() in [p.lower() for p in preset_names]:
            args.preset = "{}.py:graph_manager".format(
                os.path.join(get_base_dir(), 'presets', args.preset))
        else:
            args.preset = "{}".format(args.preset)
            # if a graph manager variable was not specified, try the default of :graph_manager
            if len(args.preset.split(":")) == 1:
                args.preset += ":graph_manager"

        # verify that the preset exists
        preset_path = args.preset.split(":")[0]
        if not os.path.exists(preset_path):
            screen.error("The given preset ({}) cannot be found.".format(
                args.preset))

        # verify that the preset can be instantiated
        try:
            short_dynamic_import(args.preset, ignore_module_case=True)
        except TypeError as e:
            traceback.print_exc()
            screen.error('Internal Error: ' + str(e) +
                         "\n\nThe given preset ({}) cannot be instantiated.".
                         format(args.preset))

    # validate the checkpoints args
    if args.checkpoint_restore_dir is not None and not os.path.exists(
            args.checkpoint_restore_dir):
        screen.error(
            "The requested checkpoint folder to load from does not exist.")

    # no preset was given; check if the user requested to play an environment manually (human control)
    if args.preset is None and args.play:
        if args.environment_type:
            args.agent_type = 'Human'
        else:
            screen.error(
                'When no preset is given for Coach to run, and the user requests human control over '
                'the environment, the user is expected to input the desired environment_type and level.'
                '\nAt least one of these parameters was not given.')
    elif args.preset and args.play:
        screen.error(
            "Both the --preset and the --play flags were set. These flags can not be used together. "
            "For human control, please use the --play flag together with the environment type flag (-et)"
        )
    elif args.preset is None and not args.play:
        screen.error(
            "Please choose a preset using the -p flag or use the --play flag together with choosing an "
            "environment type (-et) in order to play the game.")

    # get experiment name and path
    args.experiment_name = logger.get_experiment_name(args.experiment_name)
    args.experiment_path = logger.get_experiment_path(args.experiment_name)

    if args.play and args.num_workers > 1:
        screen.warning(
            "Playing the game as a human is only available with a single worker. "
            "The number of workers will be reduced to 1")
        args.num_workers = 1

    args.framework = Frameworks[args.framework.lower()]

    # checkpoints
    if args.save_checkpoint_secs is not None:
        args.save_checkpoint_dir = os.path.join(args.experiment_path, 'checkpoint')
    else:
        args.save_checkpoint_dir = None

    return args