    def __init__(self,
                 network: Layer,
                 env: gym.Env,
                 memory_length: int = 1000,
                 name=None) -> None:
        self.env = env
        self.env.reset()
        self.observation_space = env.observation_space
        self.action_space = env.action_space
        self.network = network
        # self.agent_id = self.uuid
        self.memory = ReplayBuffer(memory_length)
        super().__init__(inputs=self.get_observation().repeat(2, 0),
                         output=deepcopy(network))
        self.name = name
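A construction sketch, assuming a concrete policy class with an `__init__` like the one above. `MyPolicy` and `build_network()` are purely illustrative names, not part of the snippet; only the keyword arguments come from the signature shown:

import gym

# Illustrative only: MyPolicy and build_network() are hypothetical stand-ins
# for a concrete policy class and a factory that returns a trident Layer.
env = gym.make('CartPole-v1')      # any gym.Env with reset()/step() will do
network = build_network()          # assumed to return a Layer
policy = MyPolicy(network=network, env=env, memory_length=1000, name='cartpole')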
Example #2
    def __init__(self,
                 network: Layer,
                 env: gym.Env,
                 action_strategy=None,
                 gamma=0.99,
                 use_experience_replay=False,
                 replay_unit='step',
                 memory_length: int = 10000,
                 name=None) -> None:
        super().__init__()
        self.network = network
        if name is not None:
            self.network._name = name

        self.env = modify_env(env)
        self.env.reset()
        self.observation_space = env.observation_space
        self.action_space = env.action_space
        self.agent_id = uuid.uuid4().node
        self.gamma = gamma
        self.use_experience_replay = use_experience_replay
        self.memory = None
        if self.use_experience_replay:
            self.memory = ReplayBuffer(memory_length)
        else:
            self.memory = ReplayBuffer(1)

        if replay_unit not in ['step', 'episode']:
            raise ValueError(
                "Only 'step' and 'episode' are valid replay_unit options.")
        self.replay_unit = replay_unit
        self.name = name
        self.action_strategy = action_strategy
        self.state_pool = []
        self.reward_pool = []
        self.action_pool = []
        self.value_pool = []
        self.setting_network()
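The `gamma` stored here is the discount factor that a concrete policy presumably applies when turning `reward_pool` into returns; this base `__init__` only stores it and leaves `estimate_future_return()` unimplemented. A standard discounted-return sketch, for illustration only:

# Illustrative helper showing how the stored gamma is typically used to turn
# a list of per-step rewards into discounted returns (not code from the snippet).
def discounted_returns(rewards, gamma=0.99):
    returns, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    return list(reversed(returns))

print(discounted_returns([1.0, 1.0, 1.0], gamma=0.99))   # ~ [2.9701, 1.99, 1.0]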
Example #3
    def __init__(self,
                 network: Layer,
                 env: gym.Env,
                 memory_length: int = 1000,
                 name=None) -> None:
        super().__init__()
        self.network = network
        if name is not None:
            self.network._name = name
        self.env = env
        self.env.reset()
        self.observation_space = env.observation_space
        self.action_space = env.action_space
        # self.agent_id = self.uuid
        self.memory = ReplayBuffer(memory_length)
        self.name = name
        self.setting_network()
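These snippets depend on trident's `ReplayBuffer`, whose implementation is not shown. From the way it is used (pushing `(state, action, next_state, reward)` tuples, `len()`, the `.memory` and `.capacity` attributes, and the batched sampling implied by `experience_replay(batch_size)`), it behaves like a bounded transition store. A minimal stand-in under those assumptions:

import random
from collections import deque

# Assumption-based sketch, not trident's actual ReplayBuffer: a bounded FIFO
# of transitions matching the calls made in the examples above.
class SimpleReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = deque(maxlen=capacity)

    def push(self, state, action, next_state, reward):
        self.memory.append((state, action, next_state, reward))

    def sample(self, batch_size):
        return random.sample(list(self.memory), batch_size)

    def __len__(self):
        return len(self.memory)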
Example #4
    def __init__(self,
                 network: Layer,
                 env: gym.Env,
                 memory_length: int = 1000,
                 name=None) -> None:
        self.network = network
        self.network.to(get_device())
        if name is not None:
            self.network._name = name

        self.env = env
        self.env.reset()
        self.observation_space = env.observation_space
        self.action_space = env.action_space
        super().__init__(inputs=to_tensor(
            self.get_observation()).repeat_elements(2, 0).to(get_device()),
                         output=deepcopy(self.network))
        self.setting_network()

        self.memory = ReplayBuffer(memory_length)
        self.name = name
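In this variant the model graph is initialised from the current observation repeated twice along the batch axis (`repeat_elements(2, 0)`), presumably so that graph tracing sees a batch larger than one. A plain numpy illustration of the same reshaping:

import numpy as np

# get_observation() returns the preprocessed state with a leading batch axis
# of 1 (expand_dims(..., 0)); repeating along axis 0 yields a batch of 2
# identical observations for the initial graph construction.
obs = np.zeros((1, 4), dtype=np.float32)   # e.g. a 4-dimensional state, batch of 1
batch = np.repeat(obs, 2, axis=0)          # numpy analogue of repeat_elements(2, 0)
print(batch.shape)                         # (2, 4)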
Example #5
class PolicyBase(Model):
    """The base class for any RL policy.
    """
    def __init__(self,
                 network: Layer,
                 env: gym.Env,
                 action_strategy=None,
                 gamma=0.99,
                 use_experience_replay=False,
                 replay_unit='step',
                 memory_length: int = 10000,
                 name=None) -> None:
        super().__init__()
        self.network = network
        if name is not None:
            self.network._name = name

        self.env = modify_env(env)
        self.env.reset()
        self.observation_space = env.observation_space
        self.action_space = env.action_space
        self.agent_id = uuid.uuid4().node
        self.gamma = gamma
        self.use_experience_replay = use_experience_replay
        self.memory = None
        if self.use_experience_replay:
            self.memory = ReplayBuffer(memory_length)
        else:
            self.memory = ReplayBuffer(1)

        if replay_unit not in ['step', 'episode']:
            raise ValueError(
                "Only 'step' and 'episode' are valid replay_unit options.")
        self.replay_unit = replay_unit
        self.name = name
        self.action_strategy = action_strategy
        self.state_pool = []
        self.reward_pool = []
        self.action_pool = []
        self.value_pool = []
        self.setting_network()

    def setting_network(self):
        super()._initial_graph(inputs=self.get_observation(),
                               output=deepcopy(self.network))

    def get_observation(self):
        if hasattr(self.env, 'state'):
            return expand_dims(self.data_preprocess(self.env.state), 0)
        else:
            return expand_dims(
                self.data_preprocess(self.env.render('observation')), 0)

    def select_action(self, state, model_only=False, **kwargs):
        # Default: uniformly sample a random action; subclasses override this.
        return self.env.action_space.sample()

    def get_rewards(self, action):
        return self.env.step(action)

    def experience_replay(self, batch_size):
        return NotImplemented

    def collect_samples(self, min_replay_samples, need_render=False) -> bool:

        if self.memory is None:
            self.memory = ReplayBuffer(10000)
        # Guard against a zero interval when min_replay_samples is small.
        progress_interval = max((min_replay_samples // 50) * 5, 1)
        self.state_pool = []
        self.reward_pool = []
        self.action_pool = []

        for i_episode in range(min_replay_samples):
            self.env.reset()
            state = self.get_observation()
            for t in count():
                action = self.select_action(
                    state,
                    model_only=True if self.action_strategy
                    == ActionStrategy.OnPolicy else False)
                _observation, reward, done, info = self.get_rewards(action)
                if need_render:
                    self.env.render()
                next_state = None
                if not done:
                    next_state = self.get_observation()
                if self.replay_unit == 'step':
                    self.memory.push(state, action, next_state, reward)
                    if len(self.memory) < min_replay_samples and len(
                            self.memory) % progress_interval == 0:
                        print("Replay Samples:{0}".format(len(self.memory)))
                    if len(self.memory) == min_replay_samples:
                        return True
                elif self.replay_unit == 'episode':
                    self.state_pool.append(state)
                    self.action_pool.append(action)
                    self.reward_pool.append(reward)
                    if done:
                        self.memory.push(self.state_pool, self.action_pool,
                                         None, self.reward_pool)
                        if len(self.memory) < min_replay_samples and len(
                                self.memory) % progress_interval == 0:
                            print("Replay Samples:{0}".format(len(
                                self.memory)))
                        self.state_pool = []
                        self.action_pool = []
                        self.reward_pool = []

                        if len(self.memory) == min_replay_samples:
                            return True
                        break
                state = next_state
                if done:
                    break

        return False

    def push_into_memory_criteria(self, *args, **kwargs) -> bool:
        return True

    def episode_complete_criteria(self, *args, **kwargs) -> bool:
        return False

    def task_complete_criteria(self, *args, **kwargs) -> bool:
        return False

    def estimate_future_return(self, *args, **kwargs):
        return NotImplemented

    def save_or_sync_weights(self):
        self.save_model(save_path=self.training_context['save_path'])

    def training_model(self,
                       current_episode=0,
                       current_step=0,
                       num_episodes=100,
                       train_timing=None,
                       done=False,
                       batch_size=1,
                       repeat_train=1):

        is_collect_data = False
        for i in range(repeat_train):
            data = None
            if self.use_experience_replay:
                data = self.experience_replay(batch_size)
            else:
                data = self.memory.memory[0]
            self.estimate_future_return(*data)
            self.training_context['skip_generate_output'] = True
            if 'step' in train_timing:
                current_step = current_step * repeat_train + i
                if done:
                    total_batch = current_step * repeat_train + i + 1
                    is_collect_data = True
                else:
                    total_batch = current_step * repeat_train + i + 10
            elif 'episode' in train_timing:
                current_step = i
                total_batch = repeat_train

            super(PolicyBase, self).train_model(
                self.training_context['train_data'],
                self.training_context['test_data'],
                current_epoch=current_episode,
                current_batch=current_step,
                total_epoch=num_episodes,
                total_batch=total_batch,
                is_collect_data=True,
                is_print_batch_progress=False,
                is_print_epoch_progress=False,
                log_gradients=False,
                log_weights=False,
                accumulate_grads=(current_step * repeat_train + 1) %
                self.accumulation_steps != 0)
            self.save_or_sync_weights()

    def play(self,
             num_episodes,
             batch_size=1,
             min_replay_samples=1,
             print_progess_frequency=5,
             training=True,
             train_timing='on_episode_start',
             train_every_nstep=1,
             repeat_train=1,
             need_render=True):
        if train_timing not in [
                'on_episode_start', 'on_step_end', 'on_step_start'
        ]:
            raise ValueError(
                "Only 'on_episode_start', 'on_step_end' and 'on_step_start' "
                "are valid train_timing options.")

        if training:
            self._model.train()
        else:
            self._model.eval()
        if self.use_experience_replay:
            self.collect_samples(min_replay_samples=min_replay_samples)
        else:
            self.collect_samples(
                min_replay_samples=1,
                need_render=True if self.replay_unit == 'episode' else False)
            print('start training...')
        self.state_pool = []
        self.reward_pool = []
        self.action_pool = []

        self.total_reward = 0
        self.t = 0
        self.i_episode = 0
        if hasattr(self.env, 'recording_enabled'):
            self.env.recording_enabled = True
        for i_episode in range(num_episodes):
            self.i_episode = i_episode

            if training and train_timing == 'on_episode_start' and i_episode % train_every_nstep == 0:
                self.training_model(i_episode,
                                    0,
                                    num_episodes=num_episodes,
                                    repeat_train=repeat_train,
                                    train_timing=train_timing,
                                    batch_size=batch_size)
            self.env.reset()
            self.total_rewards = 0
            state = self.get_observation()
            for t in count():
                self.t = t
                # # Train on_step_start
                # if training and train_timing == 'on_step_start' and t % train_every_nstep == 0:
                #     self.training_model(i_episode, t,num_episodes=num_episodes, repeat_train=repeat_train, batch_size=batch_size)

                action = self.select_action(state, model_only=True)
                observation, reward, done, info = self.get_rewards(action)

                self.total_rewards += reward

                next_state = self.get_observation() if not done else None

                if need_render:
                    self.env.render()
                if self.replay_unit == 'step':
                    if self.push_into_memory_criteria(
                            state, action, next_state, reward) or done:
                        self.memory.push(state, action, next_state, reward)
                elif self.replay_unit == 'episode':
                    self.state_pool.append(state)
                    self.action_pool.append(action)
                    self.reward_pool.append(reward)
                    if done:
                        if self.push_into_memory_criteria(
                                self.state_pool, self.action_pool, None,
                                self.reward_pool):
                            self.memory.push(self.state_pool, self.action_pool,
                                             None, self.reward_pool)
                        self.state_pool = []
                        self.action_pool = []
                        self.reward_pool = []

                complete = self.episode_complete_criteria()
                # Train on_step_end
                if training and train_timing == 'on_step_end' and t % train_every_nstep == 0:
                    self.training_model(i_episode,
                                        t,
                                        num_episodes=num_episodes,
                                        done=done or complete,
                                        repeat_train=repeat_train,
                                        train_timing=train_timing,
                                        batch_size=batch_size)

                state = next_state
                if done or complete:
                    self.epoch_metric_history.collect(
                        'rewards', i_episode, float(self.total_rewards))
                    self.epoch_metric_history.collect('t', i_episode,
                                                      float(t + 1))
                    if self.use_experience_replay:
                        self.epoch_metric_history.collect(
                            'replay_buffer_utility', i_episode,
                            float(len(self.memory)) / self.memory.capacity)

                    if print_progess_frequency == 1 or (
                            i_episode > 0 and
                        (i_episode + 1) % print_progess_frequency == 0):
                        self.print_epoch_progress(print_progess_frequency)
                    # Periodically plot the loss and metric curves over training time.
                    if i_episode > 0 and (i_episode + 1) % (
                            5 * print_progess_frequency) == 0:
                        loss_metric_curve(
                            self.epoch_loss_history,
                            self.epoch_metric_history,
                            metrics_names=list(
                                self.epoch_metric_history.keys()),
                            calculate_base='epoch',
                            imshow=True)

                    if self.task_complete_criteria():
                        self.save_model(
                            save_path=self.training_context['save_path'])
                        print(
                            'episode {0} met the task-complete criteria; training finished!'
                            .format(i_episode))
                        return True

                    break

        print('Complete')
        self.env.render()
        self.env.close()

    def learn(self,
              num_episodes,
              batch_size=1,
              min_replay_samples=1,
              print_progess_frequency=5,
              train_timing='on_episode_start',
              train_every_nstep=1,
              repeat_train=1,
              accumulate_grads=False):
        self.play(num_episodes=num_episodes,
                  batch_size=batch_size,
                  min_replay_samples=min_replay_samples,
                  print_progess_frequency=print_progess_frequency,
                  training=True,
                  train_timing=train_timing,
                  train_every_nstep=train_every_nstep,
                  repeat_train=repeat_train,
                  need_render=True)

    def resume(self, num_episodes=3000, **kwargs):
        pass

    @property
    def preprocess_flow(self):
        return self._preprocess_flow

    @preprocess_flow.setter
    def preprocess_flow(self, value):
        self._preprocess_flow = value
        objecttype = None
        if isinstance(self.model.input_spec, TensorSpec):
            objecttype = self.model.input_spec.object_type
        # super()._initial_graph(inputs=to_tensor(self.get_observation()).repeat_elements(2, 0), output=deepcopy(self.network))
        self.setting_network()
        if objecttype is not None:
            self.inputs.value_list[0].object_type = objecttype
            self.model.input_spec.object_type = objecttype

        self.env.reset()

    def data_preprocess(self, img_data):
        if self._model is not None:
            self._model.input_spec.object_type = ObjectType.rgb
        if not hasattr(self,
                       '_preprocess_flow') or self._preprocess_flow is None:
            self._preprocess_flow = []
        if img_data.ndim == 4:
            return to_tensor(
                to_numpy([self.data_preprocess(im) for im in img_data]))
        if len(self._preprocess_flow) == 0:
            return image_backend_adaption(img_data)
        if isinstance(img_data, np.ndarray):
            for fc in self._preprocess_flow:
                if self._model is not None and self.signature is not None and len(
                        self.signature
                ) > 1 and self._model.input_spec is not None:
                    img_data = fc(img_data, spec=self._model.input_spec)
                else:
                    img_data = fc(img_data)
            img_data = image_backend_adaption(img_data)
            if self._model.input_spec is None:
                self._model.input_spec = TensorSpec(shape=tensor_to_shape(
                    to_tensor(img_data),
                    need_exclude_batch_axis=True,
                    is_singleton=True),
                                                    object_type=ObjectType.rgb,
                                                    name='input')

                self.input_shape = self._model.input_spec.shape[1:]

            return img_data
        else:
            return img_data

    def do_on_batch_end(self):
        self.training_context['time_batch_progress'] += (
            time.time() - self.training_context['time_batch_start'])
        self.training_context['time_epoch_progress'] += (
            time.time() - self.training_context['time_batch_start'])
        self.training_context['steps'] += 1
        if (self.training_context['steps'] +
                1) % _session.epoch_equivalent == 0:
            if self.warmup > 0 and self.warmup == (
                    self.training_context['steps'] +
                    1) // _session.epoch_equivalent:
                self.adjust_learning_rate(self.training_context['base_lr'])
                self.warmup = 0
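`PolicyBase` leaves `select_action`, `experience_replay` and `estimate_future_return` as stubs, so a concrete policy is expected to subclass it, override those hooks, and drive training through `learn()`. A hedged outline building on the class above (the subclass name and method bodies are placeholders, not library code):

import random

class MyDQNPolicy(PolicyBase):
    """Hypothetical subclass sketch; only the override points and the
    learn() call pattern come from PolicyBase itself."""

    def select_action(self, state, model_only=False, **kwargs):
        # Placeholder: e.g. epsilon-greedy over self.model(state).
        return self.env.action_space.sample()

    def experience_replay(self, batch_size):
        # Placeholder: draw a random batch from the underlying transition store
        # (.memory is the attribute PolicyBase.training_model() reads from).
        return random.sample(list(self.memory.memory), batch_size)

    def estimate_future_return(self, *transitions):
        # Placeholder: compute discounted targets with self.gamma and prepare
        # the loss inputs consumed by train_model().
        pass

# policy = MyDQNPolicy(network=..., env=gym.make('CartPole-v1'),
#                      use_experience_replay=True, replay_unit='step')
# policy.learn(num_episodes=300, batch_size=32, min_replay_samples=1000)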