def __init__(self, network: Layer, env: gym.Env, memory_length: int = 1000, name=None) -> None:
    self.env = env
    self.env.reset()
    self.observation_space = env.observation_space
    self.action_space = env.action_space
    self.network = network
    # self.agent_id = self.uuid
    self.memory = ReplayBuffer(memory_length)
    super().__init__(inputs=self.get_observation().repeat(2, 0), output=deepcopy(network))
    self.name = name
def __init__(self, network: Layer, env: gym.Env, memory_length: int = 1000, name=None) -> None:
    super().__init__()
    self.network = network
    if name is not None:
        self.network._name = name
    self.env = env
    self.env.reset()
    self.observation_space = env.observation_space
    self.action_space = env.action_space
    # self.agent_id = self.uuid
    self.memory = ReplayBuffer(memory_length)
    self.name = name
    self.setting_network()
def __init__(self, network: Layer, env: gym.Env, memory_length: int = 1000, name=None) -> None:
    self.network = network
    self.network.to(get_device())
    if name is not None:
        self.network._name = name
    self.env = env
    self.env.reset()
    self.observation_space = env.observation_space
    self.action_space = env.action_space
    super().__init__(inputs=to_tensor(self.get_observation()).repeat_elements(2, 0).to(get_device()),
                     output=deepcopy(self.network))
    self.setting_network()
    self.memory = ReplayBuffer(memory_length)
    self.name = name
class PolicyBase(Model):
    """The base class for any RL policy."""

    def __init__(self, network: Layer, env: gym.Env, action_strategy=None, gamma=0.99,
                 use_experience_replay=False, replay_unit='step', memory_length: int = 10000,
                 name=None) -> None:
        super().__init__()
        self.network = network
        if name is not None:
            self.network._name = name
        self.env = modify_env(env)
        self.env.reset()
        self.observation_space = env.observation_space
        self.action_space = env.action_space
        self.agent_id = uuid.uuid4().node
        self.gamma = gamma
        self.use_experience_replay = use_experience_replay
        self.memory = None
        if self.use_experience_replay:
            self.memory = ReplayBuffer(memory_length)
        else:
            self.memory = ReplayBuffer(1)
        if replay_unit not in ['step', 'episode']:
            raise ValueError('Only [step, episode] are valid replay_unit options.')
        self.replay_unit = replay_unit
        self.name = name
        self.action_strategy = action_strategy
        self.state_pool = []
        self.reward_pool = []
        self.action_pool = []
        self.value_pool = []
        self.setting_network()

    def setting_network(self):
        # Build the internal graph from a sample observation and a copy of the network.
        super()._initial_graph(inputs=self.get_observation(), output=deepcopy(self.network))

    def get_observation(self):
        # Return the current (preprocessed) observation with a leading batch axis.
        if hasattr(self.env, 'state'):
            return expand_dims(self.data_preprocess(self.env.state), 0)
        else:
            return expand_dims(self.data_preprocess(self.env.render('observation')), 0)

    def select_action(self, state, model_only=False, **kwargs):
        # Default behaviour: sample a random action; concrete policies override this.
        return self.env.action_space.sample()

    def get_rewards(self, action):
        return self.env.step(action)

    def experience_replay(self, batch_size):
        return NotImplemented

    def collect_samples(self, min_replay_samples, need_render=False) -> bool:
        """Run the environment until the replay buffer holds `min_replay_samples` samples."""
        if self.memory is None:
            self.memory = ReplayBuffer(10000)
        progress_interval = max(int(min_replay_samples / 50) * 5, 1)
        self.state_pool = []
        self.reward_pool = []
        self.action_pool = []
        for i_episode in range(min_replay_samples):
            self.env.reset()
            state = self.get_observation()
            for t in count():
                action = self.select_action(
                    state,
                    model_only=True if self.action_strategy == ActionStrategy.OnPolicy else False)
                _observation, reward, done, info = self.get_rewards(action)
                if need_render:
                    self.env.render()
                next_state = None
                if not done:
                    next_state = self.get_observation()
                if self.replay_unit == 'step':
                    self.memory.push(state, action, next_state, reward)
                    if len(self.memory) < min_replay_samples and len(self.memory) % progress_interval == 0:
                        print("Replay Samples:{0}".format(len(self.memory)))
                    if len(self.memory) == min_replay_samples:
                        return True
                elif self.replay_unit == 'episode':
                    self.state_pool.append(state)
                    self.action_pool.append(action)
                    self.reward_pool.append(reward)
                    if done:
                        self.memory.push(self.state_pool, self.action_pool, None, self.reward_pool)
                        if len(self.memory) < min_replay_samples and len(self.memory) % progress_interval == 0:
                            print("Replay Samples:{0}".format(len(self.memory)))
                        self.state_pool = []
                        self.action_pool = []
                        self.reward_pool = []
                        if len(self.memory) == min_replay_samples:
                            return True
                        break
                state = next_state
                if done:
                    break
        return False

    def push_into_memory_criteria(self, *args, **kwargs) -> bool:
        return True

    def episode_complete_criteria(self, *args, **kwargs) -> bool:
        return False

    def task_complete_criteria(self, *args, **kwargs) -> bool:
        return False

    def estimate_future_return(self, *args, **kwargs):
        return NotImplemented

    def save_or_sync_weights(self):
        self.save_model(save_path=self.training_context['save_path'])

    def training_model(self, current_episode=0, current_step=0, num_episodes=100, train_timing=None,
                       done=False, batch_size=1, repeat_train=1):
        is_collect_data = False
        for i in range(repeat_train):
            data = None
            if self.use_experience_replay:
                data = self.experience_replay(batch_size)
            else:
                data = self.memory.memory[0]
            self.estimate_future_return(*data)
            self.training_context['skip_generate_output'] = True
            if 'step' in train_timing:
                current_step = current_step * repeat_train + i
                if done:
                    total_batch = current_step * repeat_train + i + 1
                    is_collect_data = True
                else:
                    total_batch = current_step * repeat_train + i + 10
            elif 'episode' in train_timing:
                current_step = i
                total_batch = repeat_train
            super(PolicyBase, self).train_model(
                self.training_context['train_data'],
                self.training_context['test_data'],
                current_epoch=current_episode,
                current_batch=current_step,
                total_epoch=num_episodes,
                total_batch=total_batch,
                is_collect_data=True,
                is_print_batch_progress=False,
                is_print_epoch_progress=False,
                log_gradients=False,
                log_weights=False,
                accumulate_grads=(current_step * repeat_train + 1) % self.accumulation_steps != 0)
        self.save_or_sync_weights()

    def play(self, num_episodes, batch_size=1, min_replay_samples=1, print_progess_frequency=5,
             training=True, train_timing='on_episode_start', train_every_nstep=1, repeat_train=1,
             need_render=True):
        if train_timing not in ['on_episode_start', 'on_step_end', 'on_step_start']:
            raise ValueError('Only on_episode_start, on_step_start and on_step_end are valid train_timing options.')
        if training:
            self._model.train()
        else:
            self._model.eval()
        if self.use_experience_replay:
            self.collect_samples(min_replay_samples=min_replay_samples)
        else:
            self.collect_samples(min_replay_samples=1,
                                 need_render=True if self.replay_unit == 'episode' else False)
        print('start training....')
        self.state_pool = []
        self.reward_pool = []
        self.action_pool = []
        self.total_rewards = 0
        self.t = 0
        self.i_episode = 0
        if hasattr(self.env, 'recording_enabled'):
            self.env.recording_enabled = True
        for i_episode in range(num_episodes):
            self.i_episode = i_episode
            if training and train_timing == 'on_episode_start' and i_episode % train_every_nstep == 0:
                self.training_model(i_episode, 0, num_episodes=num_episodes, repeat_train=repeat_train,
                                    train_timing=train_timing, batch_size=batch_size)
            self.env.reset()
            self.total_rewards = 0
            state = self.get_observation()
            for t in count():
                self.t = t
                # # Train on_step_start
                # if training and train_timing == 'on_step_start' and t % train_every_nstep == 0:
                #     self.training_model(i_episode, t, num_episodes=num_episodes, repeat_train=repeat_train, batch_size=batch_size)
                action = self.select_action(state, model_only=True)
                observation, reward, done, info = self.get_rewards(action)
                self.total_rewards += reward
                next_state = self.get_observation() if not done else None
                if need_render:
                    self.env.render()
                if self.replay_unit == 'step':
                    if self.push_into_memory_criteria(state, action, next_state, reward) or done:
                        self.memory.push(state, action, next_state, reward)
                elif self.replay_unit == 'episode':
                    self.state_pool.append(state)
                    self.action_pool.append(action)
                    self.reward_pool.append(reward)
                    if done:
                        if self.push_into_memory_criteria(self.state_pool, self.action_pool, None, self.reward_pool):
                            self.memory.push(self.state_pool, self.action_pool, None, self.reward_pool)
                        self.state_pool = []
                        self.action_pool = []
                        self.reward_pool = []
                complete = self.episode_complete_criteria()
                # Train on_step_end
                if training and train_timing == 'on_step_end' and t % train_every_nstep == 0:
                    self.training_model(i_episode, t, num_episodes=num_episodes, done=done or complete,
                                        repeat_train=repeat_train, train_timing=train_timing,
                                        batch_size=batch_size)
                state = next_state
                if done or complete:
                    self.epoch_metric_history.collect('rewards', i_episode, float(self.total_rewards))
                    self.epoch_metric_history.collect('t', i_episode, float(t + 1))
                    if self.use_experience_replay:
                        self.epoch_metric_history.collect('replay_buffer_utility', i_episode,
                                                          float(len(self.memory)) / self.memory.capacity)
                    if print_progess_frequency == 1 or (i_episode > 0 and (i_episode + 1) % print_progess_frequency == 0):
                        self.print_epoch_progress(print_progess_frequency)
                    # Periodically plot the loss and evaluation-metric trends over time.
                    if i_episode > 0 and (i_episode + 1) % (5 * print_progess_frequency) == 0:
                        loss_metric_curve(self.epoch_loss_history, self.epoch_metric_history,
                                          metrics_names=list(self.epoch_metric_history.keys()),
                                          calculate_base='epoch', imshow=True)
                    if self.task_complete_criteria():
                        self.save_model(save_path=self.training_context['save_path'])
                        print('Episode {0} meets the task-complete criteria; training finished!'.format(i_episode))
                        return True
                    break
        print('Complete')
        self.env.render()
        self.env.close()

    def learn(self, num_episodes, batch_size=1, min_replay_samples=1, print_progess_frequency=5,
              train_timing='on_episode_start', train_every_nstep=1, repeat_train=1, accumulate_grads=False):
        self.play(num_episodes=num_episodes, batch_size=batch_size, min_replay_samples=min_replay_samples,
                  print_progess_frequency=print_progess_frequency, training=True, train_timing=train_timing,
                  train_every_nstep=train_every_nstep, repeat_train=repeat_train, need_render=True)

    def resume(self, num_episodes=3000, **kwargs):
        pass

    @property
    def preprocess_flow(self):
        return self._preprocess_flow

    @preprocess_flow.setter
    def preprocess_flow(self, value):
        self._preprocess_flow = value
        objecttype = None
        if isinstance(self.model.input_spec, TensorSpec):
            objecttype = self.model.input_spec.object_type
        self.setting_network()
        if objecttype is not None:
            self.inputs.value_list[0].object_type = objecttype
            self.model.input_spec.object_type = objecttype
        self.env.reset()

    def data_preprocess(self, img_data):
        if self._model is not None and self._model.input_spec is not None:
            self._model.input_spec.object_type = ObjectType.rgb
        if not hasattr(self, '_preprocess_flow') or self._preprocess_flow is None:
            self._preprocess_flow = []
        if img_data.ndim == 4:
            return to_tensor(to_numpy([self.data_preprocess(im) for im in img_data]))
        if len(self._preprocess_flow) == 0:
            return image_backend_adaption(img_data)
        if isinstance(img_data, np.ndarray):
            for fc in self._preprocess_flow:
                if self._model is not None and self.signature is not None and len(self.signature) > 1 and self._model.input_spec is not None:
                    img_data = fc(img_data, spec=self._model.input_spec)
                else:
                    img_data = fc(img_data)
            img_data = image_backend_adaption(img_data)
            if self._model.input_spec is None:
                self._model.input_spec = TensorSpec(
                    shape=tensor_to_shape(to_tensor(img_data), need_exclude_batch_axis=True, is_singleton=True),
                    object_type=ObjectType.rgb,
                    name='input')
                self.input_shape = self._model.input_spec.shape[1:]
            return img_data
        else:
            return img_data

    def do_on_batch_end(self):
        self.training_context['time_batch_progress'] += (time.time() - self.training_context['time_batch_start'])
        self.training_context['time_epoch_progress'] += (time.time() - self.training_context['time_batch_start'])
        self.training_context['steps'] += 1
        if (self.training_context['steps'] + 1) % _session.epoch_equivalent == 0:
            if self.warmup > 0 and self.warmup == (self.training_context['steps'] + 1) // _session.epoch_equivalent:
                self.adjust_learning_rate(self.training_context['base_lr'])
                self.warmup = 0
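# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of PolicyBase): concrete policies are
# expected to override estimate_future_return(), and for replay_unit='episode'
# that usually means converting an episode's reward_pool into discounted
# returns using the gamma passed to __init__. The helper name below and its
# stand-alone style are assumptions made for illustration; it is not an API of
# this class or of the surrounding library.
# ---------------------------------------------------------------------------
def _example_discounted_returns(reward_pool, gamma=0.99):
    """Compute G_t = r_t + gamma * G_{t+1} for every step of one episode."""
    returns = [0.0] * len(reward_pool)
    running = 0.0
    # Walk the episode backwards so each step accumulates the discounted future reward.
    for t in reversed(range(len(reward_pool))):
        running = reward_pool[t] + gamma * running
        returns[t] = running
    return returns


# Example: _example_discounted_returns([1.0, 1.0, 1.0], gamma=0.9) -> [2.71, 1.9, 1.0]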