import time

import cv2

# Webcam, RelevancePropagation, VideoRecorder and post_processing are
# project-local helpers assumed to be importable from the surrounding package.


def real_time_lrp(conf):
    """Displays feature relevance scores in real time.

    Args:
        conf: Dictionary of configuration parameters.
    """
    record_video = conf["playback"]["record_video"]

    webcam = Webcam()
    lrp = RelevancePropagation(conf)

    if record_video:
        recorder = VideoRecorder(conf)

    while True:
        t0 = time.time()

        frame = webcam.get_frame()
        heatmap = lrp.run(frame)
        heatmap = post_processing(frame, heatmap, conf)
        cv2.imshow("LRP", heatmap)

        if record_video:
            recorder.record(heatmap)

        t1 = time.time()
        fps = 1.0 / (t1 - t0)
        print("{:.1f} FPS".format(fps))

        # 27 is the ASCII code of the Escape key.
        if cv2.waitKey(1) % 256 == 27:
            print("Escape pressed.")
            break

    if record_video:
        recorder.release()
    webcam.turn_off()
    cv2.destroyAllWindows()
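# --- Usage sketch (not part of the original module) -----------------------
# A minimal, hypothetical driver for real_time_lrp. Only the
# conf["playback"]["record_video"] key is confirmed by the function itself;
# every other entry depends on what RelevancePropagation, post_processing
# and VideoRecorder expect, so treat this dict as a placeholder.
if __name__ == "__main__":
    conf = {
        "playback": {"record_video": False},  # confirmed by real_time_lrp
        # ... settings consumed by RelevancePropagation(conf),
        # post_processing(frame, heatmap, conf) and VideoRecorder(conf).
    }
    real_time_lrp(conf)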
class Workspace:
    def __init__(self, cfg):
        self.cfg = cfg
        if not os.path.exists(cfg.workdir):
            os.makedirs(cfg.workdir)
        Tracer.FILE = os.path.join(
            cfg.workdir,
            "log_" + datetime.now().strftime("%Y%m%d%H%M%S") + ".txt")

        self.progress = OO(epoch=1,
                           exploration_noise=1,
                           num_train_iteration=0,
                           num_critic_update_iteration=0,
                           num_actor_update_iteration=0)
        self.progress.update(episode=1,
                             episode_step=0,
                             global_step=0,
                             env_top_reward=0,
                             evaluate_reward=0,
                             eval_top_reward=-9e99,
                             train_running_reward=0,
                             exploration_rate_epsilon=1)

        self.workdir = cfg.workdir
        self.meter = Meter(cfg, cfg.workdir)
        self.env = gym.make(cfg.env.name)
        env = self.env

        # Values defined here override command-line arguments, call-site
        # arguments and parameters defined in the reloaded config file.
        cfg.override(
            device=torch.device(cfg.device),
            agent=OO(device="${device}").__dict__,
        )
        cfg.override(env=OO(
            spec=env.spec,
            reward_range=env.reward_range,
            state_space_dim=env.observation_space.shape,
            state_line_dim=np.prod(env.observation_space.shape),
            state_digest_dim=128,
            # 1 for a discrete action space, 0 for a continuous one.
            action_discrete=1 if isinstance(env.action_space,
                                            gym.spaces.Discrete) else 0,
            # Action dimensionality.
            action_dim=env.action_space.n
            if isinstance(env.action_space, gym.spaces.Discrete)
            else env.action_space.shape[0],
            # Lower bound of the action values.
            action_min=0
            if isinstance(env.action_space, gym.spaces.Discrete)
            else float(env.action_space.low[0]),
            # Upper bound of the action values.
            action_max=env.action_space.n
            if isinstance(env.action_space, gym.spaces.Discrete)
            else float(env.action_space.high[0]),
        ))
        cfg.override(agent=OO(
            state_space_dim=cfg.env.state_space_dim,
            state_line_dim=cfg.env.state_line_dim,
            state_digest_dim=cfg.env.state_digest_dim,
            action_discrete=cfg.env.action_discrete,
            action_dim=cfg.env.action_dim,
            action_min=cfg.env.action_min,
            action_max=cfg.env.action_max,
            actor=OO(lr=1e-2, betas=[0.9, 0.999]),
            critic=OO(lr=1e-2, betas=[0.9, 0.999]),
        ))
        assert 0 <= self.cfg.exploration_rate_init < 1
        Tracer.trace(self.cfg)

        self.video_recorder = VideoRecorder()
        self.replay_buffer = ReplayBuffer(int(cfg.replay_buffer_capacity))
        self.agent = cfg.agent_class(cfg, self.progress, self.meter,
                                     self.replay_buffer)
        self.temp_buffer = []
        self.load()
        self.last_save_time = time.time()
        self.last_rest_time = time.time()
        self.last_log_time = time.time()
        self.last_meter_time = time.time()
        print("-" * 150)

    def save(self, best_model=False):
        def safe_torch_save(obj, filename):
            # Write to a temp file first so an interrupted save never
            # corrupts the previous checkpoint.
            torch.save(obj, filename + '.tmp')
            if os.access(filename, os.F_OK):
                os.remove(filename)
            os.rename(filename + '.tmp', filename)

        agent_model = {}
        for (name, mod) in self.agent.modules.items():
            agent_model[name] = mod.state_dict()
        safe_torch_save(agent_model, self.cfg.workdir + '/training_model.drl')
        if best_model:
            safe_torch_save(agent_model,
                            self.cfg.workdir + '/agent_model.drl')

        progress = {
            'progress': self.progress.__dict__,
            'meters.train': self.meter.train_mg.data,
            'meters.eval': self.meter.eval_mg.data,
        }
        for (name, optim) in self.agent.optimizers.items():
            progress[name] = optim.state_dict()
        safe_torch_save(progress, self.cfg.workdir + '/progress.drl')

        safe_torch_save({'memory': self.replay_buffer.__dict__},
                        self.cfg.workdir + '/replay_buffer.drl')
        if 'save_prompt_message' in self.cfg and self.cfg.save_prompt_message:
            Tracer.trace("Model has been saved...")

    def load(self):
        def safe_torch_load(filename):
            if os.path.exists(filename):
                return torch.load(filename)
            elif os.path.exists(filename + '.tmp'):
                return torch.load(filename + '.tmp')
            else:
                return None

        o = safe_torch_load(self.cfg.workdir + '/agent_model.drl')
        if o is not None:
            for (name, mod) in self.agent.modules.items():
                if name in o:
                    mod.load_state_dict(o[name])

        o = safe_torch_load(self.cfg.workdir + '/progress.drl')
        if o is not None:
            for (name, optim) in self.agent.optimizers.items():
                if name in o:
                    optim.load_state_dict(o[name])
            if 'progress' in o:
                self.progress.__dict__.update(o['progress'])
            if 'meters.train' in o:
                self.meter.train_mg.data = o['meters.train']
            if 'meters.eval' in o:
                self.meter.eval_mg.data = o['meters.eval']

        o = safe_torch_load(self.cfg.workdir + '/replay_buffer.drl')
        if o is not None and 'memory' in o:
            self.replay_buffer.__dict__.update(o['memory'])
        Tracer.trace("Model has been loaded...")

    def evaluate(self):
        self.video_recorder.init()
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            state = self.env.reset()
            pre_state = state
            self.agent.reset()
            done = False
            episode_reward = 0
            step = 0
            while not done and step < self.cfg.eval_max_frame:
                with eval_mode(self.agent):
                    # Epsilon of 0: act greedily during evaluation.
                    action, action_prob, _ = self.agent.action(
                        pre_state, state, 0)
                pre_state = state
                state, reward, done, _ = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                step += 1

                # Yield the CPU in proportion to the time spent working.
                work_time = time.time() - self.last_rest_time
                time.sleep((work_time * self.cfg.running_idle_rate)
                           if work_time > 0 else 0.01)
                self.last_rest_time = time.time()
            average_episode_reward += episode_reward

        average_episode_reward /= self.cfg.num_eval_episodes
        self.meter.log('eval/episode_reward', average_episode_reward,
                       self.progress.global_step)
        self.meter.dump(self.progress.global_step)
        return average_episode_reward

    def push_replay_buffer(self, info):
        # info layout: ([id], pre_state, state, state_digest, next_state,
        # action, action_prob, [reward], [done]), so info[-1][0] is the done
        # flag and info[-2][0] is the reward.
        if self.cfg.reward_forward == 0:
            self.replay_buffer.push(info)
        else:
            self.temp_buffer += [info]
            if info[-1][0] != 0:
                # Episode finished: propagate rewards backwards so each
                # non-terminal step also receives a discounted share of its
                # successor's reward, then flush the whole episode.
                for i in reversed(range(len(self.temp_buffer) - 1)):
                    if self.temp_buffer[i][-1][0] == 0:
                        self.temp_buffer[i][-2][0] += (
                            self.cfg.reward_forward *
                            self.temp_buffer[i + 1][-2][0])
                for i in range(len(self.temp_buffer)):
                    self.replay_buffer.push(self.temp_buffer[i])
                self.temp_buffer.clear()

    def train_episode(self, train_max_frame, exploration_rate_epsilon):
        progress = self.progress
        state = self.env.reset()
        pre_state = state
        self.agent.reset()
        done_or_stop = False
        episode_reward = 0
        episode_step = 0
        self.meter.log('train/episode', progress.episode,
                       progress.global_step)
        while not done_or_stop:
            if self.cfg.env.render:
                self.env.render()

            # Sample an action.
            action, action_prob, state_digest = self.agent.action(
                pre_state, state, exploration_rate_epsilon)

            # Take one environment step.
            next_state, reward, done, info = self.env.step(action)
            # The observation the env returns may differ from the declared
            # observation_space, e.g. when an action covers several
            # continuous sub-actions.
            assert next_state.shape == self.env.observation_space.shape
            episode_reward += reward

            # if cfg.env_name == "Breakout-ram-v4":
            #     s = next_state * (state != next_state)
            #     s[90] = 0
            #     if np.sum(s) == 0:
            #         done = True

            done_or_stop = done or (train_max_frame > 0
                                    and episode_step == train_max_frame)
            self.push_replay_buffer(
                ([str(progress.global_step)], pre_state, state, state_digest,
                 next_state, action, action_prob, [reward],
                 [float(done_or_stop)]))

            # Run training updates once the buffer holds a full batch.
            if self.replay_buffer.data_count >= self.cfg.batch_size:
                for i in range(self.cfg.batch_train_episodes):
                    self.agent.update(self.cfg.batch_size)
                    self.progress.num_train_iteration += 1

            work_time = time.time() - self.last_rest_time
            time.sleep((work_time * self.cfg.running_idle_rate)
                       if work_time > 0 else 0.01)
            self.last_rest_time = time.time()

            # Save periodically; also save if the clock jumped backwards.
            if (time.time() < self.last_save_time
                    or time.time() - self.last_save_time
                    >= self.cfg.save_exceed_seconds):
                self.save()
                self.last_save_time = time.time()

            pre_state = state
            state = next_state
            episode_step += 1
            progress.global_step += 1
        return episode_reward

    def train(self):
        def set_random_seed(seed):
            if seed is not None:
                random.seed(seed)
                self.env.seed(seed)
                torch.manual_seed(seed)
                if torch.cuda.is_available():
                    torch.cuda.manual_seed_all(seed)
                np.random.seed(seed)

        # Training loop.
        set_random_seed(self.cfg.seed)
        self.agent.train()
        progress = self.progress
        while (self.cfg.num_train_epochs < 0
               or progress.episode <= self.cfg.num_train_epochs) \
                and (self.cfg.train_stop_reward < 0
                     or progress.train_running_reward
                     < self.cfg.train_stop_reward) \
                and progress.train_running_reward < progress.env_top_reward:
            self.cfg.reload()

            train_max_frame = self.cfg.train_max_frame
            if (isinstance(self.env.spec.max_episode_steps, int)
                    and self.env.spec.max_episode_steps < train_max_frame):
                train_max_frame = self.env.spec.max_episode_steps

            if self.progress.global_step < self.cfg.num_seed_steps:
                exploration_rate_epsilon = 1
            elif self.progress.exploration_rate_epsilon == 1:
                # First episode after seeding: start the decay schedule.
                exploration_rate_epsilon = \
                    self.progress.exploration_rate_epsilon = \
                    self.cfg.exploration_rate_init
            else:
                exploration_rate_epsilon = \
                    self.progress.exploration_rate_epsilon

            start_time = time.time()
            episode_reward = self.train_episode(train_max_frame,
                                                exploration_rate_epsilon)

            # Exponential moving average of the episode reward.
            if progress.episode > 1:
                progress.train_running_reward = (
                    0.05 * episode_reward +
                    (1 - 0.05) * progress.train_running_reward)
            else:
                progress.train_running_reward = episode_reward

            self.meter.log('train/epsilon', exploration_rate_epsilon,
                           progress.global_step)
            self.meter.log('train/episode_reward', episode_reward,
                           progress.global_step)
            self.meter.log('train/running_reward',
                           progress.train_running_reward,
                           progress.global_step)
            self.meter.log('train/duration', time.time() - start_time,
                           progress.global_step)
            self.meter.dump(progress.global_step)

            # Decay epsilon down to its floor.
            if self.progress.exploration_rate_epsilon < 1:
                if (self.progress.exploration_rate_epsilon
                        > self.cfg.exploration_rate_min):
                    self.progress.exploration_rate_epsilon *= \
                        self.cfg.exploration_rate_decay
                if (self.progress.exploration_rate_epsilon
                        < self.cfg.exploration_rate_min):
                    self.progress.exploration_rate_epsilon = \
                        self.cfg.exploration_rate_min

            # Evaluate once the running reward gets close to the top reward.
            if (progress.train_running_reward > progress.env_top_reward
                    * self.cfg.eval_expect_top_reward_percent):
                self.meter.log('eval/episode', progress.episode,
                               progress.global_step)
                self.progress.evaluate_reward = self.evaluate()
                if (self.progress.evaluate_reward
                        > self.progress.eval_top_reward):
                    self.save(best_model=True)
                    if (self.progress.evaluate_reward
                            > self.cfg.save_video_exceed_reward):
                        vrfile = f'{self.workdir}/eval_top_reward.mp4'
                        self.video_recorder.save(vrfile)
                    self.progress.eval_top_reward = \
                        self.progress.evaluate_reward

            work_time = time.time() - self.last_rest_time
            time.sleep((work_time * self.cfg.running_idle_rate)
                       if work_time > 0 else 0.01)
            self.last_rest_time = time.time()

            progress.episode += 1  # Finished one episode.
        return progress.train_running_reward

    def run(self):
        while True:
            # Keep env_top_reward at the highest known reward target:
            # the env's own threshold, the configured stop reward, or the
            # best evaluation reward seen so far.
            if (isinstance(self.env.spec.reward_threshold, float)
                    and self.env.spec.reward_threshold
                    > self.progress.env_top_reward):
                self.progress.env_top_reward = \
                    self.env.spec.reward_threshold
            elif self.cfg.train_stop_reward > self.progress.env_top_reward:
                self.progress.env_top_reward = self.cfg.train_stop_reward
            elif (self.progress.evaluate_reward
                  > self.progress.env_top_reward):
                self.progress.env_top_reward = self.progress.evaluate_reward

            if (self.progress.train_running_reward
                    < self.progress.env_top_reward):
                self.progress.train_running_reward = self.train()
            else:
                self.progress.evaluate_reward = self.evaluate()
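
# --- Usage sketch (not part of the original module) -----------------------
# A minimal, hypothetical entry point for Workspace. The `Config` loader
# name and the "config.yaml" path are assumptions; the real project must
# supply a cfg object exposing the fields read above (workdir, env.name,
# device, agent_class, seed, replay_buffer_capacity, ...).
if __name__ == "__main__":
    cfg = Config("config.yaml")  # hypothetical config loader
    workspace = Workspace(cfg)
    # run() keeps training until the running reward reaches env_top_reward,
    # then switches to evaluation-only episodes.
    workspace.run()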