def __init__(self, cfg):
    self.work_dir = os.getcwd()
    print(f'workspace: {self.work_dir}')

    self.cfg = cfg

    self.logger = Logger(self.work_dir,
                         save_tb=cfg.log_save_tb,
                         log_frequency=cfg.log_frequency_step,
                         agent=cfg.agent.name,
                         action_repeat=cfg.action_repeat)

    utils.set_seed_everywhere(cfg.seed)
    self.device = torch.device(cfg.device)
    self.env = make_env(cfg)

    cfg.agent.params.obs_shape = self.env.observation_space.shape
    print(self.env.action_space.shape)
    # discrete action space: the agent receives an (n,)-shaped action and a
    # hard-coded index range
    cfg.agent.params.action_shape = (self.env.action_space.n,)
    cfg.agent.params.action_range = [0, 12]
    self.agent = hydra.utils.instantiate(cfg.agent)

    self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                      self.env.action_space.shape,
                                      cfg.replay_buffer_capacity,
                                      self.cfg.image_pad, self.device)

    self.video_recorder = VideoRecorder(
        self.work_dir if cfg.save_video else None)
    self.step = 0
def __init__(self, cfg):
    self.work_dir = os.getcwd()
    print(f'workspace: {self.work_dir}')

    self.cfg = cfg

    self.logger = Logger(self.work_dir,
                         save_tb=cfg.log_save_tb,
                         log_frequency=cfg.log_frequency,
                         agent=cfg.agent.name)

    utils.set_seed_everywhere(cfg.seed)
    self.device = torch.device(cfg.device)
    self.env = utils.make_env(cfg)

    cfg.agent.params.obs_dim = self.env.observation_space.shape[0]
    cfg.agent.params.action_dim = self.env.action_space.shape[0]
    cfg.agent.params.action_range = [
        float(self.env.action_space.low.min()),
        float(self.env.action_space.high.max())
    ]
    self.agent = hydra.utils.instantiate(cfg.agent)

    self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                      self.env.action_space.shape,
                                      int(cfg.replay_buffer_capacity),
                                      self.device)

    self.video_recorder = VideoRecorder(
        self.work_dir if cfg.save_video else None)
    self.step = 0
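# Nearly every snippet here calls utils.set_seed_everywhere before touching the
# env or the agent. A minimal sketch of what such a helper typically does; the
# exact implementation in these repos may differ:
import random

import numpy as np
import torch


def set_seed_everywhere(seed):
    # seed torch (CPU and all CUDA devices), numpy, and Python's own RNG
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)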
def __init__(self, cfg):
    self.work_dir = os.getcwd()
    print(f'workspace: {self.work_dir}')

    self.cfg = cfg

    self.logger = Logger(self.work_dir,
                         save_tb=cfg.log_save_tb,
                         log_frequency=cfg.log_frequency,
                         agent=cfg.agent.name)

    utils.set_seed_everywhere(cfg.seed)
    self.device = torch.device(cfg.device)
    self.train_envs, self.test_envs = utils.make_env(cfg)

    cfg.agent.params.obs_dim = self.train_envs[0].observation_space.shape[0] + cfg.noise_dims
    cfg.agent.params.action_dim = self.train_envs[0].action_space.shape[0]
    if cfg.agent.name != 'sac':
        cfg.agent.params.num_envs = cfg.num_train_envs
    cfg.agent.params.action_range = [
        float(self.train_envs[0].action_space.low.min()),
        float(self.train_envs[0].action_space.high.max())
    ]
    self.agent = hydra.utils.instantiate(cfg.agent)
    self.agent.seq_len = cfg.seq_len

    self.replay_buffer = MultiEnvReplayBuffer(
        (cfg.agent.params.obs_dim,),  # hard coded
        self.train_envs[0].action_space.shape,
        int(cfg.replay_buffer_capacity),
        self.device,
        num_envs=cfg.num_train_envs,
        seq_len=cfg.seq_len)

    self.video_recorder = VideoRecorder(
        self.work_dir if cfg.save_video else None)
    self.step = [0] * cfg.num_train_envs
def __init__(self, cfg):
    self.work_dir = os.getcwd()
    print(f'workspace: {self.work_dir}')
    self.model_dir = utils.make_dir(self.work_dir, 'model')
    self.buffer_dir = utils.make_dir(self.work_dir, 'buffer')

    self.cfg = cfg

    self.logger = Logger(self.work_dir,
                         save_tb=cfg.log_save_tb,
                         log_frequency=cfg.log_frequency_step,
                         action_repeat=cfg.action_repeat,
                         agent=cfg.agent.name)

    utils.set_seed_everywhere(cfg.seed)
    self.device = torch.device(cfg.device)
    self.env = dmc.make(cfg.env, cfg.frame_stack, cfg.action_repeat, cfg.seed)
    self.eval_env = dmc.make(cfg.env, cfg.frame_stack, cfg.action_repeat,
                             cfg.seed + 1)

    obs_spec = self.env.observation_spec()['pixels']
    action_spec = self.env.action_spec()

    cfg.agent.params.obs_shape = obs_spec.shape
    cfg.agent.params.action_shape = action_spec.shape
    cfg.agent.params.action_range = [
        float(action_spec.minimum.min()),
        float(action_spec.maximum.max())
    ]

    # the exploration agent uses intrinsic reward
    self.expl_agent = hydra.utils.instantiate(cfg.agent, task_agnostic=True)
    # the task agent uses extrinsic reward
    self.task_agent = hydra.utils.instantiate(cfg.agent, task_agnostic=False)
    self.task_agent.assign_modules_from(self.expl_agent)

    if cfg.load_pretrained:
        pretrained_path = utils.find_pretrained_agent(
            cfg.pretrained_dir, cfg.env, cfg.seed, cfg.pretrained_step)
        print(f'snapshot is taken from: {pretrained_path}')
        pretrained_agent = utils.load(pretrained_path)
        self.task_agent.assign_modules_from(pretrained_agent)

    # buffer for the task-agnostic phase
    self.expl_buffer = ReplayBuffer(obs_spec.shape, action_spec.shape,
                                    cfg.replay_buffer_capacity, self.device)
    # buffer for the task-specific phase
    self.task_buffer = ReplayBuffer(obs_spec.shape, action_spec.shape,
                                    cfg.replay_buffer_capacity, self.device)

    self.eval_video_recorder = VideoRecorder(
        self.work_dir if cfg.save_video else None)
    self.step = 0
def __init__(self, cfg):
    self.work_dir = os.getcwd()

    # Hack to adjust action_repeat
    adjust_action_repeat_hack(cfg)

    print(f"CFG:\n{'-'*100}\n{cfg}\n{'-'*100}")
    self.cfg = cfg

    experiment_name = f"{cfg.full_title}_{cfg.run_id}"
    self.logger = Logger(self.work_dir,
                         save_tb=cfg.log_save_tb,
                         save_wb=cfg.log_save_wandb,
                         log_frequency=cfg.log_frequency_step,
                         agent=cfg.agent.name,
                         action_repeat=cfg.action_repeat,
                         cfg=dict(flatten_cfg(cfg)),
                         plot_project="drqtest",
                         experiment=experiment_name)

    utils.set_seed_everywhere(cfg.seed)
    self.device = torch.device(cfg.device)
    self.env = make_env(cfg)

    cfg.agent.params.obs_shape = self.env.observation_space.shape
    cfg.agent.params.action_shape = self.env.action_space.shape
    cfg.agent.params.action_range = [
        float(self.env.action_space.low.min()),
        float(self.env.action_space.high.max())
    ]
    self.agent = hydra.utils.instantiate(cfg.agent)
    print(f"ACTOR:\n{'-'*100}\n{self.agent.actor}\n{'-'*100}")
    print(f"CRITIC:\n{'-'*100}\n{self.agent.critic}\n{'-'*100}")

    self.replay_buffer = ReplayBuffer(
        self.env.observation_space.shape,
        self.env.action_space.shape,
        cfg.replay_buffer_capacity,
        self.cfg.image_pad,
        self.device,
        use_aug=cfg.replay_buffer_augmentation)

    self.video_recorder = VideoRecorder(
        self.work_dir if cfg.save_video else None)
    self.step = 0
def __init__(self, cfg):
    self.work_dir = os.getcwd()
    print(f"workspace: {self.work_dir}")

    self.cfg = cfg

    self.logger = Logger(
        self.work_dir,
        save_tb=cfg.log_save_tb,
        log_frequency=cfg.log_frequency_step,
        agent=cfg.agent.name,
        action_repeat=cfg.action_repeat,
    )

    utils.set_seed_everywhere(cfg.seed)
    self.device = torch.device(cfg.device)
    self.env = make_env(cfg, eval=False)

    cfg.agent.params.obs_shape = self.env.observation_space.shape
    cfg.agent.params.action_shape = self.env.action_space.shape
    cfg.agent.params.action_range = [
        float(self.env.action_space.low.min()),
        float(self.env.action_space.high.max()),
    ]
    self.agent = hydra.utils.instantiate(cfg.agent)

    self.replay_buffer = ReplayBuffer(
        self.env.observation_space.shape,
        self.env.action_space.shape,
        self.env.state_space.shape,
        cfg.replay_buffer_capacity,
        self.cfg.image_size,
        self.agent.random_encoder,
        self.cfg.aug_type,
        self.cfg.use_drq,
        self.device,
    )

    self.video_recorder = VideoRecorder(
        self.work_dir if cfg.save_video else None)
    self.step = 0
def __init__(self, cfg):
    self.work_dir = os.getcwd()
    print(f'workspace: {self.work_dir}')

    self.cfg = cfg

    self.logger = Logger(self.work_dir,
                         save_tb=cfg.log_save_tb,
                         log_frequency=cfg.log_frequency_step,
                         agent=cfg.agent.name,
                         action_repeat=cfg.action_repeat)

    utils.set_seed_everywhere(cfg.seed)
    self.device = torch.device(cfg.device)
    self.env = make_env(cfg, self.logger)

    self.eval_env = gym.make(cfg.env)
    if "img_only" not in cfg.env:
        self.eval_env = DictToBoxWrapper(DictTransposeImage(self.eval_env))
    else:
        self.eval_env = TransposeImage(self.eval_env)
    # env = utils.FrameStack(env, k=cfg.frame_stack)
    self.eval_env.seed(cfg.seed + 111)

    cfg.agent.params.obs_shape = self.env.observation_space.shape
    cfg.agent.params.action_shape = self.env.action_space.shape
    cfg.agent.params.action_range = [
        float(self.env.action_space.low.min()),
        float(self.env.action_space.high.max())
    ]
    self.agent = hydra.utils.instantiate(cfg.agent)

    self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                      self.env.action_space.shape,
                                      cfg.replay_buffer_capacity,
                                      self.cfg.image_pad, self.device)

    self.video_recorder = VideoRecorder(
        self.work_dir if cfg.save_video else None)
    self.step = 0
def main(args):
    # Initialize environment
    env = init_env(args)
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    video = VideoRecorder(video_dir if args.save_video else None,
                          height=448,
                          width=448,
                          camera_id=args.video_camera_id)

    # Prepare agent
    assert torch.cuda.is_available(), 'must have cuda enabled'
    device = torch.device(args.device)
    # cropped_obs_shape = (3 * args.frame_stack, 84, 84)
    agent = make_agent(obs_shape=env.observation_space.shape,
                       action_shape=env.action_space.shape,
                       action_range=[
                           float(env.action_space.low.min()),
                           float(env.action_space.high.max())
                       ],
                       device=device,
                       args=args)
    agent.load(model_dir, args.load_checkpoint)

    # Evaluate agent without PAD
    print(
        f'Evaluating {args.work_dir} for {args.num_eval_episodes} episodes (mode: {args.mode})'
    )
    eval_reward, eval_invs_pred_var = evaluate(env, agent, args, video)
    print('eval reward:', int(eval_reward))
    print('eval inverse predictor variance:', eval_invs_pred_var)

    # # Evaluate agent with PAD (if applicable)
    # pad_reward = None
    # if args.use_inv or args.use_curl or args.use_rot:
    #     env = init_env(args)
    #     print(f'Policy Adaptation during Deployment of {args.work_dir} for {args.pad_num_episodes} episodes '
    #           f'(mode: {args.mode})')
    #     pad_reward = evaluate(env, agent, args, video, adapt=True)
    #     print('pad reward:', int(pad_reward))

    # Save results
    if args.eval_results:
        results_fp = os.path.join(args.work_dir, '{}.pt'.format(args.mode))
        torch.save(
            {
                'args': args,
                'eval_reward': eval_reward,
                'eval_invs_pred_var': eval_invs_pred_var
            }, results_fp)
        print('Saved results to', results_fp)
def __init__(self, cfg):
    self.work_dir = os.getcwd()
    print(f'workspace: {self.work_dir}')

    self.cfg = cfg

    self.logger = Logger(self.work_dir,
                         save_tb=cfg.log_save_tb,
                         log_frequency=cfg.log_frequency_step,
                         agent=cfg.agent.name,
                         action_repeat=cfg.action_repeat)

    utils.set_seed_everywhere(cfg.seed)
    self.device = torch.device(cfg.device)
    gibson_config_filename = os.path.join(
        os.path.dirname(gibson2.__file__),
        '../examples/configs/hand_drawer.yaml')
    self.env = HandDrawerEnv(config_file=gibson_config_filename,
                             mode='headless')
    self.env = utils.FrameStack(self.env, k=cfg.frame_stack)

    cfg.agent.params.obs_shape = self.env.observation_space.shape
    cfg.agent.params.action_shape = self.env.action_space.shape
    cfg.agent.params.action_range = [
        float(self.env.action_space.low.min()),
        float(self.env.action_space.high.max())
    ]
    self.agent = hydra.utils.instantiate(cfg.agent)

    self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                      self.env.action_space.shape,
                                      cfg.replay_buffer_capacity,
                                      self.cfg.image_pad, self.device)

    self.video_recorder = VideoRecorder(
        self.work_dir if cfg.save_video else None)
    self.step = 0
def __init__(self, cfg):
    self.work_dir = os.getcwd()
    print(f'workspace: {self.work_dir}')

    self.cfg = cfg

    self.logger = Logger(self.work_dir,
                         save_tb=cfg.log_save_tb,
                         log_frequency=cfg.log_frequency,
                         agent=cfg.agent.name)

    setSeedEverywhere(cfg.seed)
    self.device = torch.device(cfg.device)
    # self.env = utils.makeEnv(cfg)
    self.env = hydra.utils.call(cfg.env)

    cfg.agent.obs_dim = self.env.observation_space.shape[0]
    cfg.agent.action_dim = self.env.action_space.shape[0]
    cfg.agent.action_range = [
        float(self.env.action_space.low.min()),
        float(self.env.action_space.high.max())
    ]
    cfg.agent.n_step = cfg.replay_buffer.n_step  # n-step experience replay
    self.agent = hydra.utils.instantiate(cfg.agent, _recursive_=False)

    self.replay_buffer = ReplayBuffer(
        capacity=cfg.replay_buffer.capacity,
        obs_shape=self.env.observation_space.shape,
        action_shape=self.env.action_space.shape,
        obs_dtype=self.env.observation_space.dtype,
        action_dtype=self.env.action_space.dtype,
        n_step=cfg.replay_buffer.n_step,  # n-step experience replay
        discount=cfg.agent.discount,  # per-step discount
        device=self.device)

    self.video_recorder = VideoRecorder(
        self.work_dir if cfg.save_video else None)
    self.step = 0
def __init__(self, cfg):
    self.work_dir = os.getcwd()
    print(f'workspace: {self.work_dir}')

    self.cfg = cfg

    self.logger = Logger(self.work_dir,
                         save_tb=cfg.log_save_tb,
                         log_frequency=cfg.log_frequency_step,
                         agent=cfg.agent.name)

    utils.set_seed_everywhere(cfg.seed)
    self.device = torch.device(cfg.device)
    self.env = dmc.make_meta(cfg.env, cfg.episode_length, cfg.seed)
    self.eval_env = dmc.make_meta(cfg.env, cfg.episode_length, cfg.seed + 1)

    obs_spec = self.env.observation_spec()['features']
    action_spec = self.env.action_spec()

    cfg.agent.params.obs_shape = obs_spec.shape
    cfg.agent.params.action_shape = action_spec.shape
    cfg.agent.params.action_range = [
        float(action_spec.minimum.min()),
        float(action_spec.maximum.max())
    ]
    self.agent = hydra.utils.instantiate(cfg.agent)

    self.replay_buffer = MetaReplayBuffer(cfg.train_tasks, obs_spec.shape,
                                          action_spec.shape,
                                          cfg.replay_buffer_capacity,
                                          self.device)

    self.eval_video_recorder = VideoRecorder(
        self.work_dir if cfg.save_video else None)
    self.step = 0
def __init__(self, cfg):
    self.work_dir = '/media/trevor/mariadb/thesis/'
    print(f'workspace: {self.work_dir}')

    self.cfg = cfg

    self.logger = Logger(self.work_dir,
                         save_tb=cfg.log_save_tb,
                         log_frequency=cfg.log_frequency_step,
                         agent=cfg.agent.name,
                         action_repeat=cfg.action_repeat)

    utils.set_seed_everywhere(cfg.seed)
    self.device = torch.device(cfg.device)
    self.env = make_env(cfg)

    cfg.agent.params.obs_shape = self.env.observation_space.shape
    cfg.agent.params.action_shape = self.env.action_space.shape
    cfg.agent.params.action_range = [
        float(self.env.action_space.low.min()),
        float(self.env.action_space.high.max())
    ]
    self.agent = hydra.utils.instantiate(cfg.agent)

    self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                      self.env.action_space.shape,
                                      cfg.replay_buffer_capacity,
                                      self.cfg.image_pad, self.device,
                                      self.cfg.env)

    # obs_shape = (3 * 3, 84, 84)
    # pre_aug_obs_shape = (3 * 3, 100, 100)
    #
    # self.replay_buffer = ReplayBuffer(
    #     obs_shape=pre_aug_obs_shape,
    #     action_shape=self.env.action_space.shape,
    #     capacity=cfg.replay_buffer_capacity,
    #     batch_size=cfg.batch_size,
    #     device=self.device,
    #     image_size=84,
    #     pre_image_size=100,
    # )

    self.video_recorder = VideoRecorder(
        self.work_dir if cfg.save_video else None)
    self.step = 0
def eval(self):
    average_episode_reward = 0
    average_episode_success = 0
    video_recorder = VideoRecorder()
    video_recorder.init()
    for episode in range(self.num_eval_episodes):
        obs_dict = self.env.reset()
        obs = obs_dict[self.observation_key]
        obs_g = obs_dict[self.desired_goal_key]
        done = False
        episode_reward = 0
        episode_step = 0
        while not done:
            action = self.agent.act(obs, obs_g, sample=True)
            next_obs_dict, reward, done, info = self.env.step(action)
            done = float(done)
            episode_reward += reward
            achieved_goal = next_obs_dict[self.achieved_goal_key]
            obs = next_obs_dict[self.observation_key]
            obs_g = next_obs_dict[self.desired_goal_key]
            episode_step += 1
            video_recorder.record(next_obs_dict)
        average_episode_reward += episode_reward / self.num_eval_episodes
        average_episode_success += float(
            info['is_success']) / self.num_eval_episodes
    video_recorder.save(f'{self.step}.mp4')
    tune.report(
        eval_reward=average_episode_reward,
        eval_is_success=average_episode_success,
        timesteps_this_iter=0,
    )
def main(args):
    # Initialize environment
    env = init_env(args)
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    video = VideoRecorder(video_dir if args.save_video else None,
                          height=448,
                          width=448)

    # Prepare agent
    assert torch.cuda.is_available(), 'must have cuda enabled'
    cropped_obs_shape = (3 * args.frame_stack, 84, 84)
    agent = make_agent(obs_shape=cropped_obs_shape,
                       action_shape=env.action_space.shape,
                       args=args)
    agent.load(model_dir, args.load_checkpoint)

    # Evaluate agent without PAD
    print(
        f'Evaluating {args.work_dir} for {args.pad_num_episodes} episodes (mode: {args.mode})'
    )
    eval_reward = evaluate(env, agent, args, video)
    print('eval reward:', int(eval_reward))

    # Evaluate agent with PAD (if applicable)
    pad_reward = None
    if args.use_inv or args.use_curl or args.use_rot:
        env = init_env(args)
        print(
            f'Policy Adaptation during Deployment of {args.work_dir} for {args.pad_num_episodes} episodes '
            f'(mode: {args.mode})')
        pad_reward = evaluate(env, agent, args, video, adapt=True)
        print('pad reward:', int(pad_reward))

    # Save results
    results_fp = os.path.join(args.work_dir, f'{args.mode}_pad.pt')
    torch.save(
        {
            'args': args,
            'eval_reward': eval_reward,
            'pad_reward': pad_reward
        }, results_fp)
    print('Saved results to', results_fp)
def main(args):
    # Initialize environment
    utils.set_seed_everywhere(args.seed)
    env = make_pad_env(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed,
                       episode_length=args.episode_length,
                       action_repeat=args.action_repeat,
                       mode=args.mode)

    utils.make_dir(args.work_dir)
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    video = VideoRecorder(video_dir if args.save_video else None)

    # Prepare agent
    assert torch.cuda.is_available(), 'must have cuda enabled'
    replay_buffer = utils.ReplayBuffer(obs_shape=env.observation_space.shape,
                                       action_shape=env.action_space.shape,
                                       capacity=args.train_steps,
                                       batch_size=args.batch_size)
    cropped_obs_shape = (3 * args.frame_stack, 84, 84)
    agent = make_agent(obs_shape=cropped_obs_shape,
                       action_shape=env.action_space.shape,
                       args=args)

    L = Logger(args.work_dir, use_tb=False)
    episode, episode_reward, done = 0, 0, True
    start_time = time.time()
    for step in range(args.train_steps + 1):
        if done:
            if step > 0:
                L.log('train/duration', time.time() - start_time, step)
                start_time = time.time()
                L.dump(step)

            # Evaluate agent periodically
            if step % args.eval_freq == 0:
                print('Evaluating:', args.work_dir)
                L.log('eval/episode', episode, step)
                evaluate(env, agent, video, args.eval_episodes, L, step)

            # Save agent periodically
            if step % args.save_freq == 0 and step > 0:
                if args.save_model:
                    agent.save(model_dir, step)

            L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1

            L.log('train/episode', episode, step)

        # Sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        # Run training update
        if step >= args.init_steps:
            num_updates = args.init_steps if step == args.init_steps else 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        # Take step
        next_obs, reward, done, _ = env.step(action)
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        replay_buffer.add(obs, action, reward, next_obs, done_bool)
        episode_reward += reward
        obs = next_obs
        episode_step += 1
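# The `with utils.eval_mode(agent):` blocks used throughout these loops rely on
# a small context manager that flips the agent's modules into eval mode and
# restores the previous training flags on exit. A minimal sketch, assuming the
# agent exposes train(bool) and a `training` attribute like an nn.Module:
class eval_mode(object):
    def __init__(self, *models):
        self.models = models

    def __enter__(self):
        self.prev_states = []
        for model in self.models:
            # remember the current training flag, then switch to eval
            self.prev_states.append(model.training)
            model.train(False)

    def __exit__(self, *args):
        # restore each model's original training flag
        for model, state in zip(self.models, self.prev_states):
            model.train(state)
        return False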
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd().split('runs')[0] + 'runs/'
        self.work_dir = self.work_dir + \
            '2020.10.21/jaco_reach_site_features_drq_agent.cls=agents.drq_agent.DRQAgent,agent.name=drq,batch_size=64,lr=0.005/seed=0/'
        self.model_dir = self.work_dir + '/agent_model'
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        # Use a separate eval dir to avoid overwriting training files
        self.log_eval_dir = self.work_dir + '/eval_standalone'
        if not os.path.exists(self.log_eval_dir):
            os.makedirs(self.log_eval_dir)

        self.logger = Logger(self.log_eval_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat,
                             overwrite=True)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)

        # Environment sampler
        self.num_train_envs = cfg.num_envs
        self.env_sampler = utils.EnvSampler(cfg, False, False,
                                            work_dir=self.work_dir)
        experiment_identifier = self.work_dir.split('runs')[1]
        self.eval_envs = self.env_sampler.sample_eval_envs(
            experiment_identifier)
        env_sample_key = list(self.eval_envs.keys())[0]
        sample_env = self.eval_envs[env_sample_key]

        cfg.agent.params.obs_shape = sample_env.observation_space.shape
        cfg.agent.params.action_shape = sample_env.action_space.shape
        cfg.agent.params.action_range = [
            float(sample_env.action_space.low.min()),
            float(sample_env.action_space.high.max())
        ]
        if cfg.lowobs_append:
            if cfg.env == 'jaco_reach_site_features':
                cfg.agent.params.lstate_shape = 49
            else:
                cfg.agent.params.lstate_shape = 9
        else:
            cfg.agent.params.lstate_shape = 0
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.render_train_samples = True
        if self.render_train_samples:
            if cfg.env.startswith('jaco'):
                height = 256
                width = 256
            else:
                height = width = 500
            from PIL import Image
            for env_idx, env in self.eval_envs.items():
                name = 'StandAloneEval_Unseen_Environment_' + str(env_idx) + '.png'
                img_path = self.work_dir + name
                env.reset()
                obs = env.render(mode='rgb_array', height=height, width=width)
                im = Image.fromarray(obs)
                im.save(img_path)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None,
            phase='eval_standalone')
        self.reload_weights = cfg.reload_weights
        self.train_vid_interval = cfg.train_vid_interval
        self.eval_trials = 100
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.eval_trials):
            print('Episode Trial', episode)
            self.video_recorder.init(enabled=True)
            eval_env = self.eval_envs[random.sample(list(self.eval_envs), 1)[0]]
            obs = eval_env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            # note: done is discarded by eval_env.step below, so episodes
            # always run to the step limit and the `if done: break` never fires
            while episode_step <= eval_env._max_episode_steps - 1:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, _, _ = eval_env.step(action)
                self.video_recorder.record(eval_env)
                episode_reward += reward
                episode_step += 1
                self.step += 1
                if done:
                    break
            average_episode_reward += episode_reward
            print('Episode Reward', episode_reward)
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.eval_trials
        self.logger.log('eval_standalone/episode_reward',
                        average_episode_reward, self.step)
        self.logger.dump(self.step, ty='eval_standalone')

    def run(self):
        if os.path.exists(self.model_dir):
            latest_step = utils.get_latest_file(self.model_dir)
            self.agent.load(self.model_dir, latest_step)
        else:
            raise ValueError('Could not reload weights!')
        self.evaluate()
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(
            self.work_dir + "_" + self.cfg.env +
            "_eval2k_effective_{}_seed_{}".format(self.cfg.effective_aug,
                                                  self.cfg.seed),
            save_tb=cfg.log_save_tb,
            log_frequency=cfg.log_frequency_step,
            agent=cfg.agent.name,
            action_repeat=cfg.action_repeat)

        self.effective_aug = self.cfg.effective_aug
        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          cfg.replay_buffer_capacity,
                                          self.cfg.image_pad, self.device,
                                          self.effective_aug)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1
            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, episode_step, done = 0, 0, 1, True
        start_time = time.time()
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                for _ in range(self.cfg.num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    pre_transform_image_size = args.pre_transform_image_size if 'crop' in args.data_augs else args.image_size
    pre_image_size = args.pre_transform_image_size  # record the pre-transform image size for translation

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed,
                       visualize_reward=False,
                       from_pixels=(args.encoder_type == 'pixel'),
                       height=pre_transform_image_size,
                       width=pre_transform_image_size,
                       frame_skip=args.action_repeat)
    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    exp_name = env_name + '-' + ts + '-im' + str(args.image_size) + '-b' \
        + str(args.batch_size) + '-s' + str(args.seed) + '-' + args.encoder_type
    args.work_dir = args.work_dir + '/' + exp_name
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (3 * args.frame_stack, pre_transform_image_size,
                             pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
        pre_image_size=pre_image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    for step in range(args.num_train_steps):
        # evaluate agent periodically
        if step % args.eval_freq == 0:
            L.log('eval/episode', episode, step)
            evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
            if args.save_model:
                agent.save_curl(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs / 255.)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
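# Note on the `done_bool` idiom used above (and in several other loops here):
# when an episode ends only because env._max_episode_steps was reached, the
# stored done flag is forced to 0 so the critic still bootstraps from the next
# state ("infinite bootstrap"); a genuine terminal transition keeps done = 1.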
def main(args):
    # Set seed
    utils.set_seed_everywhere(args.seed)

    # Initialize environments
    gym.logger.set_level(40)
    env = make_env(domain_name=args.domain_name,
                   task_name=args.task_name,
                   seed=args.seed + 42,
                   episode_length=args.episode_length,
                   action_repeat=args.action_repeat,
                   mode=args.eval_mode)

    # Set working directory
    work_dir = os.path.join(args.log_dir,
                            args.domain_name + '_' + args.task_name,
                            args.algorithm, str(args.seed))
    print('Working directory:', work_dir)
    assert os.path.exists(work_dir), 'specified working directory does not exist'
    model_dir = utils.make_dir(os.path.join(work_dir, 'model'))
    video_dir = utils.make_dir(os.path.join(work_dir, 'video'))
    video = VideoRecorder(video_dir if args.save_video else None,
                          height=448,
                          width=448)

    # Check if evaluation has already been run
    results_fp = os.path.join(work_dir, args.eval_mode + '.pt')
    assert not os.path.exists(results_fp), \
        f'{args.eval_mode} results already exist for {work_dir}'

    # Prepare agent
    assert torch.cuda.is_available(), 'must have cuda enabled'
    cropped_obs_shape = (3 * args.frame_stack, 84, 84)
    agent = make_agent(obs_shape=cropped_obs_shape,
                       action_shape=env.action_space.shape,
                       args=args)
    agent = torch.load(os.path.join(model_dir, str(args.train_steps) + '.pt'))
    agent.train(False)

    print(
        f'\nEvaluating {work_dir} for {args.eval_episodes} episodes (mode: {args.eval_mode})'
    )
    reward = evaluate(env, agent, video, args.eval_episodes, args.eval_mode)
    print('Reward:', int(reward))

    adapt_reward = None
    if args.algorithm == 'pad':
        env = make_env(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed + 42,
                       episode_length=args.episode_length,
                       action_repeat=args.action_repeat,
                       mode=args.eval_mode)
        adapt_reward = evaluate(env, agent, video, args.eval_episodes,
                                args.eval_mode, adapt=True)
        print('Adapt reward:', int(adapt_reward))

    # Save results
    torch.save({
        'args': args,
        'reward': reward,
        'adapt_reward': adapt_reward
    }, results_fp)
    print('Saved results to', results_fp)
class Workspace(object):
    def __init__(self,
                 log_save_tb=True,
                 log_frequency_step=10000,
                 agent_name='drq',
                 # device='cuda',
                 device='cpu',
                 env='cartpole_swingup',
                 seed=1,
                 image_size=84,
                 action_repeat=8,
                 frame_stack=3,
                 replay_buffer_capacity=100000,
                 image_pad=4,
                 save_video=True):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.logger = Logger(self.work_dir,
                             save_tb=log_save_tb,
                             log_frequency=log_frequency_step,
                             agent=agent_name,
                             action_repeat=action_repeat)

        utils.set_seed_everywhere(seed)
        self.device = torch.device(device)
        self.env = make_env(env, seed, image_size, action_repeat, frame_stack)

        self.agent = DRQAgent(
            obs_shape=self.env.observation_space.shape,
            action_shape=self.env.action_space.shape,
            action_range=(float(self.env.action_space.low.min()),
                          float(self.env.action_space.high.max())),
            device=self.device)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          replay_buffer_capacity, image_pad,
                                          self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if save_video else None)
        self.step = 0

    def evaluate(self, num_eval_episodes=10):
        average_episode_reward = 0
        for episode in range(num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1
            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self,
            num_train_steps=1000000,
            num_train_iters=1,
            num_seed_steps=1000,
            eval_frequency=5000):
        episode, episode_reward, episode_step, done = 0, 0, 1, True
        start_time = time.time()
        while self.step < num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(self.step,
                                     save=(self.step > num_seed_steps))

                # evaluate agent periodically
                if self.step % eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= num_seed_steps:
                for _ in range(num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
def main():
    args = parse_args()
    utils.set_seed_everywhere(args.seed)

    # Robot stuff
    action_space = ActionSpace.DELTA_EE_POSE_IMPEDANCE
    blocking_action = True
    env = RobotEnv(name='peg_in_hole',
                   simulation=True,
                   action_space=action_space,
                   isotropic_gains=True,
                   render=False,
                   blocking_action=blocking_action,
                   rotation_axis=(0, 0, 1),
                   observation_type=dict(camera=1, q=0, dq=0, tau=0, x=0, dx=0))

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # the dmc2gym wrapper standardizes actions
    # assert env.action_space.low.min() >= -1
    # assert env.action_space.high.max() <= 1

    replay_buffer = utils.ReplayBuffer(
        obs_shape=env.observation_space['camera'],
        action_shape=env.action_space.shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device)

    agent = make_agent(obs_shape=env.observation_space['camera'],
                       action_shape=env.action_space.shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, prev_episode_reward, done = 0, 0, 0, True
    start_time = time.time()
    for step in range(args.num_train_steps):
        if done:
            if step > 0:
                L.log('train/duration', time.time() - start_time, step)
                start_time = time.time()
                L.dump(step)

            # evaluate agent periodically
            if step % args.eval_freq == 0 and step > 0:
                L.log('eval/episode', episode, step)
                evaluate(env, agent, video, args.num_eval_episodes, L, step)
                if args.save_model:
                    agent.save(model_dir, step)
                if args.save_buffer:
                    replay_buffer.save(buffer_dir)

            L.log('train/episode_reward', episode_reward, step)

            env.step(np.array([0, 0, 0.1, 0, 0, 0]))  # Prevent getting stuck
            obs = env.reset()
            done, episode_reward, episode_step = False, 0, 0
            episode += 1

            L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)
            temp = action
            print("Temp action: {}".format(temp))
            action = np.multiply(action, env.action_space.high)

        # run training update
        if step >= args.init_steps:
            num_updates = args.init_steps if step == args.init_steps else 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)
        print("E: {} | S: {} | R: {:.4f} | ER: {:.4f} | A: {}".format(
            episode, step, round(reward, 4), round(episode_reward, 4), action))

        # Reset environment if the agent gets stuck, i.e. no increase in
        # reward for 100 steps
        if step % 100 == 0 and step > 0:
            # if the change in reward is negligible after 100 steps, restart
            if np.abs(prev_episode_reward - episode_reward) < 1e-5:
                env.step(np.array([0, 0, 0.1, 0, 0, 0]))
                obs = env.reset()
            prev_episode_reward = episode_reward

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    gibson_config_filename = os.path.join(
        os.path.dirname(gibson2.__file__),
        '../examples/configs/hand_drawer.yaml')
    env = HandDrawerEnv(config_file=gibson_config_filename, mode='headless')
    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    exp_name = env_name + '-' + ts + '-im' + str(args.image_size) + '-b' \
        + str(args.batch_size) + '-s' + str(args.seed) + '-' + args.encoder_type
    args.work_dir = args.work_dir + '/' + exp_name
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (2 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (2 * args.frame_stack,
                             args.pre_transform_image_size,
                             args.pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    # `step` was undefined at this point in the original; evaluate at step 0,
    # since no training loop precedes this standalone evaluation
    evaluate(env, agent, video, args.num_eval_episodes, L, 0, args)
def main():
    args = parse_args()
    utils.set_seed_everywhere(args.seed)

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed,
                       visualize_reward=False,
                       from_pixels=(args.encoder_type == 'pixel'),
                       height=args.image_size,
                       width=args.image_size,
                       frame_skip=args.action_repeat)
    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    args.work_dir = os.path.join(
        args.work_dir,
        f'{args.domain_name}-{args.task_name}-seed{args.seed}-{datetime.now().strftime("%Y%m%d-%H%M")}'
    )
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Using device: ", device)

    # the dmc2gym wrapper standardizes actions
    assert env.action_space.low.min() >= -1
    assert env.action_space.high.max() <= 1

    replay_buffer = utils.ReplayBuffer(obs_shape=env.observation_space.shape,
                                       action_shape=env.action_space.shape,
                                       capacity=args.replay_buffer_capacity,
                                       batch_size=args.batch_size,
                                       device=device)

    agent = make_agent(obs_shape=env.observation_space.shape,
                       action_shape=env.action_space.shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()
    for step in range(args.num_train_steps):
        if done:
            if step > 0:
                L.log('train/duration', time.time() - start_time, step)
                start_time = time.time()
                L.dump(step)

            # evaluate agent periodically
            if step % args.eval_freq == 0:
                L.log('eval/episode', episode, step)
                evaluate(env, agent, video, args.num_eval_episodes, L, step)
                if args.save_model:
                    agent.save(model_dir, step)
                if args.save_buffer:
                    replay_buffer.save(buffer_dir)

            L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1

            L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        # run training update
        if step >= args.init_steps:
            num_updates = args.init_steps if step == args.init_steps else 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
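# utils.FrameStack(env, k=...) appears throughout these training scripts. A
# minimal sketch of such a wrapper, assuming channel-first image observations;
# the repos' own versions may differ in dtype and bounds handling:
from collections import deque

import gym
import numpy as np


class FrameStack(gym.Wrapper):
    def __init__(self, env, k):
        gym.Wrapper.__init__(self, env)
        self._k = k
        self._frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        # k stacked frames are concatenated along the channel axis
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=((shp[0] * k,) + shp[1:]),
            dtype=env.observation_space.dtype)
        self._max_episode_steps = env._max_episode_steps

    def reset(self):
        obs = self.env.reset()
        # fill the buffer with the first frame so the stack is full from step 0
        for _ in range(self._k):
            self._frames.append(obs)
        return self._get_obs()

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self._frames.append(obs)
        return self._get_obs(), reward, done, info

    def _get_obs(self):
        assert len(self._frames) == self._k
        return np.concatenate(list(self._frames), axis=0)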
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed)
    env.seed(args.seed)

    method = args.agent + " (H=" + str(args.k_step) + ")"
    model_kind = "dynode_model" if args.agent == "DyNODE-SAC" else "nn_model"

    # make directory
    env_name = args.domain_name + '-' + args.task_name
    args.work_dir = args.work_dir + '/' + env_name + '/' + method + '/' + str(args.seed)
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w+') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape
    obs_shape = env.observation_space.shape

    replay_buffer = utils.ReplayBuffer(obs_shape=obs_shape,
                                       action_shape=action_shape,
                                       capacity=args.replay_buffer_capacity,
                                       batch_size=args.batch_size,
                                       device=device)

    agent = DyNODESacAgent(obs_shape=obs_shape,
                           action_shape=action_shape,
                           device=device,
                           model_kind=model_kind,
                           kind=args.kind,
                           step_MVE=args.k_step,
                           hidden_dim=args.hidden_dim,
                           discount=args.discount,
                           init_temperature=args.init_temperature,
                           alpha_lr=args.alpha_lr,
                           alpha_beta=args.alpha_beta,
                           actor_lr=args.actor_lr,
                           actor_beta=args.actor_beta,
                           actor_log_std_min=args.actor_log_std_min,
                           actor_log_std_max=args.actor_log_std_max,
                           critic_lr=args.critic_lr,
                           critic_beta=args.critic_beta,
                           critic_tau=args.critic_tau,
                           critic_target_update_freq=args.critic_target_update_freq,
                           model_lr=args.model_lr,
                           log_interval=args.log_interval)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()
    for step in range(args.num_train_steps):
        # evaluate agent periodically
        if step % args.eval_freq == 0:
            L.log('eval/episode', episode, step)
            evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
            if args.save_model:
                agent.save_model(model_dir, step)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        if step >= args.model_warm_up:
            for _ in range(args.model_num_updates):
                agent.update_model(replay_buffer, L, step)

        # run training update
        if step >= args.init_steps:
            for _ in range(2):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool, done)

        obs = next_obs
        episode_step += 1
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = utils.make_env(cfg)

        self.obs_shape = self.env.observation_space['observation'].shape
        self.goal_shape = self.env.observation_space['desired_goal'].shape
        cfg.agent.params.obs_dim = self.obs_shape[0]
        cfg.agent.params.goal_dim = self.goal_shape[0]
        cfg.agent.params.action_dim = self.env.action_space.shape[0]
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.obs_shape, self.goal_shape,
                                          self.env.action_space.shape,
                                          int(cfg.replay_buffer_capacity),
                                          self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.agent.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs['observation'],
                                            obs['desired_goal'],
                                            sample=False)
                obs, reward, done, _ = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
            self.video_recorder.save(f'{self.step}.mp4')
        self.logger.log('eval/episode_reward', episode_reward, self.step)
        self.logger.dump(self.step)

    def run_her(self, path_buffer):
        # first_obs = path_buffer[0][0]
        # last_obs = path_buffer[-1][0]
        # first_goal = first_obs['achieved_goal']
        # last_goal = last_obs['achieved_goal']
        # goal_changed = np.mean(last_goal - first_goal)**2 > 1e-6
        # if goal_changed:
        for n, ts in enumerate(path_buffer):
            # select goal id
            if self.cfg.her_strat == 'future':
                i = np.random.randint(n, len(path_buffer))
            elif self.cfg.her_strat == 'last':
                i = -1
            new_goal_obs = path_buffer[i][3]
            new_goal = new_goal_obs['achieved_goal']
            # relabel
            obs, action, reward, next_obs, done, done_no_max = ts
            obs['desired_goal'] = new_goal
            next_obs['desired_goal'] = new_goal
            reward = self.env.compute_reward(next_obs['achieved_goal'],
                                             new_goal, None)
            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

    def run(self):
        episode, episode_reward, done = 0, 0, True
        start_time = time.time()
        path_buffer = []
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step > 0 and self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    if self.cfg.save_model:
                        self.agent.save()
                        self.agent.load()
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                self.agent.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

                # her
                if self.cfg.her_iters > 0 and len(path_buffer):
                    for k in range(self.cfg.her_iters):
                        self.run_her(path_buffer)
                    path_buffer = []

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs['observation'],
                                            obs['desired_goal'],
                                            sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                self.agent.update(self.replay_buffer, self.logger, self.step)

            next_obs, reward, done, _ = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)
            path_buffer.append(
                [obs, action, reward, next_obs, done, done_no_max])

            obs = next_obs
            episode_step += 1
            self.step += 1
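# Note: run_her above implements hindsight experience replay (HER) relabeling.
# For each stored transition it picks a substitute goal (a future achieved_goal
# or the last one of the same trajectory, per cfg.her_strat), rewrites the
# desired_goal fields, recomputes the reward with env.compute_reward, and
# re-adds the relabeled transition to the replay buffer.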
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    exp_id = str(int(np.random.random() * 100000))
    utils.set_seed_everywhere(args.seed)

    env = env_wrapper.make(
        domain_name=args.domain_name,
        task_name=args.task_name,
        seed=args.seed,
        visualize_reward=False,
        from_pixels=(args.observation_type == 'pixel'
                     or args.observation_type == 'hybrid'),
        cameras=args.cameras,
        height=args.pre_transform_image_size,
        width=args.pre_transform_image_size,
        frame_skip=args.action_repeat,
        reward_type=args.reward_type,
        change_model=args.change_model)
    env.seed(args.seed)
    if args.special_reset is not None:
        env.set_special_reset(args.special_reset)
    if args.demo_special_reset is not None:
        env.set_special_reset(args.demo_special_reset)
    if args.observation_type == 'hybrid':
        env.set_hybrid_obs()

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directories
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    if args.task_name is None:
        env_name = args.domain_name
    else:
        env_name = args.domain_name + '-' + args.task_name
    exp_name = args.reward_type + '-' + args.agent + '-' + args.encoder_type \
        + '-' + args.data_augs
    exp_name += '-' + ts + '-' + env_name + '-im' + str(
        args.image_size) + '-b' + str(args.batch_size) + '-nu' + str(
            args.num_updates)
    if args.observation_type == 'hybrid':
        exp_name += '-hybrid'
    if args.change_model:
        exp_name += '-change_model'
    if args.bc_only:
        exp_name += '-bc_only'
    exp_name += '-s' + str(args.seed)
    exp_name += '-id' + exp_id
    args.work_dir = args.work_dir + '/' + exp_name
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))
    print("Working in directory:", args.work_dir)

    video = VideoRecorder(video_dir if args.save_video else None,
                          camera_id=args.cameras[0])

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape
    if args.encoder_type == 'pixel':
        cpf = 3 * len(args.cameras)  # channels per frame
        obs_shape = (cpf * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (cpf * args.frame_stack,
                             args.pre_transform_image_size,
                             args.pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
        hybrid_state_shape=env.hybrid_state_shape,
        load_dir=args.replay_buffer_load_dir)

    if args.demo_model_dir is not None:
        # collect demonstrations using a state-trained expert
        episode_step, done = 0, True
        state_obs, obs = None, None
        episode_success = False
        original_encoder_type = args.encoder_type
        args.encoder_type = 'identity'
        if isinstance(env, utils.FrameStack):
            original_env = env.env
        else:
            original_env = env
        expert_agent = make_agent(
            obs_shape=original_env.observation_space.shape,
            action_shape=action_shape,
            args=args,
            device=device,
            hybrid_state_shape=env.hybrid_state_shape)
        args.encoder_type = original_encoder_type
        expert_agent.load(args.demo_model_dir, args.demo_model_step)
        print('Collecting expert trajectories...')
        t = 0
        while t < args.demo_samples:
            if done:
                episode_step = 0
                episode_success = False
                if args.demo_special_reset is not None:
                    env.reset(save_special_steps=True)
                    special_steps_dict = env.special_reset_save
                    obs_list = special_steps_dict['obs']
                    act_list = special_steps_dict['act']
                    reward_list = special_steps_dict['reward']
                    for i in range(len(act_list)):
                        replay_buffer.add(obs_list[i], act_list[i],
                                          reward_list[i], obs_list[i + 1],
                                          False)
                    episode_step += len(act_list)
                    t += len(act_list)
                    obs = obs_list[-1]
                    state_obs = original_env._get_state_obs()
                else:
                    obs = env.reset()
                    state_obs = original_env._get_state_obs()
            action = expert_agent.sample_action(state_obs)
            next_obs, reward, done, info = env.step(action)
            if info.get('is_success'):
                episode_success = True
            state_obs = original_env._get_state_obs()

            # allow infinite bootstrap
            done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
                done)
            replay_buffer.add(obs, action, reward, next_obs, done_bool)

            obs = next_obs
            episode_step += 1
            t += 1
            if args.success_demo_only and done and not episode_success:
                # reject failed episodes: rewind both the sample counter
                # and the buffer write pointer
                t -= episode_step
                replay_buffer.idx -= episode_step
        env.set_special_reset(args.special_reset)
        print('Starting with replay buffer filled to {}.'.format(
            replay_buffer.idx))
        # args.init_steps = max(0, args.init_steps - args.replay_buffer_load_pi_t)  # maybe tune this

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device,
                       hybrid_state_shape=env.hybrid_state_shape)
    if args.model_dir is not None:
        agent.load(args.model_dir, args.model_step)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    def eval_and_save():
        if args.save_model:
            agent.save_curl(model_dir, step)
        if args.save_buffer:
            replay_buffer.save(buffer_dir)
        if args.save_sac:
            agent.save(model_dir, step)
        L.log('eval/episode', episode, step)
        print('evaluating')
        evaluate(env, agent, video, args.num_eval_episodes, L, step, args)

    if args.warmup_cpc:
        print("Warming up cpc for " + str(args.warmup_cpc) + ' steps.')
        for i in range(args.warmup_cpc):
            agent.update_cpc_only(replay_buffer, L, step=0,
                                  ema=args.warmup_cpc_ema)
        print('Warmed up cpc.')
    if args.warmup_offline_sac:
        for i in range(args.warmup_offline_sac):
            agent.update_sac_only(replay_buffer, L, step=0)
    if args.bc_only:
        step = 0
        for i in range(100):
            agent.train_bc(replay_buffer)
            step += 1
        eval_and_save()
        return

    time_computing = 0
    time_acting = 0
    callback_fn = None
    step = 0
    if args.synch_update:
        # the closure reads `step` and `is_eval` from this frame when invoked
        callback_fn = lambda: [
            agent.update(replay_buffer, L, step,
                         log_networks=(nu == 0 and
                                       step % args.log_networks_freq == 0))
            for nu in range(args.num_updates)
        ] if step >= args.init_steps and not is_eval else 0
    # pointers should all work properly, and execute in the proper frame
    if callback_fn is not None:
        env.env._env.env.set_callback(
            callback_fn)  # envwrapper (camera), framestack, timelimit

    while step < args.num_train_steps:
        # evaluate agent periodically
        if step % args.eval_freq == 0:
            is_eval = True
            eval_and_save()
            is_eval = False

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            time_tmp = time.time()
            obs = env.reset()
            time_acting += time.time() - time_tmp

            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        if step == args.init_steps and args.demo_samples == 0:
            if args.warmup_cpc:
                print("Warming up cpc for " + str(args.warmup_cpc) +
                      ' steps.')
                for i in range(args.warmup_cpc):
                    agent.update_cpc_only(replay_buffer, L, step=0)
                print('Warmed up cpc.')

        # run training update
        time_tmp = time.time()
        if step >= args.init_steps and not args.synch_update:
            for nu in range(args.num_updates):
                agent.update(replay_buffer, L, step,
                             log_networks=(nu == 0 and
                                           step % args.log_networks_freq == 0))
        time_computing += time.time() - time_tmp

        time_tmp = time.time()
        next_obs, reward, done, _ = env.step(action)
        time_acting += time.time() - time_tmp

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
        step += 1

    step = args.num_train_steps
    print("time spent computing:", time_computing)
    print("time spent acting:", time_acting)
    eval_and_save()
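# The success-only demo filter above relies on the replay buffer being a flat
# ring buffer: rewinding `idx` by `episode_step` discards the rejected
# episode in place. A toy sketch of that rollback pattern follows; the field
# names are illustrative, not those of the real `utils.ReplayBuffer`, which
# is defined elsewhere in the repo.
import numpy as np


class RingBufferRollbackSketch:
    def __init__(self, capacity, obs_shape):
        self.obses = np.empty((capacity, *obs_shape), dtype=np.float32)
        self.capacity = capacity
        self.idx = 0
        self.full = False

    def add(self, obs):
        self.obses[self.idx] = obs
        self.idx = (self.idx + 1) % self.capacity
        self.full = self.full or self.idx == 0

    def rollback(self, n):
        # drop the last n transitions by rewinding the write pointer;
        # only safe while the buffer has not wrapped around, which holds
        # during the short demo-collection phase
        assert not self.full and self.idx >= n
        self.idx -= n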
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()

        # Hack to adjust action_repeat
        adjust_action_repeat_hack(cfg)
        print(f"CFG:\n{'-'*100}\n{cfg}\n{'-'*100}")
        self.cfg = cfg

        experiment_name = f"{cfg.full_title}_{cfg.run_id}"
        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             save_wb=cfg.log_save_wandb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat,
                             cfg=dict(flatten_cfg(cfg)),
                             plot_project="drqtest",
                             experiment=experiment_name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)
        print(f"ACTOR:\n{'-'*100}\n{self.agent.actor}\n{'-'*100}")
        print(f"CRITIC:\n{'-'*100}\n{self.agent.critic}\n{'-'*100}")

        self.replay_buffer = ReplayBuffer(
            self.env.observation_space.shape,
            self.env.action_space.shape,
            cfg.replay_buffer_capacity,
            self.cfg.image_pad,
            self.device,
            use_aug=cfg.replay_buffer_augmentation)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1
            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, episode_step, done = 0, 0, 1, True
        start_time = time.time()
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                for _ in range(self.cfg.num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
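# A sketch of the entry point this kind of Workspace is usually launched
# from, assuming the hydra 0.11-style `config_path` API that similar DrQ
# training scripts use; the 'config.yaml' name is an assumption, not shown
# in the snippet above.
import hydra


@hydra.main(config_path='config.yaml', strict=True)
def main(cfg):
    workspace = Workspace(cfg)
    workspace.run()


if __name__ == '__main__':
    main()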