Example #1
def make_env(cfg):
    """Helper function to create dm_control environment"""
    if cfg.env == 'ball_in_cup_catch':
        domain_name = 'ball_in_cup'
        task_name = 'catch'
    elif cfg.env == 'point_mass_easy':
        domain_name = 'point_mass'
        task_name = 'easy'
    else:
        domain_name = cfg.env.split('_')[0]
        task_name = '_'.join(cfg.env.split('_')[1:])

    # per dreamer: https://github.com/danijar/dreamer/blob/02f0210f5991c7710826ca7881f19c64a012290c/wrappers.py#L26
    camera_id = 2 if domain_name == 'quadruped' else 0

    env = dmc2gym.make(domain_name=domain_name,
                       task_name=task_name,
                       seed=cfg.seed,
                       visualize_reward=False,
                       from_pixels=True,
                       height=cfg.image_size,
                       width=cfg.image_size,
                       frame_skip=cfg.action_repeat,
                       camera_id=camera_id)

    env = utils.FrameStack(env, k=cfg.frame_stack)

    env.seed(cfg.seed)
    assert env.action_space.low.min() >= -1
    assert env.action_space.high.max() <= 1

    return env
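
Every example on this page also relies on utils.FrameStack, which is not shown here. A minimal sketch of what such a wrapper typically does, assuming channel-first pixel observations (this mirrors the common CURL/RAD-style implementation, not necessarily the exact utils module used above):

from collections import deque

import gym
import numpy as np


class FrameStack(gym.Wrapper):
    """Keep the last k observations and concatenate them along the channel axis."""

    def __init__(self, env, k):
        super().__init__(env)
        self._k = k
        self._frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=((shp[0] * k,) + shp[1:]),
            dtype=env.observation_space.dtype)

    def reset(self):
        obs = self.env.reset()
        for _ in range(self._k):
            self._frames.append(obs)
        return self._get_obs()

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self._frames.append(obs)
        return self._get_obs(), reward, done, info

    def _get_obs(self):
        assert len(self._frames) == self._k
        return np.concatenate(list(self._frames), axis=0)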
Example #2
def make_env(cfg):
    """Helper function to create dm_control environment"""
    if cfg.env == 'ball_in_cup_catch':
        domain_name = 'ball_in_cup'
        task_name = 'catch'
    elif cfg.env == 'point_mass_easy':
        domain_name = 'point_mass'
        task_name = 'easy'
    elif cfg.env == 'cartpole_two_poles':
        domain_name = 'cartpole'
        task_name = 'two_poles'
    elif cfg.env == 'cartpole_three_poles':
        domain_name = 'cartpole'
        task_name = 'three_poles'
    else:
        domain_name = cfg.env.split('_')[0]
        task_name = '_'.join(cfg.env.split('_')[1:])

    # per dreamer: https://github.com/danijar/dreamer/blob/02f0210f5991c7710826ca7881f19c64a012290c/wrappers.py#L26
    camera_id = 2 if domain_name == 'quadruped' else 0

    env = dmc2gym.make(domain_name=domain_name,
                       task_name=task_name,
                       seed=cfg.seed,
                       visualize_reward=False,
                       from_pixels=True,
                       height=cfg.image_size,
                       width=cfg.image_size,
                       frame_skip=cfg.action_repeat,
                       camera_id=camera_id)

    # env = dmc2gym_noisy.make(
    #     domain_name=domain_name,
    #     task_name=task_name,
    #     resource_files='../../../../../experiments/distractors/images/*.mp4',
    #     img_source='video',
    #     total_frames=10000,
    #     seed=cfg.seed,
    #     visualize_reward=False,
    #     from_pixels=True,
    #     height=84,
    #     width=84,
    #     frame_skip=cfg.action_repeat,
    #     camera_id=camera_id
    # )

    env = utils.FrameStack(env, k=cfg.frame_stack)

    env.seed(cfg.seed)
    assert env.action_space.low.min() >= -1
    assert env.action_space.high.max() <= 1

    return env
Example #3
def make_env(cfg):
    """Helper function to create dm_control environment"""
    if cfg.env == 'ball_in_cup_catch':
        domain_name = 'ball_in_cup'
        task_name = 'catch'
    elif cfg.env == 'point_mass_easy':
        domain_name = 'point_mass'
        task_name = 'easy'
    else:
        domain_name = cfg.env.split('_')[0]
        task_name = '_'.join(cfg.env.split('_')[1:])

    # per dreamer: https://github.com/danijar/dreamer/blob/02f0210f5991c7710826ca7881f19c64a012290c/wrappers.py#L26
    camera_id = 2 if domain_name == 'quadruped' else 0

#     env = dmc2gym.make(domain_name=domain_name,
#                        task_name=task_name,
#                        seed=cfg.seed,
#                        visualize_reward=False,
#                        from_pixels=True,
#                        height=cfg.image_size,
#                        width=cfg.image_size,
#                        frame_skip=cfg.action_repeat,
#                        camera_id=camera_id)
    # env = gym.make("CarRacing-v0")
    env_ = gym_tetris.make('TetrisA-v0')
    env = JoypadSpace(env_, SIMPLE_MOVEMENT)
    # env = MaxAndSkipEnv(env)
    # env._max_episode_steps = env_._max_episode_steps
    max_episode_steps = 10000
    env = WrapPyTorch(env, max_episode_steps)
    env.seed(cfg.seed)
    # print(env.ram)
    obs = env.reset()
    print(obs.shape)
    # env.seed(cfg.seed)

    env = utils.FrameStack(env, k=cfg.frame_stack)
    print("Init env done")
    # assert env.action_space.low.min() >= -1
    # assert env.action_space.high.max() <= 1

    return env
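
WrapPyTorch is not defined in this snippet; presumably it transposes the NES frames from HWC to CHW for a PyTorch encoder and records the step limit it is given. A sketch under that assumption (illustrative only, not the original class):

import gym
import numpy as np


class WrapPyTorch(gym.ObservationWrapper):
    """Assumed behavior: expose CHW uint8 frames and a _max_episode_steps attribute."""

    def __init__(self, env, max_episode_steps):
        super().__init__(env)
        self._max_episode_steps = max_episode_steps
        h, w, c = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(c, h, w), dtype=np.uint8)

    def observation(self, obs):
        return np.asarray(obs).transpose(2, 0, 1)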
Example #4
def make_env(cfg, eval=False):
    """Helper function to create dm_control environment"""
    if cfg.env == "ball_in_cup_catch":
        domain_name = "ball_in_cup"
        task_name = "catch"
    elif cfg.env == "point_mass_easy":
        domain_name = "point_mass"
        task_name = "easy"
    else:
        domain_name = cfg.env.split("_")[0]
        task_name = "_".join(cfg.env.split("_")[1:])

    # per dreamer: https://github.com/danijar/dreamer/blob/02f0210f5991c7710826ca7881f19c64a012290c/wrappers.py#L26
    camera_id = 2 if domain_name == "quadruped" else 0

    if eval:
        seed = cfg.seed + 1004
    else:
        seed = cfg.seed

    env = dmc2gym.make(
        domain_name=domain_name,
        task_name=task_name,
        seed=seed,
        visualize_reward=False,
        from_pixels=True,
        height=cfg.pre_image_size,
        width=cfg.pre_image_size,
        frame_skip=cfg.action_repeat,
        camera_id=camera_id,
    )

    env = utils.FrameStack(env, k=cfg.frame_stack)

    env.seed(seed)
    assert env.action_space.low.min() >= -1
    assert env.action_space.high.max() <= 1

    return env
Example #5
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)

        gibson_config_filename = os.path.join(
            os.path.dirname(gibson2.__file__),
            '../examples/configs/hand_drawer.yaml')
        self.env = HandDrawerEnv(config_file=gibson_config_filename,
                                 mode='headless')
        self.env = utils.FrameStack(self.env, k=cfg.frame_stack)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          cfg.replay_buffer_capacity,
                                          self.cfg.image_pad, self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0
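
utils.set_seed_everywhere is called in most of these examples. A minimal sketch of what such a helper usually contains (an assumption about the unshown utils module):

import random

import numpy as np
import torch


def set_seed_everywhere(seed):
    """Seed torch (CPU and CUDA), numpy and the stdlib RNG."""
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)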
Example #6
        loss = BCE + KLD
        return loss


env = dmc2gym.make(domain_name=args.domain_name,
                   task_name=args.task_name,
                   resource_files=args.resource_files,
                   img_source=args.img_source,
                   total_frames=10,
                   seed=args.seed,
                   visualize_reward=False,
                   from_pixels=(args.encoder_type == 'pixel'),
                   height=args.image_size,
                   width=args.image_size,
                   frame_skip=args.action_repeat)
env = utils.FrameStack(env, k=args.frame_stack)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vae = VAE(env.observation_space.shape).to(device)
train_dataset = torch.load('train_dataset.pt')
optimizer = torch.optim.Adam(vae.parameters(), lr=1e-3)
train_loader = torch.utils.data.DataLoader(train_dataset['obs'],
                                           batch_size=32,
                                           shuffle=True)

# training loop
for i in range(100):
    total_loss = []
    for obs_batch in train_loader:
        optimizer.zero_grad()
        loss = vae.train(obs_batch.to(device).float())
        loss.backward()
        optimizer.step()
        total_loss.append(loss.item())
    print('epoch {}: mean loss {:.4f}'.format(i, sum(total_loss) / len(total_loss)))
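
The loss = BCE + KLD fragment at the top of this example is the tail of a VAE loss function that is not shown. A hedged reconstruction of the standard objective it presumably computes (the names recon_x, mu and logvar are assumptions):

import torch
import torch.nn.functional as F


def vae_loss(recon_x, x, mu, logvar):
    # reconstruction term: per-pixel binary cross-entropy, summed over the batch
    BCE = F.binary_cross_entropy(recon_x, x, reduction='sum')
    # KL divergence between the approximate posterior N(mu, sigma^2) and the prior N(0, I)
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    loss = BCE + KLD
    return loss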
Example #7
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    pre_transform_image_size = args.pre_transform_image_size if 'crop' in args.data_augs else args.image_size
    pre_image_size = args.pre_transform_image_size  # record the pre transform image size for translation

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed,
                       visualize_reward=False,
                       from_pixels=(args.encoder_type == 'pixel'),
                       height=pre_transform_image_size,
                       width=pre_transform_image_size,
                       frame_skip=args.action_repeat)

    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    exp_name = env_name + '-' + ts + '-im' + str(args.image_size) +'-b'  \
    + str(args.batch_size) + '-s' + str(args.seed)  + '-' + args.encoder_type
    args.work_dir = args.work_dir + '/' + exp_name

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (3 * args.frame_stack, pre_transform_image_size,
                             pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
        pre_image_size=pre_image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    for step in range(args.num_train_steps):
        # evaluate agent periodically

        if step % args.eval_freq == 0:
            L.log('eval/episode', episode, step)
            evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
            if args.save_model:
                agent.save_curl(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs / 255.)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
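
The done_bool line that recurs in these training loops separates time-limit truncation from genuine termination, so the critic keeps bootstrapping through episodes that were merely cut off at env._max_episode_steps. The same pattern, spelled out:

# If the episode ended only because the step limit was reached, store done=0 so the
# Bellman target still bootstraps from next_obs; otherwise store the real done flag.
if episode_step + 1 == env._max_episode_steps:
    done_bool = 0.0   # time-limit truncation: allow infinite bootstrap
else:
    done_bool = float(done)   # genuine terminal state
replay_buffer.add(obs, action, reward, next_obs, done_bool)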
Example #8
def main():
    args = parse_args()
    utils.set_seed_everywhere(args.seed)

    # Robot stuff
    action_space = ActionSpace.DELTA_EE_POSE_IMPEDANCE
    blocking_action = True
    env = RobotEnv(name='peg_in_hole',
                   simulation=True,
                   action_space=action_space,
                   isotropic_gains=True,
                   render=False,
                   blocking_action=blocking_action,
                   rotation_axis=(0, 0, 1),
                   observation_type=dict(camera=1, q=0, dq=0, tau=0, x=0,
                                         dx=0))

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # the dmc2gym wrapper standardizes actions
    #assert env.action_space.low.min()   >= -1
    #assert env.action_space.high.max()  <=  1

    replay_buffer = utils.ReplayBuffer(
        obs_shape=env.observation_space['camera'],
        action_shape=env.action_space.shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device)

    agent = make_agent(obs_shape=env.observation_space['camera'],
                       action_shape=env.action_space.shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, prev_episode_reward, done = 0, 0, 0, True
    start_time = time.time()
    for step in range(args.num_train_steps):
        if done:
            if step > 0:
                L.log('train/duration', time.time() - start_time, step)
                start_time = time.time()
                L.dump(step)

            # evaluate agent periodically
            if step % args.eval_freq == 0 and step > 0:
                L.log('eval/episode', episode, step)
                evaluate(env, agent, video, args.num_eval_episodes, L, step)
                if args.save_model:
                    agent.save(model_dir, step)
                if args.save_buffer:
                    replay_buffer.save(buffer_dir)

            L.log('train/episode_reward', episode_reward, step)

            env.step(np.array([0, 0, 0.1, 0, 0, 0]))  # Prevent getting stuck
            obs = env.reset()
            done, episode_reward, episode_step = False, 0, 0
            episode += 1

            L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)
                temp = action
                print("Temp action: {}".format(temp))
                action = np.multiply(action, env.action_space.high)

        # run training update
        if step >= args.init_steps:
            num_updates = args.init_steps if step == args.init_steps else 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)
        print("E: {}   | S: {}   | R: {:.4f} | ER: {:.4f} | A: {}".format(
            episode, step, round(reward, 4), round(episode_reward, 4), action))

        # Reset environment if agent gets stuck (stuck means for 100 steps no increase in reward)
        if step % 100 == 0 and step > 0:
            if np.abs(
                    prev_episode_reward - episode_reward
            ) < 1e-5:  # If change in reward is negligible after 100 steps restart
                env.step(np.array([0, 0, 0.1, 0, 0, 0]))
                obs = env.reset()
            prev_episode_reward = episode_reward

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done)
        episode_reward += reward

        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
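
This example multiplies the agent's tanh-squashed action by env.action_space.high, which only covers the full range when the bounds are symmetric around zero. A more general affine rescaling (a sketch, not part of the original script):

import numpy as np


def rescale_action(action, low, high):
    """Map an action in [-1, 1] to the box [low, high] element-wise."""
    action = np.clip(action, -1.0, 1.0)
    return low + 0.5 * (action + 1.0) * (high - low)

# usage with the RobotEnv action space above:
# action = rescale_action(agent.sample_action(obs),
#                         env.action_space.low, env.action_space.high)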
Example #9
def main():
    args = parse_args()
    utils.set_seed_everywhere(args.seed)

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed,
                       visualize_reward=False,
                       from_pixels=(args.encoder_type == 'pixel'),
                       height=args.image_size,
                       width=args.image_size,
                       frame_skip=args.action_repeat)
    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    args.work_dir = os.path.join(
        args.work_dir,
        f'{args.domain_name}-{args.task_name}-seed{args.seed}-{datetime.now().strftime("%Y%m%d-%H%M")}'
    )

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Using device: ", device)

    # the dmc2gym wrapper standardizes actions
    assert env.action_space.low.min() >= -1
    assert env.action_space.high.max() <= 1

    replay_buffer = utils.ReplayBuffer(obs_shape=env.observation_space.shape,
                                       action_shape=env.action_space.shape,
                                       capacity=args.replay_buffer_capacity,
                                       batch_size=args.batch_size,
                                       device=device)

    agent = make_agent(obs_shape=env.observation_space.shape,
                       action_shape=env.action_space.shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()
    for step in range(args.num_train_steps):
        if done:
            if step > 0:
                L.log('train/duration', time.time() - start_time, step)
                start_time = time.time()
                L.dump(step)

            # evaluate agent periodically
            if step % args.eval_freq == 0:
                L.log('eval/episode', episode, step)
                evaluate(env, agent, video, args.num_eval_episodes, L, step)
                if args.save_model:
                    agent.save(model_dir, step)
                if args.save_buffer:
                    replay_buffer.save(buffer_dir)

            L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1

            L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        # run training update
        if step >= args.init_steps:
            num_updates = args.init_steps if step == args.init_steps else 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done)
        episode_reward += reward

        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
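
All of these scripts push transitions into utils.ReplayBuffer via add(obs, action, reward, next_obs, done_bool) and sample minibatches from it during agent.update. A minimal sketch of that interface, assuming uint8 pixel observations (the real class in these repos also handles image augmentation and saving to disk):

import numpy as np
import torch


class ReplayBuffer:
    def __init__(self, obs_shape, action_shape, capacity, batch_size, device):
        self.capacity = capacity
        self.batch_size = batch_size
        self.device = device
        self.obses = np.empty((capacity, *obs_shape), dtype=np.uint8)
        self.next_obses = np.empty((capacity, *obs_shape), dtype=np.uint8)
        self.actions = np.empty((capacity, *action_shape), dtype=np.float32)
        self.rewards = np.empty((capacity, 1), dtype=np.float32)
        self.not_dones = np.empty((capacity, 1), dtype=np.float32)
        self.idx = 0
        self.full = False

    def add(self, obs, action, reward, next_obs, done):
        np.copyto(self.obses[self.idx], obs)
        np.copyto(self.actions[self.idx], action)
        np.copyto(self.rewards[self.idx], reward)
        np.copyto(self.next_obses[self.idx], next_obs)
        np.copyto(self.not_dones[self.idx], not done)
        self.idx = (self.idx + 1) % self.capacity
        self.full = self.full or self.idx == 0

    def sample(self):
        max_idx = self.capacity if self.full else self.idx
        idxs = np.random.randint(0, max_idx, size=self.batch_size)

        def to_tensor(x):
            return torch.as_tensor(x[idxs], device=self.device).float()

        return (to_tensor(self.obses), to_tensor(self.actions),
                to_tensor(self.rewards), to_tensor(self.next_obses),
                to_tensor(self.not_dones))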
Example #10
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    exp_id = str(int(np.random.random() * 100000))
    utils.set_seed_everywhere(args.seed)

    env = env_wrapper.make(domain_name=args.domain_name,
                           task_name=args.task_name,
                           seed=args.seed,
                           visualize_reward=False,
                           from_pixels=(args.observation_type == 'pixel'
                                        or args.observation_type == 'hybrid'),
                           cameras=args.cameras,
                           height=args.pre_transform_image_size,
                           width=args.pre_transform_image_size,
                           frame_skip=args.action_repeat,
                           reward_type=args.reward_type,
                           change_model=args.change_model)

    env.seed(args.seed)
    if args.special_reset is not None:
        env.set_special_reset(args.special_reset)
    if args.demo_special_reset is not None:
        env.set_special_reset(args.demo_special_reset)

    if args.observation_type == 'hybrid':
        env.set_hybrid_obs()

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    if args.task_name is None:
        env_name = args.domain_name
    else:
        env_name = args.domain_name + '-' + args.task_name
    exp_name = args.reward_type + '-' + args.agent + '-' + args.encoder_type + '-' + args.data_augs
    exp_name += '-' + ts + '-' + env_name + '-im' + str(
        args.image_size) + '-b' + str(args.batch_size) + '-nu' + str(
            args.num_updates)
    if args.observation_type == 'hybrid':
        exp_name += '-hybrid'
    if args.change_model:
        exp_name += '-change_model'
    if args.bc_only:
        exp_name += '-bc_only'

    exp_name += '-s' + str(args.seed)

    exp_name += '-id' + exp_id
    args.work_dir = args.work_dir + '/' + exp_name
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    print("Working in directory:", args.work_dir)

    video = VideoRecorder(video_dir if args.save_video else None,
                          camera_id=args.cameras[0])

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        cpf = 3 * len(args.cameras)
        obs_shape = (cpf * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (cpf * args.frame_stack,
                             args.pre_transform_image_size,
                             args.pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
        hybrid_state_shape=env.hybrid_state_shape,
        load_dir=args.replay_buffer_load_dir)

    if args.demo_model_dir is not None:  # collect demonstrations using a state-trained expert
        episode_step, done = 0, True
        state_obs, obs = None, None
        episode_success = False
        original_encoder_type = args.encoder_type
        args.encoder_type = 'identity'

        if isinstance(env, utils.FrameStack):
            original_env = env.env
        else:
            original_env = env

        expert_agent = make_agent(
            obs_shape=original_env.observation_space.shape,
            action_shape=action_shape,
            args=args,
            device=device,
            hybrid_state_shape=env.hybrid_state_shape)
        args.encoder_type = original_encoder_type
        expert_agent.load(args.demo_model_dir, args.demo_model_step)
        print('Collecting expert trajectories...')
        t = 0
        while t < args.demo_samples:
            if done:
                episode_step = 0
                episode_success = False
                if args.demo_special_reset is not None:
                    env.reset(save_special_steps=True)
                    special_steps_dict = env.special_reset_save
                    obs_list = special_steps_dict['obs']
                    act_list = special_steps_dict['act']
                    reward_list = special_steps_dict['reward']
                    for i in range(len(act_list)):
                        replay_buffer.add(obs_list[i], act_list[i],
                                          reward_list[i], obs_list[i + 1],
                                          False)
                    episode_step += len(act_list)
                    t += len(act_list)
                    obs = obs_list[-1]
                    state_obs = original_env._get_state_obs()
                else:
                    obs = env.reset()
                    state_obs = original_env._get_state_obs()

            action = expert_agent.sample_action(state_obs)
            next_obs, reward, done, info = env.step(action)
            if info.get('is_success'):
                episode_success = True
            state_obs = original_env._get_state_obs()

            # allow infinite bootstrap
            done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
                done)

            replay_buffer.add(obs, action, reward, next_obs, done_bool)

            obs = next_obs
            episode_step += 1
            t += 1

            if args.success_demo_only and done and not episode_success:
                t -= episode_step
                replay_buffer.idx -= episode_step

        env.set_special_reset(args.special_reset)

    print('Starting with replay buffer filled to {}.'.format(
        replay_buffer.idx))

    # args.init_steps = max(0, args.init_steps - args.replay_buffer_load_pi_t)  # maybe tune this

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device,
                       hybrid_state_shape=env.hybrid_state_shape)
    if args.model_dir is not None:
        agent.load(args.model_dir, args.model_step)
    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    def eval_and_save():
        if args.save_model:
            agent.save_curl(model_dir, step)
        if args.save_buffer:
            replay_buffer.save(buffer_dir)
        if args.save_sac:
            agent.save(model_dir, step)
        L.log('eval/episode', episode, step)
        print('evaluating')
        evaluate(env, agent, video, args.num_eval_episodes, L, step, args)

    if args.warmup_cpc:
        print("Warming up cpc for " + str(args.warmup_cpc) + ' steps.')
        for i in range(args.warmup_cpc):
            agent.update_cpc_only(replay_buffer,
                                  L,
                                  step=0,
                                  ema=args.warmup_cpc_ema)
        print('Warmed up cpc.')

    if args.warmup_offline_sac:
        for i in range(args.warmup_offline_sac):
            agent.update_sac_only(replay_buffer, L, step=0)

    if args.bc_only:
        step = 0
        for i in range(100):
            agent.train_bc(replay_buffer)
            step += 1
        eval_and_save()
        return

    time_computing = 0
    time_acting = 0
    callback_fn = None
    step = 0

    if args.synch_update:
        callback_fn = lambda: lambda: [
            agent.update(replay_buffer,
                         L,
                         step,
                         log_networks=nu == 0 and step % args.log_networks_freq
                         == 0) for nu in range(args.num_updates)
        ] if step >= args.init_steps and not is_eval else 0  # pointers should all work properly, and execute in the proper frame

    if callback_fn is not None:
        env.env._env.env.set_callback(
            callback_fn)  # envwrapper (camera), framestack, timelimit

    # for step in range(args.num_train_steps):
    while step < args.num_train_steps:

        # evaluate agent periodically
        if step % args.eval_freq == 0:
            is_eval = True
            eval_and_save()
            is_eval = False

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            time_tmp = time.time()
            obs = env.reset()
            time_acting += time.time() - time_tmp
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)
        if step == args.init_steps and args.demo_samples == 0:
            if args.warmup_cpc:
                for i in range(args.warmup_cpc):
                    print("Warming up cpc for " + str(args.warmup_cpc) +
                          ' steps.')
                    agent.update_cpc_only(replay_buffer, L, step=0)
                    print('Warmed up cpc.')

        # run training update
        time_tmp = time.time()

        if step >= args.init_steps and not args.synch_update:
            for nu in range(args.num_updates):
                agent.update(replay_buffer,
                             L,
                             step,
                             log_networks=nu == 0
                             and step % args.log_networks_freq == 0)

        time_computing += time.time() - time_tmp

        time_tmp = time.time()

        next_obs, reward, done, _ = env.step(action)
        time_acting += time.time() - time_tmp

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done)
        episode_reward += reward

        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
        step += 1

    step = args.num_train_steps
    print("time spent computing:", time_computing)
    print("time spent acting:", time_acting)
    eval_and_save()
Example #11
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    gibson_config_filename = os.path.join(
        os.path.dirname(gibson2.__file__),
        '../examples/configs/hand_drawer.yaml')
    env = HandDrawerEnv(config_file=gibson_config_filename, mode='headless')

    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    exp_name = env_name + '-' + ts + '-im' + str(args.image_size) +'-b'  \
    + str(args.batch_size) + '-s' + str(args.seed)  + '-' + args.encoder_type
    args.work_dir = args.work_dir + '/' + exp_name

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (2 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (2 * args.frame_stack,
                             args.pre_transform_image_size,
                             args.pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    step = 0
    evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
Example #12
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    goal_env = SRLEnv(args.action_repeat, args.environment, args.srl_model,
                      args.pre_transform_image_size,
                      args.pre_transform_image_size, args.renders,
                      args.is_discrete, args.force_down)

    goal_env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        goal_env = utils.FrameStack(goal_env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d-%H:%M:%S", ts)
    env_name = args.environment
    exp_name = env_name + '-' + args.agent + '-' + ts + '-im' + str(args.image_size) +'-b'  \
    + str(args.batch_size) + '-s' + str(args.seed)  + '-' + args.encoder_type
    args.work_dir = args.work_dir + '/' + exp_name

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))
    pre_buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'pre_buffer'))
    csv_dir = utils.make_dir(os.path.join(args.work_dir, 'csv'))
    image_dir = utils.make_dir(os.path.join(args.work_dir, 'image'))

    log_csv = {
        "step": [],
        "mean_reward": [],
        "mean_distance_to_goal": [],
        "std_distance_to_goal": []
    }

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = goal_env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        if args.agent == 'sac_ae':
            pre_aug_obs_shape = obs_shape
        else:
            pre_aug_obs_shape = (3 * args.frame_stack,
                                 args.pre_transform_image_size,
                                 args.pre_transform_image_size)
    else:
        obs_shape = goal_env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    if args.reward_type == 'dist':
        success_samples = goal_env.get_goal_image()
        goal_env.close()

        def sample_goal():
            success_sample = random.choice(success_samples)
            if args.encoder_type == 'pixel':
                frames = []
                for _ in range(args.frame_stack):
                    frames.append(success_sample)

                return np.concatenate(frames, axis=0)
            else:
                return success_sample

    env = SRLEnv(args.action_repeat, args.environment, args.srl_model,
                 args.pre_transform_image_size, args.pre_transform_image_size)

    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    pre_replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    for step in range(args.pre_training_steps):
        # evaluate agent periodically

        if step % args.eval_freq == 0:
            if args.save_model:
                agent.save_curl(model_dir, step)
            if args.save_buffer:
                pre_replay_buffer.save(pre_buffer_dir)

        if done:
            obs = env.reset()
            done = False
            episode_step = 0
            episode += 1

        # sample action for data collection
        action = env.action_space.sample()
        if args.environment == 'kuka':
            action[2] = -abs(action[2])

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(pre_replay_buffer, L, step, enc_train=True)

        next_obs, reward, done, distance = env.step(action)
        goal_obs = sample_goal()
        if args.reward_type == 'dist':
            reward = agent.dist_reward(next_obs, goal_obs)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done)
        pre_replay_buffer.add(obs, action, reward, next_obs, goal_obs,
                              done_bool)

        obs = next_obs
        episode_step += 1

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    for step in range(args.num_train_steps):
        # evaluate agent periodically

        if step % args.eval_freq == 0:
            L.log('eval/episode', episode, step)
            if args.reward_type == 'dist':
                evaluate(env, agent, replay_buffer, video,
                         args.num_eval_episodes, L, csv_dir, log_csv,
                         image_dir, step, args, sample_goal())
            else:
                evaluate(env, agent, replay_buffer, video,
                         args.num_eval_episodes, L, csv_dir, log_csv,
                         image_dir, step, args, None)
            if args.save_model:
                agent.save_curl(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            goal_obs = sample_goal()
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs, goal_obs)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step, enc_train=False)

        next_obs, reward, done, distance = env.step(action)
        if args.reward_type == 'dist':
            reward = agent.dist_reward(next_obs, goal_obs)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, goal_obs, done_bool)

        obs = next_obs
        episode_step += 1
Example #13
def main():
    args = parse_args()
    utils.set_seed_everywhere(args.seed)

    if args.domain_name == 'carla':
        env = CarlaEnv(
            render_display=args.render,  # for local debugging only
            display_text=args.render,  # for local debugging only
            changing_weather_speed=0.1,  # [0, +inf)
            rl_image_size=args.image_size,
            max_episode_steps=1000,
            frame_skip=args.action_repeat,
            is_other_cars=True,
            port=args.port)
        # TODO: implement env.seed(args.seed) ?

        eval_env = env
    else:
        env = dmc2gym.make(domain_name=args.domain_name,
                           task_name=args.task_name,
                           resource_files=args.resource_files,
                           img_source=args.img_source,
                           total_frames=args.total_frames,
                           seed=args.seed,
                           visualize_reward=False,
                           from_pixels=(args.encoder_type == 'pixel'),
                           height=args.image_size,
                           width=args.image_size,
                           frame_skip=args.action_repeat)
        env.seed(args.seed)

        eval_env = dmc2gym.make(domain_name=args.domain_name,
                                task_name=args.task_name,
                                resource_files=args.eval_resource_files,
                                img_source=args.img_source,
                                total_frames=args.total_frames,
                                seed=args.seed,
                                visualize_reward=False,
                                from_pixels=(args.encoder_type == 'pixel'),
                                height=args.image_size,
                                width=args.image_size,
                                frame_skip=args.action_repeat)

    # stack several consecutive frames together
    if args.encoder_type.startswith('pixel'):
        env = utils.FrameStack(env, k=args.frame_stack)
        eval_env = utils.FrameStack(eval_env, k=args.frame_stack)

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # the dmc2gym wrapper standardizes actions
    assert env.action_space.low.min() >= -1
    assert env.action_space.high.max() <= 1

    replay_buffer = utils.ReplayBuffer(obs_shape=env.observation_space.shape,
                                       action_shape=env.action_space.shape,
                                       capacity=args.replay_buffer_capacity,
                                       batch_size=args.batch_size,
                                       device=device)

    agent = make_agent(obs_shape=env.observation_space.shape,
                       action_shape=env.action_space.shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()
    for step in range(args.num_train_steps):
        if done:
            if args.decoder_type == 'inverse':
                for i in range(
                        1, args.k):  # fill k_obs with 0s if episode is done
                    replay_buffer.k_obses[replay_buffer.idx - i] = 0
            if step > 0:
                L.log('train/duration', time.time() - start_time, step)
                start_time = time.time()
                L.dump(step)

            # evaluate agent periodically
            if episode % args.eval_freq == 0:
                L.log('eval/episode', episode, step)
                evaluate(eval_env, agent, video, args.num_eval_episodes, L,
                         step)
                if args.save_model:
                    agent.save(model_dir, step)
                if args.save_buffer:
                    replay_buffer.save(buffer_dir)

            L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            reward = 0

            L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        # run training update
        if step >= args.init_steps:
            num_updates = args.init_steps if step == args.init_steps else 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        curr_reward = reward
        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done)
        episode_reward += reward

        replay_buffer.add(obs, action, curr_reward, reward, next_obs,
                          done_bool)
        np.copyto(replay_buffer.k_obses[replay_buffer.idx - args.k], next_obs)

        obs = next_obs
        episode_step += 1
Example #14
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    pre_transform_image_size = args.pre_transform_image_size if 'crop' in args.data_augs else args.image_size
    pre_image_size = args.pre_transform_image_size  # record the pre transform image size for translation

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed,
                       visualize_reward=False,
                       from_pixels=(args.encoder_type == 'pixel'),
                       height=pre_transform_image_size,
                       width=pre_transform_image_size,
                       frame_skip=args.action_repeat)

    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    # exp_name = env_name + '-' + ts + '-im' + str(args.image_size) +'-b'  \
    # + str(args.batch_size) + '-s' + str(args.seed)  + '-' + args.encoder_type
    # args.work_dir = args.work_dir + '/'  + exp_name

    # modded for checking augmentation and corruption
    if args.augmix:
        aug_name = "augs-augmix"
    else:
        aug_name = "augs-" + args.data_augs

    exp_name = env_name + '-' + aug_name + '-s' + str(args.seed)
    args.work_dir = args.work_dir + '/' + exp_name

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))
    eval_dir = utils.make_dir(os.path.join(args.work_dir, 'eval'))

    video = VideoRecorder(video_dir if args.save_video else None)

    print("Args:")
    print(args)
    print("work dir:", args.work_dir)
    os.makedirs(args.work_dir, exist_ok=True)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (3 * args.frame_stack, pre_transform_image_size,
                             pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
        pre_image_size=pre_image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    results = []
    best_mean_reward = 0
    for step in range(args.num_train_steps):
        # evaluate agent periodically

        if step % args.eval_freq == 0:
            agent.load(model_dir, step, map_to_cpu=True)

            L.log('eval/episode', episode, step)
            # evaluate(env, agent, video, args.num_eval_episodes, L, step,args)
            res = evaluate_corruptions(env, agent, video,
                                       args.num_eval_episodes, L, step, args)

            if best_mean_reward < res['mean_ep_reward']:
                best_mean_reward = res['mean_ep_reward']

            res['best_overall'] = best_mean_reward
            results.append(res)

    print(f"Best Mean overall:{best_mean_reward:.4f}")

    import pickle
    results_fname = os.path.join(eval_dir,
                                 f"{args.cor_func}{args.cor_sev}.pkl")
    pickle.dump(results, open(results_fname, "wb"))
Example #15
def main():
    args = parse_args()

    dm_envs = {
        'finger': ['finger', 'spin'],
        'cartpole': ['cartpole', 'swingup'],
        'reacher': ['reacher', 'easy'],
        'cheetah': ['cheetah', 'run'],
        'walker': ['walker', 'walk'],
        'ball': ['ball_in_cup', 'catch'],
        'humanoid': ['humanoid', 'stand'],
        'bring_ball': ['manipulator', 'bring_ball'],
        'bring_peg': ['manipulator', 'bring_peg'],
        'insert_ball': ['manipulator', 'insert_ball'],
        'insert_peg': ['manipulator', 'insert_peg'],
    }

    if args.env == 'cartpole':
        args.action_repeat = 8
    elif args.env in ['finger', 'walker']:
        args.action_repeat = 2
    else:
        args.action_repeat = 4

    args.domain_name, args.task_name = dm_envs[args.env]

    global logger
    logger = wandb.init(
        project='d2rl',
        config=args,
        dir='wandb_logs',
        group='{}_{}'.format(args.domain_name, args.task_name),
    )

    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)
    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed,
                       visualize_reward=False,
                       from_pixels=(args.encoder_type == 'pixel'),
                       height=args.pre_transform_image_size,
                       width=args.pre_transform_image_size,
                       frame_skip=args.action_repeat)

    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    exp_name = env_name + '-' + ts + '-im' + str(args.image_size) +'-b'  \
    + str(args.batch_size) + '-s' + str(args.seed)  + '-' + args.encoder_type

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (3 * args.frame_stack,
                             args.pre_transform_image_size,
                             args.pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    for step in range(args.num_train_steps):
        # evaluate agent periodically

        if step % args.eval_freq == 0:
            evaluate(env, agent, args.num_eval_episodes, step, args)
            if args.save_model:
                agent.save_curl(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
Example #16
def main():
    args = parse_args()
    if args.seed == -1: 
        args.__dict__["seed"] = np.random.randint(1,1000000)
    utils.set_seed_everywhere(args.seed)

    pre_transform_image_size = args.pre_transform_image_size if 'crop' in args.data_augs else args.image_size

    env = dmc2gym.make(
        domain_name=args.domain_name,
        task_name=args.task_name,
        seed=args.seed,
        visualize_reward=False,
        from_pixels=(args.encoder_type == 'pixel'),
        height=pre_transform_image_size,
        width=pre_transform_image_size,
        frame_skip=args.action_repeat
    )
 
    env.seed(args.seed)
    project_name = args.domain_name + args.task_name
    group_name = str(args.replay_buffer_capacity // 1000) + "k" + str(args.steps_until_freeze // 1000) + "k" + str(args.num_copies)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)
    
    # make directory
    # ts = time.gmtime() 
    # ts = time.strftime("%m-%d", ts)    
    # env_name = args.domain_name + '-' + args.task_name
    # exp_name = env_name + '-' + ts + '-im' + str(args.image_size) +'-b'  \
    # + str(args.batch_size) + '-s' + str(args.seed)  + '-' + args.encoder_type
    original_work_dir = args.work_dir
    exp_name = project_name + "-" + group_name + "-s" + str(args.seed)
    args.work_dir = args.work_dir + '/'  + exp_name

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (3 * args.frame_stack, pre_transform_image_size, pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )
    
    p = 3 * pre_aug_obs_shape[1] * pre_aug_obs_shape[2]
    l = args.encoder_feature_dim
    c_prime = min(args.num_train_steps, int(np.floor(args.replay_buffer_capacity * p / l / 4 / 2 / args.num_copies)))
    print('Once the encoder is frozen, latent replay capacity will increase to', c_prime)
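    # the c_prime formula above appears to size the latent buffers so their memory stays
    # comparable to the raw image buffer: a raw transition stores p uint8 pixel bytes, while a
    # latent transition stores num_copies crops of an l-dimensional float32 vector (4 bytes)
    # for both obs and next_obs (x2), hence the p / (l * 4 * 2 * num_copies) scaling, capped
    # at num_train_steps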

    latent_buffer_critic = utils.ReplayBuffer(
        obs_shape=(args.encoder_feature_dim, 1),
        action_shape=action_shape,
        capacity=c_prime,
        batch_size=args.batch_size,
        device=device,
        is_latent=True,
        num_copies=args.num_copies
    )

    latent_buffer_actor = utils.ReplayBuffer(
        obs_shape=(args.encoder_feature_dim, 1),
        action_shape=action_shape,
        capacity=c_prime,
        batch_size=args.batch_size,
        device=device,
        is_latent=True,
        num_copies=args.num_copies
    )

    agent = make_agent(
        obs_shape=obs_shape,
        action_shape=action_shape,
        args=args,
        device=device
    )

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    def get_cropped_obs_batch(obs, next_obs):
        obs = obs.astype(np.uint8)
        next_obs = next_obs.astype(np.uint8)
        cpu_obs_tmp = utils.random_crop(obs, args.image_size)
        obs_tmp = torch.as_tensor(cpu_obs_tmp, device=device).float()
        cpu_next_obs_tmp = utils.random_crop(next_obs, args.image_size)
        next_obs_tmp = torch.as_tensor(cpu_next_obs_tmp, device=device).float()
        return obs_tmp / 255, next_obs_tmp / 255
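
    # the helper below assumes the pixel encoder caches its intermediate activations in an
    # `outputs` dict after a forward pass (the 'conv4' and 'fc' keys accessed here)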

    def get_latent_obs(network, obses, next_obses):
        network.encoder(obses)
        conv4_obses = network.encoder.outputs['conv4']
        latent_obses = network.encoder.outputs['fc']
        network.encoder(next_obses)
        conv4_next_obses = network.encoder.outputs['conv4']
        latent_next_obses = network.encoder.outputs['fc']
        return latent_obses, latent_next_obses, conv4_obses, conv4_next_obses

    def move_ac_rew_nd(replay_buffer, buffers, num_transitions):
        for buffer in buffers:
            buffer.actions[:num_transitions] = replay_buffer.actions[:num_transitions]
            buffer.rewards[:num_transitions] = replay_buffer.rewards[:num_transitions]
            buffer.not_dones[:num_transitions] = replay_buffer.not_dones[:num_transitions]

    def move_imgs_to_latent(replay_buffer, buffers, networks, tmp_batch_size, num_transitions):
        k = 0
        # move in batches to avoid cuda out of memory
        while k * tmp_batch_size < num_transitions:
            start = k * tmp_batch_size
            end = min((k + 1) * tmp_batch_size, num_transitions)
            # repeat num_copies times along batch dimension to get different crops
            raw_obses_repeated = np.repeat(replay_buffer.obses[start:end], args.num_copies, axis=0)
            raw_next_obses_repeated = np.repeat(replay_buffer.next_obses[start:end], args.num_copies, axis=0)
            tmp_obses, tmp_next_obses = get_cropped_obs_batch(raw_obses_repeated, raw_next_obses_repeated)

            conv4_obses, conv4_next_obses = None, None
            for i in range(len(buffers)):
                network, buffer = networks[i], buffers[i]
                # for the actor network we only need to run the fc layer, so use previous conv4_obses from critic network
                # (the networks are tied at their convolutional layers)
                if conv4_obses is not None:
                    latent_obses, latent_next_obses, _, _ = get_latent_obs(network, conv4_obses, conv4_next_obses)
                else:
                    latent_obses, latent_next_obses, conv4_obses, conv4_next_obses = get_latent_obs(network, tmp_obses, tmp_next_obses)
                latent_obses = latent_obses.detach().cpu().numpy()
                latent_next_obses = latent_next_obses.detach().cpu().numpy()
                # store args.num_copies random crops for each observation in the current batch
                buffer.obses[start:end] = latent_obses.reshape((end - start, args.num_copies, args.encoder_feature_dim, 1))
                buffer.next_obses[start:end] = latent_next_obses.reshape((end - start, args.num_copies, args.encoder_feature_dim, 1))
                # set buffer.idx and buffer.full appropriately (handles case where buffer.capacity > replay_buffer.capacity)
                buffer.idx = max(replay_buffer.idx, num_transitions)
                buffer.full = num_transitions >= buffer.capacity
            k += 1
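
    # training outline: before steps_until_freeze the agent trains on raw pixel observations;
    # at the freeze step the encoder's fc layers are detached, the critic target is synced to
    # the critic, and all stored transitions are re-encoded into the latent buffers; afterwards
    # both updates and newly collected transitions go through the latent buffers only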

    for step in range(args.num_train_steps):
        # evaluate agent periodically

        if step % args.eval_freq == 0:
            L.log('eval/episode', episode, step)
            
            evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
            if args.save_model:
                agent.save_curl(model_dir, step)
                agent.save(model_dir, step)
            if args.save_buffer:
                buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer' + str(step)))
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs / 255.)

        # run training update
        if step >= args.init_steps:
            num_updates = 1 
            for _ in range(num_updates):
                if step < args.steps_until_freeze:
                    agent.update(replay_buffer, L, step, detach_fc=False)
                elif step == args.steps_until_freeze:
                    print("detaching fc layer")
                    agent.critic.encoder.detach_fc = True
                    agent.critic_target.encoder.detach_fc = True
                    agent.actor.encoder.detach_fc = True

                    num_transitions = min(step, replay_buffer.capacity)

                    utils.soft_update_params(agent.critic, agent.critic_target, 1) # set critic_target params to critic params
                    with torch.no_grad():
                        networks = [agent.critic, agent.actor]
                        buffers = [latent_buffer_critic, latent_buffer_actor]
                        # move actions, rewards, and not_dones to latent buffers
                        move_ac_rew_nd(replay_buffer, buffers, num_transitions)
                        # move obs and next_obs to latent buffers
                        move_imgs_to_latent(replay_buffer, buffers, networks, 100, num_transitions)

                    agent.update_with_latent(latent_buffer_critic, latent_buffer_actor, L, step)
                else:
                    agent.update_with_latent(latent_buffer_critic, latent_buffer_actor, L, step)
        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done
        )
        episode_reward += reward
        if step <= args.steps_until_freeze:
            replay_buffer.add(obs, action, reward, next_obs, done_bool)
        else: # add to latent buffers
            # similar to the "elif step == args.steps_until_freeze" procedure
            raw_obs_repeated = np.repeat(np.expand_dims(obs, axis=0), args.num_copies, axis=0)
            raw_next_obs_repeated = np.repeat(np.expand_dims(next_obs, axis=0), args.num_copies, axis=0)
            obs_tmp, next_obs_tmp = get_cropped_obs_batch(raw_obs_repeated, raw_next_obs_repeated)
            networks = [agent.critic, agent.actor]
            buffers = [latent_buffer_critic, latent_buffer_actor]

            conv4_obs, conv4_next_obs = None, None
            for i in range(len(buffers)):
                network, buffer = networks[i], buffers[i]
                if conv4_obs is not None:
                    latent_obs, latent_next_obs, _, _ = get_latent_obs(network, conv4_obs, conv4_next_obs)
                else:
                    latent_obs, latent_next_obs, conv4_obs, conv4_next_obs = get_latent_obs(network, obs_tmp, next_obs_tmp)
                latent_obs = latent_obs.unsqueeze(-1).detach().cpu().numpy()
                latent_next_obs = latent_next_obs.unsqueeze(-1).detach().cpu().numpy()
                buffer.add(latent_obs, action, reward, latent_next_obs, done_bool)
        obs = next_obs
        episode_step += 1
Example #17
0
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    assert (
        args.init_steps == args.batch_size
        and args.num_train_steps * args.action_repeat in [100000, 500000]
    )
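    # enforce the common 100k / 500k environment-step budgets (agent steps x action_repeat)
    # and require the warm-up phase to fill exactly one batch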
    utils.set_seed_everywhere(args.seed)
    env = dmc2gym.make(
        domain_name=args.domain_name,
        task_name=args.task_name,
        seed=args.seed,
        visualize_reward=False,
        from_pixels=(args.encoder_type == "pixel"),
        height=args.pre_transform_image_size,
        width=args.pre_transform_image_size,
        frame_skip=args.action_repeat,
    )

    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == "pixel":
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + "-" + args.task_name
    exp_name = (
        env_name
        + "-"
        + ts
        + "-im"
        + str(args.image_size)
        + "-b"
        + str(args.batch_size)
        + "-nes"
        + str(args.num_train_steps * args.action_repeat)
        + "-s"
        + str(args.seed)
        + "-"
        + args.encoder_type
        + "-"
        + args.agent
    )
    args.work_dir = args.work_dir + "/" + exp_name

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, "video"))
    model_dir = utils.make_dir(os.path.join(args.work_dir, "model"))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, "buffer"))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, "args.json"), "w") as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    action_shape = env.action_space.shape

    if args.encoder_type == "pixel":
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (
            3 * args.frame_stack,
            args.pre_transform_image_size,
            args.pre_transform_image_size,
        )
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        # capacity=args.replay_buffer_capacity,
        capacity=args.batch_size,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
        center_crop_anchor=args.center_crop_anchor,
    )
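
    # note: capacity is deliberately set to batch_size here (the usual replay_buffer_capacity
    # argument is commented out above); together with the init_steps == batch_size assertion,
    # the agent only ever trains on the most recent batch_size transitions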

    agent = make_agent(
        obs_shape=obs_shape, action_shape=action_shape, args=args, device=device
    )

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()
    reward_window = deque([], maxlen=5)
    best_reward = 0.0
    best_step = 0
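    # periodic evaluation is disabled in this variant; instead, a 5-episode moving average of
    # training reward is tracked, the agent is checkpointed whenever the average improves, and
    # the best checkpoint is reloaded for a single final evaluation after the loop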

    for step in range(args.num_train_steps):
        # evaluate agent periodically
        # if step % args.eval_freq == 0:
        #     L.log("eval/episode", episode, step)
        #     with utils.eval_mode(agent):
        #         evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
        #     if args.save_model:
        #         agent.save_curl(model_dir, step)

        if done:
            if step > 0:
                # if step % args.log_interval == 0:
                L.log("train/duration", time.time() - start_time, step)
                L.log("train/episode_reward", episode_reward, step)
                L.dump(step)
                start_time = time.time()
            reward_window.append(episode_reward)
            if len(reward_window) == reward_window.maxlen:
                mean_reward = np.mean(reward_window)
                if mean_reward > best_reward:
                    best_reward = mean_reward
                    best_step = step
                    agent.save(model_dir, best_step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            # if step % args.log_interval == 0:
            L.log("train/episode", episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1

    L.log("eval/episode", episode, step)
    agent.load(model_dir, best_step)
    with utils.eval_mode(agent):
        evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
Example #18
0
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)
    env = Env()

    # stack several consecutive frames together
    env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    exp_name = env_name + '-' + ts + '-im' + str(args.image_size) +'-b'  \
    + str(args.batch_size) + '-s' + str(args.seed)  + '-' + args.encoder_type
    args.work_dir = args.work_dir + '/' + exp_name

    utils.make_dir(args.work_dir)
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)
    torch.cuda.current_device()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.ACTION_SPACE_SIZE
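    # unlike the dm_control examples above, this custom Env exposes a discrete action count
    # (ACTION_SPACE_SIZE) rather than a gym action_space, and env.step returns only
    # (obs, reward, done); actions are sampled as integers in [0, 11] in the loop below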

    obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
    pre_aug_obs_shape = (3 * args.frame_stack, args.pre_transform_image_size,
                         args.pre_transform_image_size)
    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)
    #agent.load("D:/curl-master/fps-train-09-29-im84-b64-s829604-pixel/model", 2000)
    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    for step in range(args.num_train_steps):
        pre_time = time.time()
        # evaluate agent periodically

        if step % args.eval_freq == 0 and step != 0:
            L.log('eval/episode', episode, step)
            evaluate(env, agent, args.num_eval_episodes, L, step, args)
            if args.save_model:
                agent.save_curl(model_dir, step)
                agent.save(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection (both branches draw a uniformly random action in
        # this example; the eval_mode block presumably stands in for agent.sample_action)
        if step < args.init_steps:
            action = random.randint(0, 11)
        else:
            with utils.eval_mode(agent):
                action = random.randint(0, 11)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done = env.step(action)

        # allow infinite bootstrap (the hard-coded large constant effectively disables
        # time-limit truncation, presumably because this custom Env has no _max_episode_steps)
        done_bool = 0 if episode_step + 1 == 500000000 else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
        time_used = time.time() - pre_time
        print('step time: {:.4f}s, FPS: {:.1f}'.format(time_used, 1 / time_used))