def _setup(self, config):
        """
        config contains:
            gpu_id (int) -- ID of the GPU to use (default 0)
            use_gpu (bool) -- whether to run in GPU mode
            init_algo_functions_and_log_fnames ((function, str)[]) -- each
                element of this list is a tuple of a function that returns the
                next algorithm to train and the corresponding log filename.
            algo_variant (dict) -- the variant passed to every
                init_algo_function call. The same dict instance is shared
                across calls, so modifications to it propagate to future
                init_algo_function calls.
        """
        gpu_id = config.get('gpu_id', 0)
        use_gpu = config['use_gpu']
        set_gpu_mode(use_gpu, gpu_id)
        logging.info('Using GPU mode={}'.format(use_gpu))
        # import torch
        # if 'cpu' in config['resources_per_trial']:
            # num_threads = config['resources_per_trial']['cpu']
            # torch.set_num_threads(num_threads)
            # logging.info('Setting {} CPU threads'.format(num_threads))

        self.init_algo_functions_and_log_fnames = config['init_algo_functions_and_log_fnames']
        self.init_algo_functions = [
            init_func for init_func, _ in self.init_algo_functions_and_log_fnames
        ]
        self.log_fnames = [
            log_fname for _, log_fname in self.init_algo_functions_and_log_fnames
        ]
        self.init_algo_kwargs = config['algo_variant']
        self.cur_algo = None
        self.cur_algo_idx = -1
        self._setup_next_algo()
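
For reference, a minimal sketch of the config dict that _setup expects, based on the docstring and the keys read above; the two constructor functions are hypothetical placeholders, not part of the original code.

def make_first_algo(variant):
    ...  # placeholder: build and return the first algorithm to train

def make_second_algo(variant):
    ...  # placeholder: build and return the next algorithm to train

example_config = dict(
    use_gpu=True,
    gpu_id=0,
    init_algo_functions_and_log_fnames=[
        (make_first_algo, 'first_algo.log'),
        (make_second_algo, 'second_algo.log'),
    ],
    algo_variant=dict(num_epochs=100),
)
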
Example #2
def simulate_policy(args):
    ptu.set_gpu_mode(True)
    model = pickle.load(open(args.file, "rb"))  # joblib.load(args.file)
    model.to(ptu.device)
    imgs = np.load(args.imgfile)
    import ipdb
    ipdb.set_trace()
    z = model.encode(ptu.np_to_var(imgs))
    samples = model.decode(z).cpu()

    recon_imgs = samples.data.view(64, model.input_channels, model.imsize,
                                   model.imsize)
    recon_imgs = recon_imgs.cpu()
    grid = make_grid(recon_imgs, nrow=8)
    ndarr = grid.mul(255).clamp(0, 255).byte().permute(1, 2, 0).numpy()
    im = Image.fromarray(ndarr)
    im.show()
    # cv2.imshow('img', im)
    # cv2.waitKey(1)
    # for sample in samples:
    #     tensor = tensor.cpu()
    #     img = ptu.get_numpy(tensor)
    # `imgs` is a NumPy array; convert and reshape it (assuming one flattened
    # image per row, 64 images total) so it can be concatenated with the
    # reconstructions.
    imgs_pt = ptu.np_to_var(imgs).view(
        64, model.input_channels, model.imsize, model.imsize).data.cpu()
    comparison = torch.cat([
        recon_imgs,
        imgs_pt,
    ])
    # `epoch` and `n` were undefined in the original snippet; fixed values are
    # substituted here so the example runs standalone.
    save_path = osp.join(logger.get_snapshot_dir(), 'r0.png')
    save_image(comparison.cpu(), save_path, nrow=8)
Example #3
def experiment(variant):
    if variant['multitask']:
        env = MultitaskFullVAEPoint2DEnv(
            **variant['env_kwargs'])  # used point2d-conv-sweep/run1/id4
        env = MultitaskToFlatEnv(env)
    # else:
    # env = Pusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    if variant["use_gpu"]:
        gpu_id = variant["gpu_id"]
        ptu.set_gpu_mode(True)
        ptu.set_device(gpu_id)
        algorithm.to(ptu.device)
        env._wrapped_env.vae.to(ptu.device)
    algorithm.train()
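
A hedged sketch of the variant this TD3 experiment reads, inferred from the keys accessed above; the algo_kwargs values are illustrative guesses, not settings from the original run.

example_variant = dict(
    multitask=True,
    env_kwargs=dict(),
    normalize=True,
    exploration_type='ou',  # one of 'ou', 'gaussian', 'epsilon'
    algo_kwargs=dict(num_epochs=100, batch_size=128),
    use_gpu=True,
    gpu_id=0,
)
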
def make_video(args):
    if args.pause:
        import ipdb
        ipdb.set_trace()
    data = pickle.load(open(args.file, "rb"))  # joblib.load(args.file)
    if 'policy' in data:
        policy = data['policy']
    elif 'evaluation/policy' in data:
        policy = data['evaluation/policy']
    else:
        raise AttributeError

    if 'env' in data:
        env = data['env']
    elif 'evaluation/env' in data:
        env = data['evaluation/env']
    else:
        raise AttributeError

    if isinstance(env, RemoteRolloutEnv):
        env = env._wrapped_env
    print("Policy loaded")
    if args.gpu:
        ptu.set_gpu_mode(True)
        policy.to(ptu.device)
    else:
        ptu.set_gpu_mode(False)
        policy.to(ptu.device)
    if isinstance(env, VAEWrappedEnv):
        env.mode(args.mode)

    max_path_length = 100
    observation_key = 'latent_observation'
    desired_goal_key = 'latent_desired_goal'
    rollout_function = rf.create_rollout_function(
        rf.multitask_rollout,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    env.mode(env._mode_map['video_env'])
    random_id = str(uuid.uuid4()).split('-')[0]
    dump_video(
        env,
        policy,
        'rollouts_{}.mp4'.format(random_id),
        rollout_function,
        rows=3,
        columns=6,
        pad_length=0,
        pad_color=255,
        do_timer=True,
        horizon=max_path_length,
        dirname_to_save_images=None,
        subdirname="rollouts",
        imsize=48,
    )
Example #5
def simulate_policy(args):
    data = pickle.load(open(args.file, "rb"))
    policy_key = args.policy_type + '/policy'
    if policy_key in data:
        policy = data[policy_key]
    else:
        raise Exception("No policy found in loaded dict. Keys: {}".format(
            data.keys()))

    env_key = args.env_type + '/env'
    if env_key in data:
        env = data[env_key]
    else:
        raise Exception("No environment found in loaded dict. Keys: {}".format(
            data.keys()))

    # robosuite-specific setup: enable the renderer and set the camera
    env._wrapped_env.has_renderer = True
    env.reset()
    env.viewer.set_camera(camera_id=0)

    if isinstance(env, RemoteRolloutEnv):
        env = env._wrapped_env
    print("Policy loaded")

    if args.enable_render:
        # some environments need to be reconfigured for visualization
        env.enable_render()
    if args.gpu:
        ptu.set_gpu_mode(True)
    if hasattr(policy, "to"):
        policy.to(ptu.device)
    if hasattr(env, "vae"):
        env.vae.to(ptu.device)

    if args.pause:
        import ipdb
        ipdb.set_trace()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    paths = []
    while True:
        paths.append(
            deprecated_rollout(
                env,
                policy,
                max_path_length=args.H,
                render=not args.hide,
            ))
        if args.log_diagnostics:
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics(paths, logger)
            for k, v in eval_util.get_generic_path_information(paths).items():
                logger.record_tabular(k, v)
            logger.dump_tabular()
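
The simulate_policy script above reads flags from an `args` namespace built elsewhere; a plausible parser covering the flags it uses might look like the following (defaults are guesses, not taken from the original script).

import argparse

def build_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to the snapshot file')
    parser.add_argument('--policy_type', type=str, default='evaluation')
    parser.add_argument('--env_type', type=str, default='evaluation')
    parser.add_argument('--H', type=int, default=300,
                        help='max rollout length')
    parser.add_argument('--enable_render', action='store_true')
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--pause', action='store_true')
    parser.add_argument('--hide', action='store_true')
    parser.add_argument('--log_diagnostics', action='store_true')
    return parser
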
def experiment(variant):

    ptu.set_gpu_mode(True, 0)

    imsize = variant['imsize']

    env = ImageForkReacher2dEnv(variant["arm_goal_distance_cost_coeff"],
                                variant["arm_object_distance_cost_coeff"],
                                [imsize, imsize, 3],
                                goal_object_distance_cost_coeff=variant[
                                    "goal_object_distance_cost_coeff"],
                                ctrl_cost_coeff=variant["ctrl_cost_coeff"])

    partial_obs_size = env.obs_dim - imsize * imsize * 3
    print("partial dim was " + str(partial_obs_size))
    env = NormalizedBoxEnv(env)

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    qf1 = MergedCNN(input_width=imsize,
                    input_height=imsize,
                    output_size=1,
                    input_channels=3,
                    added_fc_input_size=action_dim,
                    **variant['cnn_params'])

    qf2 = MergedCNN(input_width=imsize,
                    input_height=imsize,
                    output_size=1,
                    input_channels=3,
                    added_fc_input_size=action_dim,
                    **variant['cnn_params'])

    vf = CNN(input_width=imsize,
             input_height=imsize,
             output_size=1,
             input_channels=3,
             **variant['cnn_params'])

    policy = TanhCNNGaussianPolicy(input_width=imsize,
                                   input_height=imsize,
                                   output_size=action_dim,
                                   input_channels=3,
                                   **variant['cnn_params'])

    algorithm = TwinSAC(env=env,
                        policy=policy,
                        qf1=qf1,
                        qf2=qf2,
                        vf=vf,
                        **variant['algo_params'])

    algorithm.to(ptu.device)
    algorithm.train()
Example #7
def experiment(variant):
    if variant["use_gpu"]:
        ptu.set_gpu_mode(True)
    beta = variant["beta"]
    representation_size = variant["representation_size"]
    # NOTE: train_data / test_data are not defined in this snippet; they are
    # loaded here the same way as in the other ConvVAE examples below.
    train_data, test_data = get_data(10000)
    m = ConvVAE(representation_size, input_channels=3)
    t = ConvVAETrainer(train_data, test_data, m, beta=beta)

    for epoch in range(1001):
        t.train_epoch(epoch)
        t.test_epoch(epoch)
        t.dump_samples(epoch)
Example #8
def load_path(path, param_path):
    #check if file exists
    if param_path is None or not os.path.exists(param_path) or not os.path.isfile(param_path):
        return
    
    env.reset()

    #load policy
    # torch.load(param_path,map_location='cuda:0')
    data = pickle.load(open(param_path, 'rb'))
    e_ex = False
    if 'epoch' in data:
        e_ex = True
        epoch = data['epoch']
    
    
    use_gpu = True
    gpu_id = 0
    ptu.set_gpu_mode(use_gpu, gpu_id)
    os.environ['gpu_id'] = str(gpu_id)
    
    policy = data['evaluation/policy'].stochastic_policy
    policy.cuda()
    policy.eval()
    
    #path collector
    eval_path_collector = MdpPathCollector(
        env,
        MakeDeterministic(policy),
        sparse_reward=False,
    )
    paths = eval_path_collector.collect_new_paths(
        max_path_length=250,
        num_steps=1000,
        discard_incomplete_paths=True,
    )

    # average discounted return over the collected paths
    avg_return = 0
    for p in paths:
        rewards = np.asarray(p['rewards']).flatten()
        discounts = 0.9 ** np.arange(rewards.shape[0])
        avg_return += np.sum(discounts * rewards)
    if e_ex:
        out[path].append((epoch, avg_return/len(paths)))
    else:
        out[path].append(avg_return/len(paths))
Example #9
def experiment(variant):
    if variant["use_gpu"]:
        gpu_id = variant["gpu_id"]
        ptu.set_gpu_mode(True)
        ptu.set_device(gpu_id)

    beta = variant["beta"]
    representation_size = variant["representation_size"]
    train_data, test_data = get_data(10000)
    m = ConvVAE(representation_size)
    t = ConvVAETrainer(train_data, test_data, m, beta=beta, do_scatterplot=False)
    for epoch in range(101):
        t.train_epoch(epoch)
        t.test_epoch(epoch)
        t.dump_samples(epoch)
Example #10
def experiment(variant):
    if variant["use_gpu"]:
        gpu_id = variant["gpu_id"]
        ptu.set_gpu_mode(True)
        ptu.set_device(gpu_id)

    beta = variant["beta"]
    representation_size = variant["representation_size"]
    train_data, test_data = get_data(10000)
    m = ConvVAE(representation_size, input_channels=3)
    t = ConvVAETrainer(train_data, test_data, m, beta=beta, use_cuda=True)
    for epoch in range(50):
        t.train_epoch(epoch)
        t.test_epoch(epoch)
        t.dump_samples(epoch)
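
For the ConvVAE experiments above, the variant only needs a handful of keys; a hedged example with illustrative values:

example_variant = dict(
    use_gpu=True,
    gpu_id=0,
    beta=5.0,
    representation_size=16,
)
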
Example #11
def simulate_policy(args):
    if args.pause:
        import ipdb; ipdb.set_trace()
    data = pickle.load(open(args.file, "rb")) # joblib.load(args.file)
    if 'policy' in data:
        policy = data['policy']
    elif 'evaluation/policy' in data:
        policy = data['evaluation/policy']
    else:
        raise Exception("No policy found in loaded dict. Keys: {}".format(
            data.keys()))

    if 'env' in data:
        env = data['env']
    elif 'evaluation/env' in data:
        env = data['evaluation/env']
    else:
        raise Exception("No environment found in loaded dict. Keys: {}".format(
            data.keys()))

    if isinstance(env, RemoteRolloutEnv):
        env = env._wrapped_env
    print("Policy loaded")
    if args.gpu:
        ptu.set_gpu_mode(True)
        policy.to(ptu.device)
    else:
        ptu.set_gpu_mode(False)
        policy.to(ptu.device)
    if isinstance(env, VAEWrappedEnv):
        env.mode(args.mode)
    if args.enable_render or hasattr(env, 'enable_render'):
        # some environments need to be reconfigured for visualization
        env.enable_render()
    if args.multitaskpause:
        env.pause_on_goal = True
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    paths = []
    while True:
        paths.append(multitask_rollout(
            env,
            policy,
            max_path_length=args.H,
            render=not args.hide,
            observation_key=data.get('evaluation/observation_key', 'observation'),
            desired_goal_key=data.get('evaluation/desired_goal_key', 'desired_goal'),
        ))
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics(paths)
        if hasattr(env, "get_diagnostics"):
            for k, v in env.get_diagnostics(paths).items():
                logger.record_tabular(k, v)
        logger.dump_tabular()
Example #12
def simulate_policy(args):
    dir = args.path
    data = joblib.load("{}/params.pkl".format(dir))
    env = data['env']
    model_params = data['model_params']
    mpc_params = data['mpc_params']
    # dyn_model = NNDynamicsModel(env=env, **model_params)
    # mpc_controller = MPCcontroller(env=env,
    #                                dyn_model=dyn_model,
    #                                **mpc_params)
    tf_path_meta = "{}/tf_out-0.meta".format(dir)
    tf_path = "{}/tf_out-0".format(dir)

    with tf.Session() as sess:
        new_saver = tf.train.import_meta_graph(tf_path_meta)
        new_saver.restore(sess, tf_path)

    env = data['env']
    if isinstance(env, RemoteRolloutEnv):
        env = env._wrapped_env
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.to(ptu.device)
    if args.pause:
        import ipdb
        ipdb.set_trace()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        try:
            path = rollout(
                env,
                policy,
                max_path_length=args.H,
                animated=True,
            )
            env.log_diagnostics([path])
            policy.log_diagnostics([path])
            logger.dump_tabular()
        # Hack for now. Not sure why rollout assumes that close is an
        # keyword argument
        except TypeError as e:
            if (str(e) != "render() got an unexpected keyword "
                    "argument 'close'"):
                raise e
Example #13
def experiment(variant):
    if variant["use_gpu"]:
        gpu_id = variant["gpu_id"]
        ptu.set_gpu_mode(True)
        ptu.set_device(gpu_id)

    beta = variant["beta"]
    representation_size = variant["representation_size"]
    train_data, test_data = get_data(10000)
    m = ConvVAE(representation_size, input_channels=3)
    t = ConvVAETrainer(train_data,
                       test_data,
                       m,
                       beta_schedule=PiecewiseLinearSchedule([0, 400, 800],
                                                             [0.5, 0.5, beta]))
    for epoch in range(1001):
        t.train_epoch(epoch)
        t.test_epoch(epoch)
        t.dump_samples(epoch)
Example #14
def experiment(variant):
    from railrl.core import logger
    import railrl.torch.pytorch_util as ptu
    ptu.set_gpu_mode(True)
    info = dict()
    logger.save_extra_data(info)
    logger.get_snapshot_dir()
    net = CNN(**variant['cnn_kwargs'])
    net.cuda()
    num_divisions = variant['num_divisions']
    images = np.zeros((num_divisions * 10000, 21168))
    states = np.zeros((num_divisions * 10000, 7))
    for i in range(num_divisions):
        imgs = np.load(
            '/home/murtaza/vae_data/sawyer_torque_control_images100000_' +
            str(i + 1) + '.npy')
        state = np.load(
            '/home/murtaza/vae_data/sawyer_torque_control_states100000_' +
            str(i + 1) + '.npy')[:, :7] % (2 * np.pi)
        images[i * 10000:(i + 1) * 10000] = imgs
        states[i * 10000:(i + 1) * 10000] = state
        print(i)
    if variant['normalize']:
        std = np.std(states, axis=0)
        mu = np.mean(states, axis=0)
        states = np.divide((states - mu), std)
        print(mu, std)
    mid = int(num_divisions * 10000 * .9)
    train_images, test_images = images[:mid], images[mid:]
    train_labels, test_labels = states[:mid], states[mid:]

    algo = SupervisedAlgorithm(train_images,
                               test_images,
                               train_labels,
                               test_labels,
                               net,
                               batch_size=variant['batch_size'],
                               lr=variant['lr'],
                               weight_decay=variant['weight_decay'])
    for epoch in range(variant['num_epochs']):
        algo.train_epoch(epoch)
        algo.test_epoch(epoch)
def simulate_policy(args):
    ptu.set_gpu_mode(True)
    model = pickle.load(open(args.file, "rb"))  # joblib.load(args.file)
    model.to(ptu.device)
    import ipdb
    ipdb.set_trace()
    samples = ptu.Variable(torch.randn(64, model.representation_size))
    samples = model.decode(samples).cpu()
    # for sample in samples:
    #     tensor = sample.data.view(64, model.input_channels, model.imsize, model.imsize)
    #     tensor = tensor.cpu()
    #     img = ptu.get_numpy(tensor)
    #     cv2.imshow('img', img.reshape(3, 84, 84).transpose())
    #     cv2.waitKey(1)

    tensor = samples.data.view(64, model.input_channels, model.imsize,
                               model.imsize)
    tensor = tensor.cpu()
    grid = make_grid(tensor, nrow=8)
    ndarr = grid.mul(255).clamp(0, 255).byte().permute(1, 2, 0).numpy()
    im = Image.fromarray(ndarr)
    im.show()
Example #16
def setup_experiment(
    variant,
    exp_name,
    base_log_dir,
    git_infos,
    script_name,
    use_gpu,
    gpu_id,
):
    logger_config = variant.get('logger_config', {})
    seed = variant.get('seed', random.randint(0, 999999))
    exp_id = variant.get('exp_id', random.randint(0, 999999))
    set_seed(seed)
    ptu.set_gpu_mode(use_gpu, gpu_id)
    os.environ['gpu_id'] = str(gpu_id)
    setup_logger(logger,
                 exp_name=exp_name,
                 base_log_dir=base_log_dir,
                 variant=variant,
                 git_infos=git_infos,
                 script_name=script_name,
                 seed=seed,
                 exp_id=exp_id,
                 **logger_config)
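
A hypothetical call to setup_experiment; the experiment name, log directory, and script name below are illustrative placeholders.

setup_experiment(
    variant=dict(seed=0, logger_config=dict()),
    exp_name='example-experiment',
    base_log_dir='/tmp/railrl-logs',
    git_infos=None,
    script_name='train.py',
    use_gpu=True,
    gpu_id=0,
)
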
def experiment(variant):

    ptu.set_gpu_mode(True, 0)

    from softlearning.environments.gym import register_image_reach
    register_image_reach()
    env = gym.make('Pusher2d-ImageReach-v0', arm_goal_distance_cost_coeff=1.0, arm_object_distance_cost_coeff=0.0)

    #import ipdb; ipdb.set_trace()
    input_width, input_height = env.image_shape

    action_dim = int(np.prod(env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=input_width,
        input_height=input_height,
        input_channels=3,
        added_fc_input_size=4,
        output_conv_channels=True,
        output_size=None,
    )
    if variant['shared_qf_conv']:
        qf_cnn = CNN(**cnn_params)
        qf1 = MlpQfWithObsProcessor(
            obs_processor=qf_cnn,
            output_size=1,
            input_size=action_dim+qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
        qf2 = MlpQfWithObsProcessor(
            obs_processor=qf_cnn,
            output_size=1,
            input_size=action_dim+qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
        target_qf_cnn = CNN(**cnn_params)
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=target_qf_cnn,
            output_size=1,
            input_size=action_dim+qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=target_qf_cnn,
            output_size=1,
            input_size=action_dim+qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
    else:
        qf1_cnn = CNN(**cnn_params)
        cnn_output_dim = qf1_cnn.conv_output_flat_size
        qf1 = MlpQfWithObsProcessor(
            obs_processor=qf1_cnn,
            output_size=1,
            input_size=action_dim+cnn_output_dim,
            **variant['qf_kwargs']
        )
        qf2 = MlpQfWithObsProcessor(
            obs_processor=CNN(**cnn_params),
            output_size=1,
            input_size=action_dim+cnn_output_dim,
            **variant['qf_kwargs']
        )
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=CNN(**cnn_params),
            output_size=1,
            input_size=action_dim+cnn_output_dim,
            **variant['qf_kwargs']
        )
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=CNN(**cnn_params),
            output_size=1,
            input_size=action_dim+cnn_output_dim,
            **variant['qf_kwargs']
        )
    action_dim = int(np.prod(env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy = TanhGaussianPolicyAdapter(
        policy_cnn,
        policy_cnn.conv_output_flat_size,
        action_dim,
    )
    eval_env = expl_env = env

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        **variant['eval_path_collector_kwargs']
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs']
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs']
        )
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs']
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs']
        )
    elif variant['collection_mode'] == 'parallel':
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs']
        )
        algorithm = TorchParallelRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs']
        )
    algorithm.to(ptu.device)
    algorithm.train()
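
A hedged sketch of the variant the image-based SAC experiment above expects, listing the keys it reads; the nested values (especially cnn_params) are illustrative guesses.

example_variant = dict(
    shared_qf_conv=True,
    collection_mode='batch',  # 'batch', 'online', or 'parallel'
    replay_buffer_size=int(1e6),
    cnn_params=dict(
        kernel_sizes=[3, 3],
        n_channels=[16, 16],
        strides=[1, 1],
        paddings=[1, 1],
        hidden_sizes=[128],
    ),
    qf_kwargs=dict(hidden_sizes=[256, 256]),
    trainer_kwargs=dict(discount=0.99),
    algo_kwargs=dict(num_epochs=100, batch_size=256, max_path_length=100),
    eval_path_collector_kwargs=dict(),
    expl_path_collector_kwargs=dict(),
)
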
Example #18
def main():
    ptu.set_gpu_mode(True)

    obs_dim = 1
    action_dim = 1
    batch_size = 100

    model = NAF(obs_dim, action_dim)
    # model = SeparateDuelingFF(obs_dim, action_dim)
    # model = ConcatFF(obs_dim, action_dim)
    # model = OuterProductFF(obs_dim, action_dim)
    version = model.__class__.__name__
    version = "NAF-P-depends-on-embedded"

    optimizer = optim.SGD(model.parameters(), lr=1e-7, momentum=0.5)
    loss_fnct = nn.MSELoss()

    num_batches_per_print = 100
    train_size = 100000
    test_size = 10000

    state_bounds = (-10, 10)
    action_bounds = (-10, 10)
    resolution = 20

    base_dir = Path(
        "/home/vitchyr/git/rllab-rail/railrl/data/one-offs/polynomial-nn")
    base_dir = base_dir / version
    if not base_dir.exists():
        base_dir.mkdir()
    report_path = str(base_dir / "report.html")
    report = HTMLReport(report_path, images_per_row=2)
    print("Saving report to: {}".format(report_path))

    train_loader = data.DataLoader(FakeDataset(obs_dim, action_dim, train_size,
                                               state_bounds, action_bounds),
                                   batch_size=batch_size,
                                   shuffle=True)
    test_loader = data.DataLoader(FakeDataset(obs_dim, action_dim, test_size,
                                              state_bounds, action_bounds),
                                  batch_size=batch_size,
                                  shuffle=True)

    model.to(ptu.device)

    def eval_model(state, action):
        state = ptu.Variable(state, requires_grad=False)
        action = ptu.Variable(action, requires_grad=False)
        a, v = model(state, action)
        return a + v

    def train(epoch):
        for batch_idx, (state, action, q_target) in enumerate(train_loader):
            q_estim = eval_model(state, action)
            q_target = ptu.Variable(q_target, requires_grad=False)

            loss = loss_fnct(q_estim, q_target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch_idx % num_batches_per_print == 0:
                line_logger.print_over(
                    'Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                        epoch, batch_size * batch_idx, train_size,
                        loss.data[0]))

    def test(epoch):
        test_losses = []
        for state, action, q_target in test_loader:
            q_estim = eval_model(state, action)
            q_target = ptu.Variable(q_target, requires_grad=False)
            loss = loss_fnct(q_estim, q_target)
            test_losses.append(loss.data[0])

        line_logger.newline()
        print('Test Epoch: {0}. Loss: {1}'.format(epoch, np.mean(test_losses)))

        report.add_header("Epoch = {}".format(epoch))

        # NOTE: `q_function` (presumably the ground-truth Q used to generate
        # the fake dataset) is assumed to be defined at module level; it is
        # not part of this snippet.
        fig = visualize_model(q_function, "True Q Function")
        img = vu.save_image(fig)
        report.add_image(img, txt='True Q Function')

        fig = visualize_model(eval_model_np, "Estimated Q Function")
        img = vu.save_image(fig)
        report.add_image(img, txt='Estimated Q Function')

        report.new_row()

    def eval_model_np(state, action):
        state = ptu.Variable(ptu.FloatTensor([[state]]), requires_grad=False)
        action = ptu.Variable(ptu.FloatTensor([[action]]), requires_grad=False)
        a, v = model(state, action)
        q = a + v
        return ptu.get_numpy(q)[0]

    def visualize_model(eval_fn, title):
        fig = plt.figure()
        ax = plt.gca()
        heatmap = vu.make_heat_map(
            eval_fn,
            x_bounds=state_bounds,
            y_bounds=action_bounds,
            resolution=resolution,
        )

        vu.plot_heatmap(heatmap, fig, ax)
        ax.set_xlabel("State")
        ax.set_ylabel("Action")
        ax.set_title(title)

        return fig

    for epoch in range(0, 10):
        model.train()
        train(epoch)
        model.eval()
        test(epoch)

    print("Report saved to: {}".format(report_path))
Example #19
def build_env(env_id):
    ptu.set_gpu_mode(True)

    env = RLBenchEnv(
        task_class=OpenDrawer,
        fixed_goal=(),
        headless=False,
        camera=(500, 300),
        state_observation_type="task",
        stub=False,
    )

    env = ImageEnv(env,
        recompute_reward=False,
        transpose=True,
        image_length=450000,
        reward_type="image_distance",
        # init_camera=sawyer_pusher_camera_upright_v2,
    )

    variant = dict(
        model_path="/home/ashvin/data/s3doodad/facebook/models/rfeatures/multitask1/run2/id2/itr_4000.pt",
        desired_trajectory="/home/ashvin/code/railrl-private/gitignore/rlbench/demo_door_fixed2/demos5b_10_dict.npy",
        model_kwargs=dict(
            decoder_distribution='gaussian_identity_variance',
            input_channels=3,
            imsize=224,
            architecture=dict(
                hidden_sizes=[200, 200],
            ),
            delta_features=True,
            pretrained_features=False,
        ),
        reward_params_type="regression_distance",
    )
    model_class = variant.get('model_class', TimestepPredictionModel)
    representation_size = 128
    output_classes = 20
    model = model_class(
        representation_size,
        # decoder_output_activation=decoder_activation,
        output_classes=output_classes,
        **variant['model_kwargs'],
    )
    # model = torch.nn.DataParallel(model)

    model_path = variant.get("model_path")
    # model = load_local_or_remote_file(model_path)
    state_dict = torch.load(model_path)
    model.load_state_dict(state_dict)
    model.to(ptu.device)
    model.eval()

    traj = np.load(variant.get("desired_trajectory"), allow_pickle=True)[0]

    goal_image = traj["observations"][-1]["image_observation"]
    goal_image = goal_image.reshape(1, 3, 500, 300).transpose([0, 1, 3, 2]) / 255.0
    # goal_image = goal_image.reshape(1, 300, 500, 3).transpose([0, 3, 1, 2]) / 255.0 # BECAUSE RLBENCH DEMOS ARENT IMAGE_ENV WRAPPED
    # goal_image = goal_image[:, :, :240, 60:500]
    goal_image = goal_image[:, :, 60:, 60:500]
    goal_image_pt = ptu.from_numpy(goal_image)
    # save_image(goal_image_pt.data.cpu(), 'demos/goal.png', nrow=1)
    goal_latent = model.encode(goal_image_pt).detach().cpu().numpy().flatten()

    initial_image = traj["observations"][0]["image_observation"]
    initial_image = initial_image.reshape(1, 3, 500, 300).transpose([0, 1, 3, 2]) / 255.0
    # initial_image = initial_image.reshape(1, 300, 500, 3).transpose([0, 3, 1, 2]) / 255.0
    # initial_image = initial_image[:, :, :240, 60:500]
    initial_image = initial_image[:, :, 60:, 60:500]
    initial_image_pt = ptu.from_numpy(initial_image)
    # save_image(initial_image_pt.data.cpu(), 'demos/initial.png', nrow=1)
    initial_latent = model.encode(initial_image_pt).detach().cpu().numpy().flatten()

    # Move these to td3_bc and bc_v3 (or at least type for reward_params)
    reward_params = dict(
        goal_latent=goal_latent,
        initial_latent=initial_latent,
        type=variant["reward_params_type"],
    )

    config_params = variant.get("config_params")
    env = EncoderWrappedEnv(
        env,
        model,
        reward_params,
        config_params,
        **variant.get("encoder_wrapped_env_kwargs", dict())
    )
    env = FlatGoalEnv(env, obs_keys=["state_observation", ])
    return env
Example #20
"""
Fine tune a trained policy/qf
"""
import argparse

import joblib

import torch

import railrl.torch.pytorch_util as ptu

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('path',
                        type=str,
                        help='Path to snapshot file to fine tune.')
    args = parser.parse_args()

    ptu.set_gpu_mode(True)

    data = torch.load(args.path, "cuda")
    algo = data['algorithm']
    # algo.to("cpu")
    # algo.to("cuda")
    algo.train()
Example #21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to the snapshot file')
    parser.add_argument('--H',
                        type=int,
                        default=300,
                        help='Max length of rollout')
    parser.add_argument('--nrolls',
                        type=int,
                        default=1,
                        help='Number of rollout per eval')
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--mtau', type=float, help='Max tau value')
    parser.add_argument('--grid', action='store_true')
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--load', action='store_true')
    parser.add_argument('--hide', action='store_true')
    parser.add_argument('--pause', action='store_true')
    parser.add_argument('--cycle', help='cycle tau', action='store_true')
    args = parser.parse_args()

    data = joblib.load(args.file)
    env = data['env']
    if 'policy' in data:
        policy = data['policy']
    else:
        policy = data['exploration_policy']
    qf = data['qf']
    policy.train(False)
    qf.train(False)

    if args.pause:
        import ipdb
        ipdb.set_trace()

    if args.gpu:
        ptu.set_gpu_mode(True)
        policy.to(ptu.device)

    if args.mtau is None:
        print("Defaulting max tau to 10.")
        max_tau = 10
    else:
        max_tau = args.mtau

    while True:
        paths = []
        for _ in range(args.nrolls):
            goal = env.sample_goal_for_rollout()
            print("goal", goal)
            env.set_goal(goal)
            policy.set_goal(goal)
            policy.set_tau(max_tau)
            path = rollout(
                env,
                policy,
                qf,
                init_tau=max_tau,
                max_path_length=args.H,
                animated=not args.hide,
                cycle_tau=args.cycle,
            )
            paths.append(path)
        env.log_diagnostics(paths)
        for key, value in get_generic_path_information(paths).items():
            logger.record_tabular(key, value)
        logger.dump_tabular()
Example #22
            add_demo_latents=True,
            bc_num_pretrain_steps=100,
        ),
        replay_buffer_kwargs=dict(
            max_size=100000,
            fraction_goals_rollout_goals=1.0,
            fraction_goals_env_goals=0.0,
        ),
        qf_kwargs=dict(hidden_sizes=[400, 300], ),
        policy_kwargs=dict(hidden_sizes=[400, 300], ),
        save_video=True,
        dump_video_kwargs=dict(save_period=1,
                               # imsize=(3, 500, 300),
                               ))

    ptu.set_gpu_mode("gpu")

    representation_size = 128
    output_classes = 20

    model_class = variant.get('model_class', TimestepPredictionModel)
    model = model_class(
        representation_size,
        # decoder_output_activation=decoder_activation,
        output_classes=output_classes,
        **variant['model_kwargs'],
    )
    # model = torch.nn.DataParallel(model)

    imagenets = [True, False]
    reg_types = ["regression_distance", "latent_distance"]
Example #23
def simulate_policy(args):
    data = joblib.load(args.file)
    if 'eval_policy' in data:
        policy = data['eval_policy']
    elif 'policy' in data:
        policy = data['policy']
    elif 'exploration_policy' in data:
        policy = data['exploration_policy']
    else:
        raise Exception("No policy found in loaded dict. Keys: {}".format(
            data.keys()))

    env = data['env']

    env.mode("video_env")
    env.decode_goals = True

    if hasattr(env, 'enable_render'):
        # some environments need to be reconfigured for visualization
        env.enable_render()

    if args.gpu:
        set_gpu_mode(True)
        policy.to(ptu.device)
        if hasattr(env, "vae"):
            env.vae.to(ptu.device)
    else:
        # make sure everything is on the CPU
        set_gpu_mode(False)
        policy.cpu()
        if hasattr(env, "vae"):
            env.vae.cpu()

    if args.pause:
        import ipdb
        ipdb.set_trace()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    ROWS = 3
    COLUMNS = 6
    dirname = osp.dirname(args.file)
    input_file_name = os.path.splitext(os.path.basename(args.file))[0]
    filename = osp.join(dirname, "video_{}.mp4".format(input_file_name))
    rollout_function = create_rollout_function(
        multitask_rollout,
        observation_key='observation',
        desired_goal_key='desired_goal',
    )
    paths = dump_video(
        env,
        policy,
        filename,
        rollout_function,
        ROWS=ROWS,
        COLUMNS=COLUMNS,
        horizon=args.H,
        dirname_to_save_images=dirname,
        subdirname="rollouts_" + input_file_name,
    )

    if hasattr(env, "log_diagnostics"):
        env.log_diagnostics(paths)
    logger.dump_tabular()
Example #24
def simulate_policy(args):
    data = joblib.load(args.file)
    if 'eval_policy' in data:
        policy = data['eval_policy']
    elif 'policy' in data:
        policy = data['policy']
    elif 'exploration_policy' in data:
        policy = data['exploration_policy']
    elif 'naf_policy' in data:
        policy = data['naf_policy']
    elif 'optimizable_qfunction' in data:
        qf = data['optimizable_qfunction']
        policy = qf.implicit_policy
    else:
        raise Exception("No policy found in loaded dict. Keys: {}".format(
            data.keys()))

    env = data['env']
    if isinstance(env, RemoteRolloutEnv):
        env = env._wrapped_env
    print("Policy loaded")

    env.mode("video_env")
    env.decode_goals = True

    image_env = ImageMujocoEnv(
        env._wrapped_env._wrapped_env,
        84,
        init_camera=None,
        camera_name="topview",
        transpose=True,
        normalize=True,
    )
    # env.image_env = image_env

    if args.enable_render:
        # some environments need to be reconfigured for visualization
        env.enable_render()

    if args.gpu:
        set_gpu_mode(True)
        policy.to(ptu.device)
        if hasattr(env, "vae"):
            env.vae.to(ptu.device)
    else:
        # make sure everything is on the CPU
        set_gpu_mode(False)
        policy.cpu()
        if hasattr(env, "vae"):
            env.vae.cpu()

    if args.pause:
        import ipdb
        ipdb.set_trace()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    ROWS = 3
    COLUMNS = 6
    dirname = osp.dirname(args.file)
    input_file_name = os.path.splitext(os.path.basename(args.file))[0]
    filename = osp.join(dirname, "video_{}.mp4".format(input_file_name))
    paths = dump_video(
        env,
        policy,
        filename,
        ROWS=ROWS,
        COLUMNS=COLUMNS,
        horizon=args.H,
        image_env=image_env,
        dirname=dirname,
        subdirname="rollouts_" + input_file_name,
    )

    if hasattr(env, "log_diagnostics"):
        env.log_diagnostics(paths)
    logger.dump_tabular()
Example #25
def run_experiment_here(
        experiment_function,
        variant=None,
        exp_id=0,
        seed=0,
        use_gpu=True,
        # Logger params:
        exp_prefix="default",
        snapshot_mode='last',
        snapshot_gap=1,
        git_infos=None,
        script_name=None,
        logger=default_logger,
        trial_dir_suffix=None,
        randomize_seed=False,
        **setup_logger_kwargs):
    """
    Run an experiment locally without any serialization.
    :param experiment_function: Function. `variant` will be passed in as its
    only argument.
    :param exp_prefix: Experiment prefix for the save file.
    :param variant: Dictionary passed in to `experiment_function`.
    :param exp_id: Experiment ID. Should be unique across all
    experiments. Note that one experiment may correspond to multiple seeds,.
    :param seed: Seed used for this experiment.
    :param use_gpu: Run with GPU. By default False.
    :param script_name: Name of the running script
    :param log_dir: If set, set the log directory to this. Otherwise,
    the directory will be auto-generated based on the exp_prefix.
    :return:
    """
    if variant is None:
        variant = {}
    variant['exp_id'] = str(exp_id)

    if randomize_seed or (seed is None and 'seed' not in variant):
        seed = random.randint(0, 100000)
        variant['seed'] = str(seed)
    reset_execution_environment(logger=logger)

    actual_log_dir = setup_logger(exp_prefix=exp_prefix,
                                  variant=variant,
                                  exp_id=exp_id,
                                  seed=seed,
                                  snapshot_mode=snapshot_mode,
                                  snapshot_gap=snapshot_gap,
                                  git_infos=git_infos,
                                  script_name=script_name,
                                  logger=logger,
                                  trial_dir_suffix=trial_dir_suffix,
                                  **setup_logger_kwargs)

    set_seed(seed)
    from railrl.torch.pytorch_util import set_gpu_mode
    set_gpu_mode(use_gpu)

    run_experiment_here_kwargs = dict(variant=variant,
                                      exp_id=exp_id,
                                      seed=seed,
                                      use_gpu=use_gpu,
                                      exp_prefix=exp_prefix,
                                      snapshot_mode=snapshot_mode,
                                      snapshot_gap=snapshot_gap,
                                      git_infos=git_infos,
                                      script_name=script_name,
                                      **setup_logger_kwargs)
    save_experiment_data(
        dict(run_experiment_here_kwargs=run_experiment_here_kwargs),
        actual_log_dir)
    return experiment_function(variant)
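
A minimal usage sketch for run_experiment_here; `experiment` stands in for any of the experiment(variant) functions shown in the examples above, and the prefix and variant values are placeholders.

if __name__ == '__main__':
    variant = dict(use_gpu=True, gpu_id=0)
    run_experiment_here(
        experiment,
        variant=variant,
        exp_prefix='example-run',
        seed=0,
        use_gpu=True,
    )
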
    parser.add_argument('--num_rollouts',
                        type=int,
                        default=5,
                        help='Number of rollouts per eval')
    parser.add_argument('--discount', type=float, help='Discount Factor')
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--hide', action='store_true')
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    data = joblib.load(args.file)
    env = data['env']
    print("Environment Type = ", type(env))
    qf = data['qf']
    if args.gpu:
        set_gpu_mode(True)
        qf.to(ptu.device)
    qf.train(False)

    if 'discount' in data:
        discount = data['discount']
        if args.discount is not None:
            print("WARNING: you are overriding the saved discount factor.")
            discount = args.discount
    else:
        discount = args.discount

    num_samples = 1000
    policy = SamplePolicyPartialOptimizer(qf, env, num_samples)

    policy.set_tau(discount)
def experiment(variant):
    rdim = variant["rdim"]
    vae_paths = {
        2:
        "/home/ashvin/data/s3doodad/ashvin/vae/point2d-conv-sweep2/run0/id1/params.pkl",
        4:
        "/home/ashvin/data/s3doodad/ashvin/vae/point2d-conv-sweep2/run0/id4/params.pkl"
    }
    vae_path = vae_paths[rdim]
    vae = joblib.load(vae_path)
    print("loaded", vae_path)

    if variant['multitask']:
        env = MultitaskImagePoint2DEnv(**variant['env_kwargs'])
        env = VAEWrappedEnv(env,
                            vae,
                            use_vae_obs=True,
                            use_vae_reward=False,
                            use_vae_goals=False)
        env = MultitaskToFlatEnv(env)
    # else:
    # env = Pusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    training_env=env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    print("use_gpu", variant["use_gpu"], bool(variant["use_gpu"]))
    if variant["use_gpu"]:
        gpu_id = variant["gpu_id"]
        ptu.set_gpu_mode(True)
        ptu.set_device(gpu_id)
        algorithm.to(ptu.device)
        env._wrapped_env.vae.to(ptu.device)
    algorithm.train()
Example #28
def experiment(variant):
    rdim = variant["rdim"]
    vae_paths = {
        2:
        "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id0/params.pkl",
        4:
        "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id1/params.pkl",
        8:
        "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id2/params.pkl",
        16:
        "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id3/params.pkl"
    }
    vae_path = vae_paths[rdim]
    vae = torch.load(vae_path)
    print("loaded", vae_path)

    if variant['multitask']:
        env = FullPusher2DEnv(**variant["env_kwargs"])
        env = ImageMujocoEnv(env,
                             84,
                             camera_name="topview",
                             transpose=True,
                             normalize=True)
        env = VAEWrappedImageGoalEnv(env,
                                     vae,
                                     use_vae_obs=True,
                                     use_vae_reward=True,
                                     use_vae_goals=True,
                                     render_goals=True,
                                     render_rollouts=True,
                                     track_qpos_goal=5)
        env = MultitaskToFlatEnv(env)
    # else:
    # env = Pusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    training_env=env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    print("use_gpu", variant["use_gpu"], bool(variant["use_gpu"]))
    if variant["use_gpu"]:
        gpu_id = variant["gpu_id"]
        ptu.set_gpu_mode(True)
        ptu.set_device(gpu_id)
        algorithm.to(ptu.device)
        env._wrapped_env.vae.to(ptu.device)
    algorithm.train()