Example #1
def onpolicy_inference():
    env = make_vec_envs(
        args.env_name,
        args.seed + 1000,
        1,
        None,
        None,
        device='cuda:0',
        allow_early_resets=False,
        env_kwargs=env_kwargs,
    )
    env_obj = env.venv.venv.envs[0].env.env
    if args.env_name.find('door') <= -1:
        env_obj.unity = None

    render_func = get_render_func(env)
    if evaluation and not render:
        render_func = None

    if env_kwargs['visionnet_input']:
        visionmodel = load_visionmodel(args.load_name, args.visionmodel_path,
                                       VisionModelXYZ())

    actor_critic, ob_rms = torch.load(args.load_name)
    actor_critic = actor_critic.eval()
    if env_kwargs['visionnet_input'] and args.env_name.find('doorenv') > -1:
        actor_critic.visionmodel = visionmodel
        actor_critic.visionnet_input = env_obj.visionnet_input
    actor_critic.to("cuda:0")

    if args.env_name.find('doorenv') > -1:
        actor_critic.nn = env_obj.nn

    recurrent_hidden_states = torch.zeros(
        1, actor_critic.recurrent_hidden_state_size)
    masks = torch.zeros(1, 1)

    knob_noisy = args.knob_noisy

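    # Perturb the last three observation entries (presumably the noisy knob
    # position, given the knob_noisy flag) with zero-mean Gaussian noise
    # whose magnitude ramps up with `epoch` until it saturates.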
    def add_noise(obs, epoch=100):
        saturation = 100.
        sdv = torch.tensor([
            3.440133806003181, 3.192113342496682, 1.727412865751099
        ]) / saturation  # vision standard deviation for the arm
        noise = torch.distributions.Normal(torch.tensor([0.0, 0.0, 0.0]),
                                           sdv).sample().cuda()
        noise *= min(1., epoch / saturation)
        obs[:, -3:] += noise
        return obs

    full_obs = env.reset()
    # print("init obs", full_obs)
    initial_state = full_obs[:, 2:2 + env.action_space.shape[0]]

    if args.env_name.find('doorenv') > -1 and env_obj.visionnet_input:
        obs = actor_critic.obs2inputs(full_obs, 0)
    else:
        if knob_noisy:
            obs = add_noise(full_obs)
        else:
            obs = full_obs

    if render_func is not None:
        render_func('human')

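    # The door-hinge index into sim.data.qpos depends on which robot and
    # end-effector model was loaded, so it is inferred from the MuJoCo XML path.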
    if args.env_name.find('doorenv') > -1:
        if env_obj.xml_path.find("baxter") > -1:
            doorhinge_idx = 20
        elif env_obj.xml_path.find("float") > -1:
            if env_obj.xml_path.find("hook") > -1:
                doorhinge_idx = 6
            elif env_obj.xml_path.find("gripper") > -1:
                doorhinge_idx = 11
        else:
            if env_obj.xml_path.find("mobile") > -1:
                if env_obj.xml_path.find("hook") > -1:
                    doorhinge_idx = 9
                if env_obj.xml_path.find("gripper") > -1:
                    doorhinge_idx = 14
            else:
                if env_obj.xml_path.find("hook") > -1:
                    doorhinge_idx = 7
                if env_obj.xml_path.find("gripper") > -1:
                    doorhinge_idx = 12

    start_time = int(time.mktime(time.localtime()))

    i = 0
    epi_step = 0
    total_time = 0
    epi_counter = 1
    dooropen_counter = 0
    door_opened = False

    test_num = 100

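    # Roll the policy out: episodes are 512 steps long and the evaluation
    # stops after test_num episodes.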
    while True:
        with torch.no_grad():
            value, action, _, recurrent_hidden_states = actor_critic.act(
                obs, recurrent_hidden_states, masks, deterministic=args.det)

        next_action = action

        if i % 511 == 0: current_state = initial_state

        pos_control = False
        if pos_control:
            frame_skip = 1
            if i % (512 / frame_skip - 1) == 0: current_state = initial_state
            next_action = current_state + next_action
            for kk in range(frame_skip):
                full_obs, reward, done, infos = env.step(next_action)
        else:
            full_obs, reward, done, infos = env.step(next_action)

        current_state = full_obs[:, 2:2 + env.action_space.shape[0]]

        if args.env_name.find('doorenv') > -1 and env_obj.visionnet_input:
            obs = actor_critic.obs2inputs(full_obs, 0)
        else:
            if knob_noisy:
                obs = add_noise(full_obs)
            else:
                obs = full_obs

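        # A zero mask marks the end of an episode for the recurrent policy.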
        masks.fill_(0.0 if done else 1.0)

        if render_func is not None:
            render_func('human')

        i += 1
        epi_step += 1

        if args.env_name.find('doorenv') > -1:
            if not door_opened and abs(
                    env_obj.sim.data.qpos[doorhinge_idx]) >= 0.2:
                dooropen_counter += 1
                opening_time = epi_step / 50
                print("door opened! opening time is {}".format(opening_time))
                total_time += opening_time
                door_opened = True

        if args.env_name.find('Fetch') > -1:
            if not door_opened and infos[0]['is_success'] == 1:
                dooropen_counter += 1
                opening_time = epi_step / 50
                print("Reached destenation! Time is {}".format(opening_time))
                total_time += opening_time
                door_opened = True

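        # At the end of each 512-step episode, rebuild the environment and
        # report the running success statistics.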
        if evaluation:
            if i % 512 == 511:
                if env_obj.unity:
                    env_obj.close()
                env = make_vec_envs(
                    args.env_name,
                    args.seed + 1000,
                    1,
                    None,
                    None,
                    device='cuda:0',
                    allow_early_resets=False,
                    env_kwargs=env_kwargs,
                )
                if render:
                    render_func = get_render_func(env)
                env_obj = env.venv.venv.envs[0].env.env
                if args.env_name.find('doorenv') <= -1:
                    env_obj.unity = None
                env.reset()
                print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(epi_counter))
                eval_print(dooropen_counter, epi_counter, start_time,
                           total_time)
                epi_counter += 1
                epi_step = 0
                door_opened = False

        if i >= 512 * test_num:
            eval_print(dooropen_counter, epi_counter - 1, start_time,
                       total_time)
            break
Example #2
def onpolicy_main():
    print("onpolicy main")

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))

    # Make vector env
    envs = make_vec_envs(
        args.env_name,
        args.seed,
        args.num_processes,
        args.gamma,
        args.log_dir,
        device,
        False,
        env_kwargs=env_kwargs,
    )

    # ugly way to access the environment attributes
    if args.env_name.find('doorenv') > -1:
        if args.num_processes > 1:
            visionnet_input = envs.venv.venv.visionnet_input
            nn = envs.venv.venv.nn
            env_name = envs.venv.venv.xml_path
        else:
            visionnet_input = envs.venv.venv.envs[
                0].env.env.env.visionnet_input
            nn = envs.venv.venv.envs[0].env.env.env.nn
            env_name = envs.venv.venv.envs[0].env.env.env.xml_path
        dummy_obs = np.zeros(nn * 2 + 3)
    else:
        dummy_obs = envs.observation_space
        visionnet_input = None
        nn = None

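    # Either resume from a pretrained policy checkpoint or build a fresh
    # actor-critic.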
    if pretrained_policy_load:
        print("loading", pretrained_policy_load)
        actor_critic, ob_rms = torch.load(pretrained_policy_load)
    else:
        actor_critic = Policy(dummy_obs.shape,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})

    if visionnet_input:
        visionmodel = load_visionmodel(env_name, args.visionmodel_path,
                                       VisionModelXYZ())
        actor_critic.visionmodel = visionmodel.eval()
    actor_critic.nn = nn
    actor_critic.to(device)

    #disable normalizer
    vec_norm = get_vec_normalize(envs)
    vec_norm.eval()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              dummy_obs.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    full_obs = envs.reset()
    initial_state = full_obs[:, :envs.action_space.shape[0]]

    if args.env_name.find('doorenv') > -1 and visionnet_input:
        obs = actor_critic.obs2inputs(full_obs, 0)
    else:
        if knob_noisy:
            obs = add_noise(full_obs, 0)
        else:
            obs = full_obs

    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

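    # Main on-policy training loop: collect num_steps transitions per process,
    # then update the agent on the collected rollout.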
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        pos_control = False
        total_switches = 0
        prev_selection = ""
        for step in range(args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
                next_action = action

            if pos_control:
                frame_skip = 2
                if step % (512 / frame_skip - 1) == 0:
                    current_state = initial_state
                next_action = current_state + next_action
                for kk in range(frame_skip):
                    full_obs, reward, done, infos = envs.step(next_action)

                current_state = full_obs[:, :envs.action_space.shape[0]]
            else:
                full_obs, reward, done, infos = envs.step(next_action)

            # convert img to obs if door_env and using visionnet
            if args.env_name.find('doorenv') > -1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                if knob_noisy:
                    obs = add_noise(full_obs, j)
                else:
                    obs = full_obs

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

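        # Bootstrap from the value of the last observation and compute the
        # returns (optionally with GAE) before the policy update.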
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        writer.add_scalar("Value loss", value_loss, j)
        writer.add_scalar("action loss", action_loss, j)
        writer.add_scalar("dist entropy loss", dist_entropy, j)
        writer.add_scalar("Episode rewards", np.mean(episode_rewards), j)

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ],
                       os.path.join(
                           save_path, args.env_name +
                           "_{}.{}.pt".format(args.save_name, j)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

        DR = True  #Domain Randomization
        ################## for multiprocess world change ######################
        if DR:
            print("changing world")

            envs.close_extras()
            envs.close()
            del envs

            envs = make_vec_envs(
                args.env_name,
                args.seed,
                args.num_processes,
                args.gamma,
                args.log_dir,
                device,
                False,
                env_kwargs=env_kwargs,
            )

            full_obs = envs.reset()
            if args.env_name.find('doorenv') > -1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                obs = full_obs
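Example #3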
def main(raw_args=None):

    # If this is being called as a function from another python script
    if raw_args is not None:
        args = get_args(raw_args)
    else:
        args = main_args

    if args.algo != 'ipo':
        raise NotImplementedError

    # Total number of envs (both domains)
    args.num_processes = args.num_envs1 + args.num_envs2

    knob_noisy = args.knob_noisy
    pretrained_policy_load = args.pretrained_policy_load

    args.world_path_domain1 = os.path.expanduser(args.world_path_domain1)
    args.world_path_domain2 = os.path.expanduser(args.world_path_domain2)

    # Env kwargs for domain 1
    env_kwargs1 = dict(port=args.port,
                       visionnet_input=args.visionnet_input,
                       unity=args.unity,
                       world_path=args.world_path_domain1)

    # Env kwargs for domain 2
    env_kwargs2 = dict(port=args.port,
                       visionnet_input=args.visionnet_input,
                       unity=args.unity,
                       world_path=args.world_path_domain2)


    print("Training with IPO.")

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))

    # Make a vector env for each domain (num_envs1 and num_envs2 envs respectively)
    envs1 = make_vec_envs(args.env_name,
                          args.seed,
                          args.num_envs1,
                          args.gamma,
                          args.log_dir,
                          device,
                          False,
                          env_kwargs=env_kwargs1)

    envs2 = make_vec_envs(args.env_name,
                          args.seed,
                          args.num_envs2,
                          args.gamma,
                          args.log_dir,
                          device,
                          False,
                          env_kwargs=env_kwargs2)


    # ugly way to access the environment attributes
    if args.env_name.find('doorenv') > -1:
        visionnet_input = envs1.venv.venv.visionnet_input
        nn = envs1.venv.venv.nn
        env_name = envs1.venv.venv.xml_path

        dummy_obs = np.zeros(nn * 2 + 3)
    else:
        dummy_obs = envs1.observation_space
        visionnet_input = None
        nn = None

    if pretrained_policy_load:
        print("loading", pretrained_policy_load)
        actor_critic, ob_rms = torch.load(pretrained_policy_load)
    else:
        actor_critic = Policy_av(
            dummy_obs.shape,
            envs1.action_space,
            base_kwargs={'recurrent': args.recurrent_policy})

        # actor_critic = Policy(
        #     dummy_obs.shape,
        #     envs1.action_space,
        #     base_kwargs={'recurrent': args.recurrent_policy})
    
    if visionnet_input:
        # Vision-network input is not implemented for this IPO entry point.
        raise NotImplementedError
        visionmodel = load_visionmodel(env_name, args.visionmodel_path,
                                       VisionModelXYZ())
        actor_critic.visionmodel = visionmodel.eval()

    actor_critic.nn = nn
    actor_critic.to(device)

    #disable normalizer
    vec_norm1 = get_vec_normalize(envs1)
    vec_norm1.eval()
    vec_norm2 = get_vec_normalize(envs2)
    vec_norm2.eval()
    
    # Create two agents (one for each domain)
    params1 = [{'params': actor_critic.base.actor1.parameters()},
               {'params': actor_critic.base.critic1.parameters()},
               {'params': actor_critic.base.critic_linear1.parameters()},
               {'params': actor_critic.base.fc_mean1.parameters()},
               {'params': actor_critic.base.logstd1.parameters()}]

    params2 = [{'params': actor_critic.base.actor2.parameters()},
               {'params': actor_critic.base.critic2.parameters()},
               {'params': actor_critic.base.critic_linear2.parameters()},
               {'params': actor_critic.base.fc_mean2.parameters()},
               {'params': actor_critic.base.logstd2.parameters()}]

    # params1 = None
    # params2 = None

    agent1 = algo.PPO(
        actor_critic,
        args.clip_param,
        args.ppo_epoch,
        args.num_mini_batch,
        args.value_loss_coef,
        args.entropy_coef,
        lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm,
        optim_params=params1)

    agent2 = algo.PPO(
        actor_critic,
        args.clip_param,
        args.ppo_epoch,
        args.num_mini_batch,
        args.value_loss_coef,
        args.entropy_coef,
        lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm,
        optim_params=params2)


    # Rollout storage for each domain
    rollouts1 = RolloutStorage(args.num_steps, args.num_envs1,
                              dummy_obs.shape, envs1.action_space,
                              actor_critic.recurrent_hidden_state_size)

    rollouts2 = RolloutStorage(args.num_steps, args.num_envs2,
                              dummy_obs.shape, envs2.action_space,
                              actor_critic.recurrent_hidden_state_size)


    full_obs1 = envs1.reset()
    initial_state1 = full_obs1[:, :envs1.action_space.shape[0]]

    full_obs2 = envs2.reset()
    initial_state2 = full_obs2[:, :envs2.action_space.shape[0]]

    if args.env_name.find('doorenv') > -1 and visionnet_input:
        obs1 = actor_critic.obs2inputs(full_obs1, 0)
        obs2 = actor_critic.obs2inputs(full_obs2, 0)
    else:
        if knob_noisy:
            obs1 = add_noise(full_obs1, 0)
            obs2 = add_noise(full_obs2, 0)
        else:
            obs1 = full_obs1
            obs2 = full_obs2

    rollouts1.obs[0].copy_(obs1)
    rollouts1.to(device)

    rollouts2.obs[0].copy_(obs2)
    rollouts2.to(device)

    episode_rewards1 = deque(maxlen=10)
    episode_rewards2 = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    num_updates = int(num_updates / 2)  # halved because each update runs both domains

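    # Track the best mean training reward so the best checkpoint can be saved.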
    best_training_reward = -np.inf

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent1.optimizer, j, num_updates, args.lr)
            utils.update_linear_schedule(
                agent2.optimizer, j, num_updates, args.lr)

        ################## Do rollouts and updates for domain 1 ##################

        pos_control = False
        total_switches = 0
        prev_selection = ""
        for step in range(args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts1.obs[step], rollouts1.recurrent_hidden_states[step],
                    rollouts1.masks[step])
                next_action = action 

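            # Step the domain-1 envs; on failure drop into an interactive
            # shell for debugging (ipy is presumably an IPython import).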
            try:
                # print(next_action)
                full_obs, reward, done, infos = envs1.step(next_action)
            except:
                ipy.embed()

            if knob_noisy:
                obs = add_noise(full_obs, j)
            else:
                obs = full_obs

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards1.append(info['episode']['r'])

            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts1.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)
            
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts1.obs[-1], rollouts1.recurrent_hidden_states[-1],
                rollouts1.masks[-1]).detach()

        rollouts1.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent1.update(rollouts1)
        rollouts1.after_update()
        value_loss1 = value_loss
        action_loss1 = action_loss
        dist_entropy1 = dist_entropy

        ################## Do rollouts and updates for domain 2 ##################

        pos_control = False
        total_switches = 0
        prev_selection = ""
        for step in range(args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts2.obs[step], rollouts2.recurrent_hidden_states[step],
                    rollouts2.masks[step])
                next_action = action 

            try:
                # print(next_action)
                full_obs, reward, done, infos = envs2.step(next_action)
            except:
                ipy.embed()

            if knob_noisy:
                obs = add_noise(full_obs, j)
            else:
                obs = full_obs

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards2.append(info['episode']['r'])

            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts2.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)
            
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts2.obs[-1], rollouts2.recurrent_hidden_states[-1],
                rollouts2.masks[-1]).detach()

        rollouts2.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent2.update(rollouts2)
        rollouts2.after_update()
        value_loss2 = value_loss
        action_loss2 = action_loss
        dist_entropy2 = dist_entropy

        ###################### Logs and storage ########################

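        # Average the two domains' losses and episode rewards for logging.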
        value_loss = (value_loss1 + value_loss2)/2
        action_loss = (action_loss1 + action_loss2)/2
        dist_entropy = (dist_entropy1 + dist_entropy2)/2
        episode_rewards = []
        for ii in range(len(episode_rewards1)):
            episode_rewards.append((episode_rewards1[ii]+episode_rewards2[ii])/2)
        # episode_rewards = episode_rewards1

        writer.add_scalar("Value loss", value_loss, j)
        writer.add_scalar("action loss", action_loss, j)
        writer.add_scalar("dist entropy loss", dist_entropy, j)
        writer.add_scalar("Episode rewards", np.mean(episode_rewards), j)

        if np.mean(episode_rewards) > best_training_reward:
            best_training_reward = np.mean(episode_rewards)
            current_is_best = True
        else:
            current_is_best = False

        # save for every interval-th episode or for the last epoch or for best so far
        if (j % args.save_interval == 0
                or j == num_updates - 1 or current_is_best) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            torch.save(
                [actor_critic, None],
                os.path.join(save_path,
                             args.env_name + "_{}.{}.pt".format(args.save_name, j)))

            if current_is_best:
                torch.save(
                    [actor_critic, None],
                    os.path.join(save_path,
                                 args.env_name + "_{}.best.pt".format(args.save_name)))
            
            # torch.save([
            #     actor_critic,
            #     getattr(utils.get_vec_normalize(envs1), 'ob_rms', None)
            # ], os.path.join(save_path, args.env_name + "_{}.{}.pt".format(args.save_name,j)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            raise NotImplementedError
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

        DR = False  # Domain Randomization (disabled; not implemented for the two-domain setup)
        ################## for multiprocess world change ######################
        if DR:
            raise NotImplementedError

            print("changing world")

            envs.close_extras()
            envs.close()
            del envs

            envs = make_vec_envs_domains(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma, 
                         args.log_dir, 
                         device, 
                         False, 
                         env_kwargs1=env_kwargs1,
                         env_kwargs2=env_kwargs2)

            full_obs = envs.reset()
            if args.env_name.find('doorenv')>-1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                obs = full_obs
Example #4
def offpolicy_main(variant):
    print("offpolicy main")

    if args.algo == 'sac':
        algo = "SAC"
    elif args.algo == 'td3':
        algo = "TD3"

    setup_logger('{0}_{1}'.format(args.env_name, args.save_name),
                 variant=variant)
    ptu.set_gpu_mode(True)  # optionally set the GPU (default=True)

    expl_env, eval_env, env_obj = prepare_env(args.env_name,
                                              args.visionmodel_path,
                                              **env_kwargs)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    expl_policy, eval_policy, trainer = prepare_trainer(
        algo, expl_env, obs_dim, action_dim, args.pretrained_policy_load,
        variant)

    if args.env_name.find('doorenv') > -1:
        expl_policy.knob_noisy = eval_policy.knob_noisy = args.knob_noisy
        expl_policy.nn = eval_policy.nn = env_obj.nn
        expl_policy.visionnet_input = eval_policy.visionnet_input = env_obj.visionnet_input

    if args.visionnet_input:
        visionmodel = load_visionmodel(expl_env._wrapped_env.xml_path,
                                       args.visionmodel_path, VisionModelXYZ())
        visionmodel.to(ptu.device)
        expl_policy.visionmodel = visionmodel.eval()
    else:
        expl_policy.visionmodel = None

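    # Path collectors gather trajectories in the evaluation and exploration
    # environments.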
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        doorenv=args.env_name.find('doorenv') > -1,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
        doorenv=args.env_name.find('doorenv') > -1,
    )

    if not args.replaybuffer_load:
        replay_buffer = EnvReplayBuffer(
            variant['replay_buffer_size'],
            expl_env,
        )
    else:
        replay_buffer = pickle.load(open(args.replaybuffer_load, "rb"))
        replay_buffer._env_info_keys = replay_buffer.env_info_sizes.keys()
        print("Loaded the replay buffer that has length of {}".format(
            replay_buffer.get_diagnostics()))

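    # Assemble the off-policy batch RL algorithm (SAC or TD3 trainer) from its
    # components.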
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])

    algorithm.save_interval = args.save_interval
    algorithm.save_dir = args.save_dir
    algorithm.algo = args.algo
    algorithm.env_name = args.env_name
    algorithm.save_name = args.save_name
    algorithm.env_kwargs = env_kwargs
    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))
    algorithm.writer = writer

    algorithm.to(ptu.device)
    algorithm.train()
Example #5
def onpolicy_inference(seed,
                       env_name,
                       det,
                       load_name,
                       evaluation,
                       render,
                       knob_noisy,
                       visionnet_input,
                       env_kwargs,
                       actor_critic=None,
                       verbose=True,
                       pos_control=True,
                       step_skip=4):

    env = make_vec_envs(
        env_name,
        seed + 1000,
        1,
        None,
        None,
        device='cuda:0',
        allow_early_resets=False,
        env_kwargs=env_kwargs,
    )

    env_obj = env.venv.venv.envs[0].env.env
    if env_name.find('door') <= -1:
        env_obj.unity = None

    render_func = get_render_func(env)
    if evaluation and not render:
        render_func = None

    if env_kwargs['visionnet_input']:
        visionmodel = load_visionmodel(load_name, args.visionmodel_path,
                                       VisionModelXYZ())

    if not actor_critic:
        actor_critic, ob_rms = torch.load(load_name)
    actor_critic = actor_critic.eval()
    if env_kwargs['visionnet_input'] and env_name.find('doorenv') > -1:
        actor_critic.visionmodel = visionmodel
        actor_critic.visionnet_input = env_obj.visionnet_input
    actor_critic.to("cuda:0")

    if env_name.find('doorenv') > -1:
        actor_critic.nn = env_obj.nn

    recurrent_hidden_states = torch.zeros(
        1, actor_critic.recurrent_hidden_state_size)
    masks = torch.zeros(1, 1)

    full_obs = env.reset()
    initial_state = full_obs[:, :env.action_space.shape[0]]

    if env_name.find('doorenv') > -1 and env_obj.visionnet_input:
        obs = actor_critic.obs2inputs(full_obs, 0)
    else:
        if knob_noisy:
            obs = add_noise(full_obs)
        else:
            obs = full_obs

    if render_func is not None:
        render_func('human')

    # if env_name.find('doorenv')>-1:
    #     if env_obj.xml_path.find("baxter")>-1:
    #         doorhinge_idx = 20
    #     elif env_obj.xml_path.find("float")>-1:
    #         if env_obj.xml_path.find("hook")>-1:
    #             doorhinge_idx = 6
    #         elif env_obj.xml_path.find("gripper")>-1:
    #             doorhinge_idx = 11
    #     else:
    #         if env_obj.xml_path.find("mobile")>-1:
    #             if env_obj.xml_path.find("hook")>-1:
    #                 doorhinge_idx = 9
    #             if env_obj.xml_path.find("gripper")>-1:
    #                 doorhinge_idx = 14
    #         else:
    #             if env_obj.xml_path.find("hook")>-1:
    #                 doorhinge_idx = 7
    #             if env_obj.xml_path.find("gripper")>-1:
    #                 doorhinge_idx = 12

    start_time = int(time.mktime(time.localtime()))

    i = 0
    epi_step = 0
    total_time = 0
    epi_counter = 1
    dooropen_counter = 0
    door_opened = False
    test_num = 100

    while True:
        with torch.no_grad():
            value, action, _, recurrent_hidden_states = actor_critic.act(
                obs, recurrent_hidden_states, masks, deterministic=det)
        next_action = action

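        # In position-control mode the policy output is treated as a delta on
        # the current joint state, and each command is held for step_skip frames.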
        if pos_control:
            # print("enjoy step_skip",step_skip)
            if i % (512 / step_skip - 1) == 0: current_state = initial_state
            next_action = current_state + next_action
            for kk in range(step_skip):
                full_obs, reward, done, infos = env.step(next_action)

            current_state = full_obs[:, :env.action_space.shape[0]]
        else:
            for kk in range(step_skip):
                full_obs, reward, done, infos = env.step(next_action)

        if env_name.find('doorenv') > -1 and env_obj.visionnet_input:
            obs = actor_critic.obs2inputs(full_obs, 0)
        else:
            if knob_noisy:
                obs = add_noise(full_obs)
            else:
                obs = full_obs

        masks.fill_(0.0 if done else 1.0)

        if render_func is not None:
            render_func('human')

        i += 1
        epi_step += 1

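        # Count the door as opened once its angle exceeds 0.2 and record the
        # simulated time it took.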
        if env_name.find('doorenv') > -1:
            # if not door_opened and abs(env_obj.sim.data.qpos[doorhinge_idx])>=0.2:
            if not door_opened and abs(env_obj.get_doorangle()) >= 0.2:
                dooropen_counter += 1
                opening_time = epi_step / (1.0 / mujoco_timestep) * step_skip
                if verbose:
                    print(
                        "door opened! opening time is {}".format(opening_time))
                total_time += opening_time
                door_opened = True

        if env_name.find('Fetch') > -1:
            if not door_opened and infos[0]['is_success'] == 1:
                dooropen_counter += 1
                opening_time = epi_step / (1.0 / mujoco_timestep) * step_skip
                if verbose:
                    print(
                        "Reached destenation! Time is {}".format(opening_time))
                total_time += opening_time
                door_opened = True

        if evaluation:
            if i % (512 / step_skip - 1) == 0:
                if env_obj.unity:
                    env_obj.close()
                env = make_vec_envs(
                    env_name,
                    seed + 1000,
                    1,
                    None,
                    None,
                    device='cuda:0',
                    allow_early_resets=False,
                    env_kwargs=env_kwargs,
                )

                if render:
                    render_func = get_render_func(env)
                env_obj = env.venv.venv.envs[0].env.env
                if env_name.find('doorenv') <= -1:
                    env_obj.unity = None
                env.reset()
                if verbose:
                    print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(
                        epi_counter))
                    eval_print(dooropen_counter, epi_counter, start_time,
                               total_time)
                epi_counter += 1
                epi_step = 0
                door_opened = False

        if i >= 512 / step_skip * test_num:
            if verbose:
                print("dooropening counter:", dooropen_counter,
                      " epi counter:", epi_counter)
                eval_print(dooropen_counter, epi_counter - 1, start_time,
                           total_time)
            break

    opening_rate, opening_timeavg = eval_print(dooropen_counter,
                                               epi_counter - 1, start_time,
                                               total_time)
    return opening_rate, opening_timeavg