Example #1
def train(train_states,
          run_dir,
          num_env_steps,
          eval_env_steps,
          writer,
          writer_name,
          args,
          init_model=None):
    envs = make_vec_envs(train_states, args.seed, args.num_processes,
                         args.gamma, 'cpu', 'train', args)

    if init_model:
        actor_critic, env_step, model_name = init_model
        obs_space = actor_critic.obs_space
        obs_process = actor_critic.obs_process
        obs_module = actor_critic.obs_module
        print(f"  [load] Loaded model {model_name} at step {env_step}")
    else:
        obs_space = envs.observation_space
        actor_critic = Policy(obs_space,
                              args.obs_process,
                              args.obs_module,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})
        env_step = 0
    actor_critic.to(args.device)
    #print(actor_critic)

    run_name = run_dir.replace('/', '_')
    vid_save_dir = f"{run_dir}/videos/"
    try:
        os.makedirs(vid_save_dir)
    except OSError:
        pass
    ckpt_save_dir = f"{run_dir}/ckpts/"
    try:
        os.makedirs(ckpt_save_dir)
    except OSError:
        pass

    if args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         args.device,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm,
                               acktr=False)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm,
                               acktr=True)
    else:
        raise NotImplementedError

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    actor_critic.eval()
    """
    try:
        writer.add_graph(actor_critic, obs)
    except ValueError:
        print("Unable to write model graph to tensorboard.")
    """
    actor_critic.train()

    for k in rollouts.obs.keys():
        rollouts.obs[k][0].copy_(obs[k][0])

    episode_rewards = deque(maxlen=10)

    num_updates = num_env_steps // args.num_steps // args.num_processes
    batch_size = args.num_steps * args.num_processes
    start = time.time()
    while env_step < num_env_steps:
        s = time.time()
        if args.use_linear_lr_decay:
            # decrease learning rate linearly; derive the update index from
            # env_step, since this loop counts env steps rather than updates
            update_idx = env_step // batch_size
            utils.update_linear_schedule(
                agent.optimizer, update_idx, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, _ = actor_critic.act(
                    {
                        k: rollouts.obs[k][step].float().to(args.device)
                        for k in rollouts.obs.keys()
                    }, rollouts.recurrent_hidden_states[step].to(args.device),
                    rollouts.masks[step].to(args.device))
                value = value.cpu()
                action = action.cpu()
                action_log_prob = action_log_prob.cpu()
                recurrent_hidden_states = recurrent_hidden_states.cpu()
            # Observe reward and next obs
            obs, reward, dones, infos = envs.step(action)

            for done, info in zip(dones, infos):
                env_state = info['env_state'][1]
                if done:
                    writer.add_scalar(f'train_episode_x/{env_state}',
                                      info['max_x'], env_step)
                    writer.add_scalar(f'train_episode_%/{env_state}',
                                      info['max_x'] / info['lvl_max_x'] * 100,
                                      env_step)
                    writer.add_scalar(f'train_episode_r/{env_state}',
                                      info['sum_r'], env_step)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done else [1.0]
                                       for done in dones])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)
        with torch.no_grad():
            next_value = actor_critic.get_value(
                {
                    k: rollouts.obs[k][-1].float().to(args.device)
                    for k in rollouts.obs.keys()
                }, rollouts.recurrent_hidden_states[-1].to(args.device),
                rollouts.masks[-1].to(args.device)).detach().cpu()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        env_step += batch_size
        fps = batch_size / (time.time() - s)
        #res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
        #writer.add_scalar(f'gpu_usage/{writer_name}', res.gpu, env_step)
        #writer.add_scalar(f'gpu_mem/{writer_name}', res.memory, env_step)
        total_norm = 0
        for p in list(
                filter(lambda p: p.grad is not None,
                       actor_critic.parameters())):
            param_norm = p.grad.data.norm(2)
            total_norm += param_norm.item()**2
        total_norm = total_norm**(1. / 2)
        obs_norm = {}
        for obs_name in args.obs_keys:
            t_norm = 0
            if obs_name == 'video':
                md = actor_critic.base.video_module
            elif obs_name == 'audio':
                md = actor_critic.base.audio_module
            else:
                raise NotImplementedError
            for p in list(filter(lambda p: p.grad is not None,
                                 md.parameters())):
                param_norm = p.grad.data.norm(2)
                t_norm += param_norm.item()**2
            obs_norm[obs_name] = t_norm**(1. / 2)

        prev_env_step = max(0, env_step + 1 - batch_size)
        # write training metrics for this batch, usually takes 0.003s
        if (env_step + 1
            ) // args.write_interval > prev_env_step // args.write_interval:
            writer.add_scalar(f'grad_norm/{writer_name}', total_norm, env_step)
            writer.add_scalar(f'fps/{writer_name}', fps, env_step)
            writer.add_scalar(f'value_loss/{writer_name}',
                              value_loss / batch_size, env_step)
            writer.add_scalar(f'action_loss/{writer_name}',
                              action_loss / batch_size, env_step)
            writer.add_scalar(f'dist_entropy/{writer_name}',
                              dist_entropy / batch_size, env_step)
            writer.add_scalar(f'cpu_usage/{writer_name}', psutil.cpu_percent(),
                              env_step)
            writer.add_scalar(f'cpu_mem/{writer_name}',
                              psutil.virtual_memory()._asdict()['percent'],
                              env_step)
            for obs_name in args.obs_keys:
                writer.add_scalar(f'grad_norm_{obs_name}/{writer_name}',
                                  obs_norm[obs_name], env_step)

        # print log to console
        if (env_step +
                1) // args.log_interval > prev_env_step // args.log_interval:
            end = time.time()
            print("  [log] Env step {} of {}: {:.1f}s, {:.1f}fps".format(
                env_step + 1, num_env_steps, end - start, fps))
            if len(episode_rewards) > 0:
                print(
                    "    Last {} episodes: mean/med reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                    .format(len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards)))
            print(
                "    dist_entropy {:.5f}, value_loss {:.6f}, action_loss {:.6f}, grad_norm {:.6f}"
                .format(dist_entropy, value_loss, action_loss, total_norm))
            start = time.time()

        # save model to ckpt
        if ((env_step + 1) // args.save_interval >
                prev_env_step // args.save_interval):
            torch.save([
                actor_critic,
                env_step,
                run_name,
            ], os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt"))
            print(f"  [save] Saved model at step {env_step+1}.")

        # save model to ckpt and run evaluation if eval_interval and not final iteration in training loop
        if ((env_step + 1) // args.eval_interval >
                prev_env_step // args.eval_interval
            ) and env_step < num_env_steps and eval_env_steps > 0:
            torch.save([
                actor_critic,
                env_step,
                run_name,
            ], os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt"))
            print(f"  [save] Saved model at step {env_step+1}.")

            envs.close()
            del envs  # close does not actually get rid of envs, need to del
            actor_critic.eval()
            eval_score, e_dict = evaluate(train_states, actor_critic,
                                          eval_env_steps, env_step, writer,
                                          vid_save_dir, args.vid_tb_steps,
                                          args.vid_file_steps,
                                          args.obs_viz_layer, args)
            print(f"  [eval] Evaluation score: {eval_score}")
            writer.add_scalar('eval_score', eval_score, env_step)

            actor_critic.train()
            envs = make_vec_envs(train_states, args.seed, args.num_processes,
                                 args.gamma, 'cpu', 'train', args)
            obs = envs.reset()
            # TODO: does this work? do we need to increment env step or something? why do hidden_states insert at 0?
            for k in rollouts.obs.keys():
                rollouts.obs[k][0].copy_(obs[k][0])

    # final model save
    final_model_path = os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt")
    torch.save([
        actor_critic,
        env_step,
        run_name,
    ], final_model_path)
    print(
        f"  [save] Final model saved at step {env_step+1} to {final_model_path}"
    )

    # final model eval
    envs.close()
    del envs
    eval_score = None
    eval_dict = None
    if eval_env_steps > 0:
        eval_score, eval_dict = evaluate(train_states, actor_critic,
                                         eval_env_steps, env_step, writer,
                                         vid_save_dir, args.vid_tb_steps,
                                         args.vid_file_steps,
                                         args.obs_viz_layer, args)
        print(f"  [eval] Final model evaluation score: {eval_score:.3f}")

    return (actor_critic, env_step, run_name), eval_score, eval_dict
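The write/log/save/eval blocks above all gate on `(env_step + 1) // interval > prev_env_step // interval` rather than a plain modulo test, because `env_step` advances by a whole batch per update and can jump straight over an exact multiple of the interval. Below is a minimal sketch of that check pulled out into a helper; the helper name and the numbers are illustrative, not part of the original code.

def crossed_interval(prev_step, step, interval):
    """True if some multiple of `interval` lies in (prev_step, step + 1]."""
    return (step + 1) // interval > prev_step // interval


# Illustrative values only: env_step advances by batch_size per update, so a
# plain `env_step % save_interval == 0` test could skip the boundary entirely.
env_step, batch_size, save_interval = 0, 2048, 10000
while env_step < 50000:
    env_step += batch_size
    prev_env_step = max(0, env_step + 1 - batch_size)
    if crossed_interval(prev_env_step, env_step, save_interval):
        print(f"interval boundary crossed at step {env_step}")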
Example #2
def main():

    realEval = True  #False

    gettrace = getattr(sys, 'gettrace', None)

    parser = argparse.ArgumentParser(description='RL')
    parser.add_argument('--action-type',
                        type=int,
                        default=-1,
                        help='action type to play (default: -1)')

    parser.add_argument('--tasks-difficulty-from',
                        type=int,
                        default=0,
                        help='tasks_difficulty_from')

    parser.add_argument('--tasks-difficulty-to',
                        type=int,
                        default=100000,
                        help='tasks-difficulty-to')

    parser.add_argument('--verboseLevel',
                        type=int,
                        default=5,
                        help='verboseLevel')

    parser.add_argument('--filesNamesSuffix',
                        default="",
                        help='filesNamesSuffix')

    parser.add_argument('--nobest-exit',
                        type=int,
                        default=10000,
                        help='nobest_exit')

    args = get_args(parser)

    args.algo = 'ppo'
    args.env_name = 'QuadruppedWalk-v1'  #'RoboschoolAnt-v1' #'QuadruppedWalk-v1' #'RoboschoolAnt-v1' #'QuadruppedWalk-v1'
    args.use_gae = True
    args.num_steps = 2048
    #args.num_processes = 4
    args.num_processes = 4
    if gettrace and gettrace():
        args.num_processes = 1
    args.lr = 0.0001
    args.entropy_coef = 0.0
    args.value_loss_coef = 0.5
    args.ppo_epoch = 4
    args.num_mini_batch = 256
    args.gamma = 0.99
    args.gae_lambda = 0.95
    args.clip_param = 0.2
    args.use_linear_lr_decay = True  #True #True #True #True
    args.use_proper_time_limits = True
    args.save_dir = "./trained_models/" + args.env_name + "/"
    args.load_dir = "./trained_models/" + args.env_name + "/"
    args.log_dir = "./logs/robot"
    if gettrace and gettrace():
        args.save_dir = "./trained_models/" + args.env_name + "debug/"
        args.load_dir = "./trained_models/" + args.env_name + "debug/"
        args.log_dir = "./logs/robot_d"
    args.log_interval = 30
    args.hidden_size = 64
    args.last_hidden_size = args.hidden_size
    args.recurrent_policy = False  #True
    args.save_interval = 20
    #args.seed = 1
    reward_shaping = 0.01
    allowMutate = False

    if args.seed == -1:
        args.seed = time.clock_gettime_ns(time.CLOCK_REALTIME)

    quadruppedEnv.settings.tasks_difficulty_from = args.tasks_difficulty_from
    quadruppedEnv.settings.tasks_difficulty_to = args.tasks_difficulty_to

    # 0 is a walk
    # 1 is a balance
    # 2 multitasks
    # 3 multitask experiments
    trainType = 14
    filesNamesSuffix = ""
    if args.action_type >= 0:
        trainType = args.action_type

    makeEnvFunction = makeEnv.make_env_with_best_settings
    if trainType == 1:
        filesNamesSuffix = "balance_"
        makeEnvFunction = makeEnv.make_env_for_balance

    if trainType == 2:
        filesNamesSuffix = "analytical_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical

    if trainType == 3:
        filesNamesSuffix = "analytical2_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical2

    if trainType == 4:
        filesNamesSuffix = "frontback_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_front_back

    if trainType == 5:
        filesNamesSuffix = "leftright_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_left_right

    if trainType == 6:
        filesNamesSuffix = "all_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all

    if trainType == 7:
        filesNamesSuffix = "rotate_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_rotate

    if trainType == 8:
        filesNamesSuffix = "compound_"
        makeEnvFunction = make_env_multinetwork

    if trainType == 9:
        import pickle
        realEval = False
        allowMutate = False
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 5000000
        filesNamesSuffix = "test_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_test

    if trainType == 10:
        import pickle
        realEval = False
        allowMutate = False
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 5000000
        filesNamesSuffix = "zoo_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_test_zoo

    if trainType == 11:
        args.hidden_size = 128  #64 #128
        args.last_hidden_size = args.hidden_size

        import pickle
        if gettrace and gettrace():
            args.num_processes = 1
        else:
            args.num_processes = 8
        realEval = False
        allowMutate = False
        args.lr = 0.00001
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 10000000
        filesNamesSuffix = "zigote2_updown_"
        print("Samples preload")
        global samplesEnvData
        samplesEnvData = pickle.load(
            open("./QuadruppedWalk-v1_MoveNoPhys.samples", "rb"))
        # samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1.samples", "rb" ) )
        makeEnvFunction = makeSamplesEnv

    if trainType == 12:
        import pickle
        args.lr = 0.00001
        args.hidden_size = 64
        args.last_hidden_size = args.hidden_size
        filesNamesSuffix = "zigote2_front_back_"
        args.clip_param = 0.9
        args.value_loss_coef = 0.9
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_train
        #makeEnvFunction = makeEnv.make_env_with_best_settings_for_record
        #makeEnv.samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1_MoveNoPhys.samples", "rb" ) )

    if trainType == 13:
        filesNamesSuffix = "all_bytasks_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all

    if trainType == 14:
        #args.lr = 0.00001
        #args.num_env_steps = 000000
        #args.clip_param = 0.5
        #args.value_loss_coef  =0.8
        #random.seed(time.clock_gettime_ns(time.CLOCK_REALTIME))
        #args.num_steps = random.choice([256,512,1024,2048,4096])
        #args.num_mini_batch = random.choice([32,64,256,512])
        #args.ppo_epoch  = random.choice([2,4,8,10])
        #args.clip_param = random.choice([0.2,0.4,0.6,0.8])
        #args.value_loss_coef  =random.choice([0.4,0.5,0.6,0.8])
        #args.lr = random.choice([0.00001,0.0001,0.00005,0.0005])

        args.num_steps = 2048
        args.num_mini_batch = 64
        args.ppo_epoch = 8
        args.lr = 0.0001

        args.hidden_size = 64
        args.last_hidden_size = args.hidden_size
        #
        filesNamesSuffix = args.filesNamesSuffix
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all
        '''
        num_steps: 1024 num_mini_batch 64 ppo_epoch 2
        clip_param: 0.2 value_loss_coef 0.6 lr 0.0001
        '''

    if trainType == 15:
        args.num_env_steps = 5000000
        filesNamesSuffix = "zigote_updown_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_train_analytic

    if trainType == 16:
        args.lr = 0.00001
        filesNamesSuffix = "compound_tasks_"
        makeEnvFunction = make_env_multinetwork

    reward_shaper = DefaultRewardsShaper(scale_value=reward_shaping)

    print("ActionType ", trainType, " ", filesNamesSuffix, "seed", args.seed,
          "num env steps:", args.num_env_steps, " tasks_dif",
          args.tasks_difficulty_from, args.tasks_difficulty_to)

    print("Num processes:", args.num_processes)

    print("num_steps:", args.num_steps, "num_mini_batch", args.num_mini_batch,
          "ppo_epoch", args.ppo_epoch)
    print("clip_param:", args.clip_param, "value_loss_coef",
          args.value_loss_coef, "lr", args.lr)

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    args.log_dir = "/tmp/tensorboard/"
    #TesnorboardX
    writer = SummaryWriter(log_dir=args.log_dir + 'runs/{}_PPO_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
        "ppo"))

    writer.add_scalar('options/num_steps', args.num_steps, 0)
    writer.add_scalar('options/num_mini_batch', args.num_mini_batch, 0)
    writer.add_scalar('options/ppo_epoch', args.ppo_epoch, 0)
    writer.add_scalar('options/clip_param', args.clip_param, 0)
    writer.add_scalar('options/value_loss_coef', args.value_loss_coef, 0)
    writer.add_scalar('options/lr', args.lr, 0)

    device = torch.device("cuda:0" if args.cuda else "cpu")
    torch.set_num_threads(1)

    load_dir = os.path.join(args.load_dir, args.algo)

    multiNetworkName = ["frontback_", "all_", "leftright_", "rotate_"]
    if trainType == 8:
        for net in multiNetworkName:
            bestFilename = os.path.join(
                load_dir, "{}_{}{}_best.pt".format(args.env_name, net,
                                                   args.hidden_size))
            ac, _ = torch.load(bestFilename)
            policies.append(PPOPlayer(ac, device))
            print("Policy multi loaded: ", bestFilename)

    multiNetworkName2 = [
        "all_bytasks_0_",
        "all_bytasks_1_",
        "all_bytasks_2_",
        "all_bytasks_3_",
        "all_bytasks_4_",
        "all_bytasks_5_",
        "all_bytasks_6_",
        "all_bytasks_7_",
        "all_bytasks_8_",
        "all_bytasks_9_",
        "all_bytasks_10_",
        "all_bytasks_11_",
        "all_bytasks_12_",
    ]
    if trainType == 16:
        for net in multiNetworkName2:
            bestFilename = os.path.join(
                load_dir, "{}_{}{}_best.pt".format(args.env_name, net,
                                                   args.hidden_size))
            ac, _ = torch.load(bestFilename)
            policies.append(PPOPlayer(ac, device))
            print("Policy multi loaded: ", bestFilename)

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         None,
                         device,
                         False,
                         normalizeOb=False,
                         normalizeReturns=False,
                         max_episode_steps=args.num_steps,
                         makeEnvFunc=makeEnvFunction,
                         num_frame_stack=1,
                         info_keywords=(
                             'episode_steps',
                             'episode_reward',
                             'progress',
                             'servo',
                             'distToTarget',
                         ))
    #print(envs.observation_space.shape,envs.action_space)
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'hidden_size': args.hidden_size,
                              'last_hidden_size': args.last_hidden_size,
                              'activation_layers_type': "Tanh"
                          })
    '''
#    if args.load_dir not None:
    load_path = os.path.join(args.load_dir, args.algo)
    actor_critic, ob_rms = torch.load(os.path.join(load_path, args.env_name + ".pt"))
    '''
    load_path = os.path.join(
        load_dir, "{}_{}{}_best.pt".format(args.env_name, filesNamesSuffix,
                                           args.hidden_size))
    #load_path = os.path.join(load_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
    pretrained_path = "../Train/trained_models/QuadruppedWalk-v1/Train_QuadruppedWalk-v1_256.pth"
    loadPretrained = False
    if loadPretrained and os.path.isfile(pretrained_path):
        print("Load pretrained")
        # assumes the .pth file holds a state_dict compatible with actor_critic.base
        pretrained_state = torch.load(pretrained_path)
        print(pretrained_state)
        print(actor_critic.base)
        actor_critic.base.load_state_dict(pretrained_state)
        actor_critic.base.eval()
    if os.path.isfile(load_path) and not loadPretrained:
        actor_critic, ob_rms = torch.load(load_path)
        actor_critic.eval()
        print("----NN loaded: ", load_path, " -----")
    else:
        bestFilename = os.path.join(
            load_dir,
            "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix,
                                              args.hidden_size))
        if os.path.isfile(bestFilename):
            actor_critic, ob_rms = torch.load(bestFilename)
            actor_critic.eval()
            print("----NN loaded: ", bestFilename, " -----")

    maxReward = -10000.0
    maxSteps = 0
    minDistance = 50000.0

    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    deque_maxLen = 10

    episode_rewards = deque(maxlen=deque_maxLen)
    episode_steps = deque(maxlen=deque_maxLen)
    episode_rewards_alive = deque(maxlen=deque_maxLen)
    episode_rewards_progress = deque(maxlen=deque_maxLen)
    episode_rewards_servo = deque(maxlen=deque_maxLen)
    episode_dist_to_target = deque(maxlen=deque_maxLen)
    '''
    load_path = os.path.join(args.load_dir, args.algo)
    load_path = os.path.join(load_path, args.env_name + ".pt")
    actor_critic, ob_rms = torch.load(load_path)

    actor_critic.to(device)
    actor_critic.eval()
    #ob_rms.eval()
    '''
    '''
    args.use_gym_monitor = 1
    args.monitor_dir = "./results/"
    monitor_path = os.path.join(args.monitor_dir, args.algo)
    monitor_path = os.path.join(monitor_path, args.env_name)

    args.
    if args.use_gym_monitor:
        env = wrappers.Monitor(
            env, monitor_path, video_callable=False, force=True)
    '''
    i_episode = 0

    save_path = os.path.join(args.save_dir, args.algo)
    try:
        os.makedirs(save_path)
    except OSError:
        pass

    trainOnSamplesAndExit = False  #False
    if trainOnSamplesAndExit:
        import pickle
        print("---------------------------------------")
        print("Samples preload")
        data = pickle.load(open("./QuadruppedWalk-v1_UpDown.samples", "rb"))
        #data = pickle.load( open( "../QuadruppedWalk-v1_NN.samples", "rb" ) )

        learning_rate = 0.0001
        max_episodes = 100
        max_timesteps = 4000
        betas = (0.9, 0.999)
        log_interval = 1

        envSamples = SamplesEnv(data)
        envSamples.numSteps = max_timesteps

        # create a stochastic gradient descent optimizer
        optimizer = torch.optim.Adam(actor_critic.base.actor.parameters(),
                                     lr=learning_rate,
                                     betas=betas)
        #optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)
        # create a loss function
        criterion = nn.MSELoss(reduction="sum")

        # run the main training loop
        for epoch in range(max_episodes):
            state = envSamples.reset()
            time_step = 0
            testReward = 0
            testSteps = 0
            loss_sum = 0
            loss_max = 0

            for t in range(max_timesteps):
                time_step += 1

                nn_state = torch.FloatTensor((state).reshape(1, -1)).to(device)

                optimizer.zero_grad()
                net_out = actor_critic.base.forwardActor(nn_state)
                net_out = actor_critic.dist.fc_mean(net_out)

                state, reward, done, info = envSamples.step(
                    net_out.detach().numpy())
                sim_action = envSamples.recordedActions

                sim_action_t = torch.FloatTensor([sim_action]).to(device)

                loss = criterion(net_out, sim_action_t)
                loss.backward()
                optimizer.step()
                loss_sum += loss.mean()
                loss_max = max(loss_max, loss.max())

                testReward += reward
                testSteps += 1

                if done:
                    if epoch % log_interval == 0:
                        #print(best_action_t*scaleActions-net_out*scaleActions)
                        if args.verboseLevel > 0:
                            print(
                                'Train Episode: {} t:{} Reward:{} Loss: mean:{:.6f} max: {:.6f}'
                                .format(epoch, t, testReward, loss_sum / t,
                                        loss_max))
                            print(info)
                        reward = 0
                    break
        bestFilename = os.path.join(
            save_path,
            "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix,
                                              args.hidden_size))
        torch.save([
            actor_critic,
            getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
        ], bestFilename)
        exit(0)

    skipWriteBest = True

    if args.verboseLevel > 0:
        printNetwork(actor_critic.base.actor)

    lock(actor_critic, first=False, last=False)
    #if trainType==9:
    #allowMutate = False
    #lock(actor_critic,first=True,last=False)
    #mutate(actor_critic,power=0.00,powerLast=0.3)

    if args.verboseLevel > 0:
        printNetwork(actor_critic.base.actor)
    #from torchsummary import summary

    #summary(actor_critic.base.actor, (1, 48, 64))

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    episodeBucketIndex = 0

    maxReward = -10000000000
    numEval = 10
    if realEval:
        envEval = makeEnvFunction(args.env_name)
        if hasattr(envEval.env, "tasks") and len(envEval.env.tasks):
            numEval = max(numEval, len(envEval.env.tasks))
        maxReward = evaluate_policy(envEval,
                                    actor_critic,
                                    numEval * 2,
                                    render=False,
                                    device=device,
                                    verbose=args.verboseLevel)
        print("MaxReward on start", maxReward)

    noMaxRewardCount = 0

    updateIndex = 0

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        episode_r = 0.0
        stepsDone = 0

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            #envs.venv.venv.venv.envs[0].render()

            if args.verboseLevel > 0:
                index = 0
                for d in done:
                    if d:
                        print(infos[index], flush=True)
                    index += 1

            episodeDone = False
            '''
            index = 0
            for d in done:
                if d:
                    print("")
                    print(infos[index])
                index+=1
            '''

            for info in infos:
                if 'reward' in info.keys():
                    episodeDone = True
                    i_episode += 1
                    episode_rewards.append(info['reward'])
                    writer.add_scalar('reward/episode', info['reward'],
                                      i_episode)
                    #print("E:",i_episode," T:",info['episode_steps'], " R:", info['episode_reward'], " D:",info['distToTarget'])
                if 'steps' in info.keys():
                    episode_steps.append(info['steps'])
                    writer.add_scalar('reward/steps', info['steps'], i_episode)
                if 'alive' in info.keys():
                    episode_rewards_alive.append(info['alive'])
                    writer.add_scalar('reward/alive', info['alive'], i_episode)
                if 'prog' in info.keys():
                    episode_rewards_progress.append(info['prog'])
                    writer.add_scalar('reward/progress', info['prog'],
                                      i_episode)
                if 'servo' in info.keys():
                    episode_rewards_servo.append(info['servo'])
                    writer.add_scalar('reward/servo', info['servo'], i_episode)
                if 'd2T' in info.keys():
                    episode_dist_to_target.append(info['d2T'])
                    writer.add_scalar('reward/distToTarget', info['d2T'],
                                      i_episode)

                for val in info.keys():
                    if val not in [
                            "reward", "steps", "alive", "prog", "servo", "d2T",
                            'epos', 't'
                    ]:
                        writer.add_scalar('reward/' + val, info[val],
                                          i_episode)

            #if episodeDone and i_episode%10==0:
            #    print(i_episode,"({:.1f}/{}/{:.2f}) ".format(episode_rewards[-1],episode_steps[-1],episode_dist_to_target[-1]),end='',flush=True)

            if episodeDone:
                episodeBucketIndex += 1
                if args.verboseLevel > 0:
                    print("Mean:", Fore.WHITE, np.mean(episode_rewards),
                          Style.RESET_ALL, " Median:", Fore.WHITE,
                          np.median(episode_rewards), Style.RESET_ALL,
                          " max reward:", maxReward)

                #'''len(episode_rewards) and np.mean(episode_rewards)>maxReward and'''
                if realEval:
                    if episodeBucketIndex % args.log_interval == 0 and episodeBucketIndex > args.log_interval:
                        print("Step:",
                              (j + 1) * args.num_processes * args.num_steps)
                        if skipWriteBest == False:
                            evalReward = evaluate_policy(
                                envEval,
                                actor_critic,
                                numEval,
                                device=device,
                                verbose=args.verboseLevel)

                            writer.add_scalar('reward/eval', evalReward,
                                              i_episode)

                            if evalReward > maxReward:
                                maxReward = evalReward
                                #maxReward = np.mean(episode_rewards)

                                bestFilename = os.path.join(
                                    save_path, "{}_{}{}_best.pt".format(
                                        args.env_name, filesNamesSuffix,
                                        args.hidden_size))
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{:.1f}/{}/{:.2f}) ".format(
                                        maxReward, np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps),
                                        episode_dist_to_target[-1]),
                                    Style.RESET_ALL, bestFilename)
                                torch.save([
                                    actor_critic,
                                    getattr(utils.get_vec_normalize(envs),
                                            'ob_rms', None)
                                ], bestFilename)
                                noMaxRewardCount = 0
                            else:
                                noMaxRewardCount += 1
                                if allowMutate:
                                    if noMaxRewardCount == 5:
                                        print("Mutation low last layer")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.00,
                                               powerLast=0.01)
                                    if noMaxRewardCount == 8:
                                        print("Mutation low non last")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.01,
                                               powerLast=0.0)
                                    if noMaxRewardCount == 11:
                                        print("Mutation low all")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.02,
                                               powerLast=0.2)
                                    if noMaxRewardCount == 14:
                                        print("Mutation hi all")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.03,
                                               powerLast=0.03)
                                        noMaxRewardCount = 0
                                if noMaxRewardCount == args.nobest_exit:
                                    exit(0)
                        else:
                            skipWriteBest = False
                else:
                    if len(episode_rewards) and np.mean(
                            episode_rewards
                    ) > maxReward and j > args.log_interval:
                        if skipWriteBest == False:
                            maxReward = np.mean(episode_rewards)
                            writer.add_scalar('reward/maxReward', maxReward,
                                              i_episode)

                            bestFilename = os.path.join(
                                save_path, "{}_{}{}_best.pt".format(
                                    args.env_name, filesNamesSuffix,
                                    args.hidden_size))
                            if len(episode_dist_to_target):
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{}/{:.2f}) ".format(
                                        np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps),
                                        episode_dist_to_target[-1]),
                                    Style.RESET_ALL, bestFilename)
                            else:
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{}) ".format(
                                        np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps)),
                                    Style.RESET_ALL, bestFilename)

                            torch.save([
                                actor_critic,
                                getattr(utils.get_vec_normalize(envs),
                                        'ob_rms', None)
                            ], bestFilename)
                        else:
                            skipWriteBest = False
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            shaped_reward = reward_shaper(reward)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, shaped_reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        writer.add_scalar('reward/value_loss', value_loss, updateIndex)
        writer.add_scalar('reward/action_loss', action_loss, updateIndex)
        writer.add_scalar('reward/dist_entropy', dist_entropy, updateIndex)

        updateIndex += 1

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            '''
            fileName = os.path.join(save_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], fileName)
            print("Saved:",fileName, " cur avg rewards:",np.mean(episode_rewards))

            fileName = os.path.join(save_path, "{}_{}{}_actor.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
            torch.save(actor_critic.state_dict, fileName)
            print("Saved:",fileName)
            '''
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            if args.verboseLevel > 0:
                print("")
                print("Updates {}, num timesteps {}, FPS {}".format(
                    j, total_num_steps, int(total_num_steps / (end - start))))
                print(" Last {} training episodes:".format(
                    len(episode_rewards)))

                print(
                    " reward mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}".
                    format(np.mean(episode_rewards),
                           np.median(episode_rewards), np.min(episode_rewards),
                           np.max(episode_rewards)))

                print(" steps mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}".
                      format(np.mean(episode_steps), np.median(episode_steps),
                             np.min(episode_steps), np.max(episode_steps)))

                if len(episode_rewards_alive):
                    print(
                        " alive mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_alive),
                                np.median(episode_rewards_alive),
                                np.min(episode_rewards_alive),
                                np.max(episode_rewards_alive)))

                if len(episode_rewards_progress):
                    print(
                        " progress mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_progress),
                                np.median(episode_rewards_progress),
                                np.min(episode_rewards_progress),
                                np.max(episode_rewards_progress)))

                if len(episode_rewards_servo):
                    print(
                        " servo mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_servo),
                                np.median(episode_rewards_servo),
                                np.min(episode_rewards_servo),
                                np.max(episode_rewards_servo)))

                if len(episode_dist_to_target):
                    print(
                        " dist to target mean/median {:.3f}/{:.3f} min/max {:.3f}/{:.3f}"
                        .format(np.mean(episode_dist_to_target),
                                np.median(episode_dist_to_target),
                                np.min(episode_dist_to_target),
                                np.max(episode_dist_to_target)))

                print(
                    " Reward/Steps {:.3f} Progress/Steps: {:.3f} entropy {:.1f} value_loss {:.5f} action_loss {:.5f}\n"
                    .format(
                        np.mean(episode_rewards) / np.mean(episode_steps),
                        (0 if len(episode_rewards_progress) == 0 else
                         np.mean(episode_rewards_progress) /
                         np.mean(episode_steps)), dist_entropy, value_loss,
                        action_loss))
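Example #2 relies on `utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr)` for the linear learning-rate decay, but the helper itself is not shown. Here is a minimal sketch of what such a helper is assumed to do (anneal the learning rate linearly toward zero over the planned number of updates); the actual implementation may differ.

def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    # Linear anneal: lr goes from initial_lr at epoch 0 toward 0 at the last epoch.
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr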
Example #3
def main():
    args = get_args()
    import random
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    logdir = args.env_name + '_' + args.algo + '_num_arms_' + str(
        args.num_processes) + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    if args.use_privacy:
        logdir = logdir + '_privacy'
    elif args.use_noisygrad:
        logdir = logdir + '_noisygrad'
    elif args.use_pcgrad:
        logdir = logdir + '_pcgrad'
    elif args.use_testgrad:
        logdir = logdir + '_testgrad'
    elif args.use_median_grad:
        logdir = logdir + '_mediangrad'
    logdir = os.path.join('runs', logdir)
    logdir = os.path.join(os.path.expanduser(args.log_dir), logdir)
    utils.cleanup_log_dir(logdir)

    # Ugly but simple logging
    log_dict = {
        'task_steps': args.task_steps,
        'grad_noise_ratio': args.grad_noise_ratio,
        'max_task_grad_norm': args.max_task_grad_norm,
        'use_noisygrad': args.use_noisygrad,
        'use_pcgrad': args.use_pcgrad,
        'use_testgrad': args.use_testgrad,
        'use_testgrad_median': args.use_testgrad_median,
        'testgrad_quantile': args.testgrad_quantile,
        'median_grad': args.use_median_grad,
        'use_meanvargrad': args.use_meanvargrad,
        'meanvar_beta': args.meanvar_beta,
        'no_special_grad_for_critic': args.no_special_grad_for_critic,
        'use_privacy': args.use_privacy,
        'seed': args.seed,
        'recurrent': args.recurrent_policy,
        'obs_recurrent': args.obs_recurrent,
        'cmd': ' '.join(sys.argv[1:])
    }
    for eval_disp_name, eval_env_name in EVAL_ENVS.items():
        log_dict[eval_disp_name] = []

    summary_writer = SummaryWriter()
    summary_writer.add_hparams(
        {
            'task_steps': args.task_steps,
            'grad_noise_ratio': args.grad_noise_ratio,
            'max_task_grad_norm': args.max_task_grad_norm,
            'use_noisygrad': args.use_noisygrad,
            'use_pcgrad': args.use_pcgrad,
            'use_testgrad': args.use_testgrad,
            'use_testgrad_median': args.use_testgrad_median,
            'testgrad_quantile': args.testgrad_quantile,
            'median_grad': args.use_median_grad,
            'use_meanvargrad': args.use_meanvargrad,
            'meanvar_beta': args.meanvar_beta,
            'no_special_grad_for_critic': args.no_special_grad_for_critic,
            'use_privacy': args.use_privacy,
            'seed': args.seed,
            'recurrent': args.recurrent_policy,
            'obs_recurrent': args.obs_recurrent,
            'cmd': ' '.join(sys.argv[1:])
        }, {})

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    print('making envs...')
    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         device,
                         False,
                         steps=args.task_steps,
                         free_exploration=args.free_exploration,
                         recurrent=args.recurrent_policy,
                         obs_recurrent=args.obs_recurrent,
                         multi_task=True)

    val_envs = make_vec_envs(args.val_env_name,
                             args.seed,
                             args.num_processes,
                             args.gamma,
                             args.log_dir,
                             device,
                             False,
                             steps=args.task_steps,
                             free_exploration=args.free_exploration,
                             recurrent=args.recurrent_policy,
                             obs_recurrent=args.obs_recurrent,
                             multi_task=True)

    eval_envs_dic = {}
    for eval_disp_name, eval_env_name in EVAL_ENVS.items():
        eval_envs_dic[eval_disp_name] = make_vec_envs(
            eval_env_name[0],
            args.seed,
            args.num_processes,
            None,
            logdir,
            device,
            True,
            steps=args.task_steps,
            recurrent=args.recurrent_policy,
            obs_recurrent=args.obs_recurrent,
            multi_task=True,
            free_exploration=args.free_exploration)
    prev_eval_r = {}
    print('done')
    if args.hard_attn:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base=MLPHardAttnBase,
                              base_kwargs={
                                  'recurrent':
                                  args.recurrent_policy or args.obs_recurrent
                              })
    else:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base=MLPAttnBase,
                              base_kwargs={
                                  'recurrent':
                                  args.recurrent_policy or args.obs_recurrent
                              })
    actor_critic.to(device)

    if (args.continue_from_epoch > 0) and args.save_dir != "":
        save_path = os.path.join(args.save_dir, args.algo)
        actor_critic_, loaded_obs_rms_ = torch.load(
            os.path.join(
                save_path, args.env_name +
                "-epoch-{}.pt".format(args.continue_from_epoch)))
        actor_critic.load_state_dict(actor_critic_.state_dict())

    if args.algo != 'ppo':
        raise ValueError("only PPO is supported")
    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     num_tasks=args.num_processes,
                     attention_policy=False,
                     max_grad_norm=args.max_grad_norm,
                     weight_decay=args.weight_decay)
    val_agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.val_lr,
                         eps=args.eps,
                         num_tasks=args.num_processes,
                         attention_policy=True,
                         max_grad_norm=args.max_grad_norm,
                         weight_decay=args.weight_decay)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    val_rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  val_envs.observation_space.shape,
                                  val_envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    val_obs = val_envs.reset()
    val_rollouts.obs[0].copy_(val_obs)
    val_rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    save_copy = True
    for j in range(args.continue_from_epoch,
                   args.continue_from_epoch + num_updates):

        # policy rollouts
        for step in range(args.num_steps):
            # Sample actions
            actor_critic.eval()
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            actor_critic.train()

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    for k, v in info['episode'].items():
                        summary_writer.add_scalar(
                            f'training/{k}', v,
                            j * args.num_processes * args.num_steps +
                            args.num_processes * step)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        actor_critic.eval()
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()
        actor_critic.train()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        if save_copy:
            # Snapshot the policy weights and both optimizer states so the
            # evaluation block below can revert this and subsequent updates if
            # the validation score does not improve sufficiently.
            prev_weights = copy.deepcopy(actor_critic.state_dict())
            prev_opt_state = copy.deepcopy(agent.optimizer.state_dict())
            prev_val_opt_state = copy.deepcopy(
                val_agent.optimizer.state_dict())
            save_copy = False

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # Validation rollouts: collect experience in the validation environments
        # and update with val_agent for args.val_agent_steps inner iterations.
        for val_iter in range(args.val_agent_steps):
            for step in range(args.num_steps):
                # Sample actions
                actor_critic.eval()
                with torch.no_grad():
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        val_rollouts.obs[step],
                        val_rollouts.recurrent_hidden_states[step],
                        val_rollouts.masks[step])
                actor_critic.train()

                # Observe reward and next obs
                obs, reward, done, infos = val_envs.step(action)

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                bad_masks = torch.FloatTensor(
                    [[0.0] if 'bad_transition' in info.keys() else [1.0]
                     for info in infos])
                val_rollouts.insert(obs, recurrent_hidden_states, action,
                                    action_log_prob, value, reward, masks,
                                    bad_masks)

            actor_critic.eval()
            with torch.no_grad():
                next_value = actor_critic.get_value(
                    val_rollouts.obs[-1],
                    val_rollouts.recurrent_hidden_states[-1],
                    val_rollouts.masks[-1]).detach()
            actor_critic.train()

            val_rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                         args.gae_lambda,
                                         args.use_proper_time_limits)

            val_value_loss, val_action_loss, val_dist_entropy = val_agent.update(
                val_rollouts)
            val_rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
            ], os.path.join(save_path,
                            args.env_name + "-epoch-{}.pt".format(j)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, dist entropy {:.2f}, value loss {:.3f}, action loss {:.3f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        revert = False
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            actor_critic.eval()
            obs_rms = utils.get_vec_normalize(envs).obs_rms
            eval_r = {}
            printout = f'Seed {args.seed} Iter {j} '
            for eval_disp_name, eval_env_name in EVAL_ENVS.items():
                eval_r[eval_disp_name] = evaluate(
                    actor_critic,
                    obs_rms,
                    eval_envs_dic,
                    eval_disp_name,
                    args.seed,
                    args.num_processes,
                    eval_env_name[1],
                    logdir,
                    device,
                    steps=args.task_steps,
                    recurrent=args.recurrent_policy,
                    obs_recurrent=args.obs_recurrent,
                    multi_task=True,
                    free_exploration=args.free_exploration)
                if eval_disp_name in prev_eval_r:
                    diff = np.array(eval_r[eval_disp_name]) - np.array(
                        prev_eval_r[eval_disp_name])
                    if eval_disp_name == 'many_arms':
                        if np.sum(diff > 0) - np.sum(
                                diff < 0) < args.val_improvement_threshold:
                            print('no update')
                            revert = True

                summary_writer.add_scalar(f'eval/{eval_disp_name}',
                                          np.mean(eval_r[eval_disp_name]),
                                          (j + 1) * args.num_processes *
                                          args.num_steps)
                log_dict[eval_disp_name].append([
                    (j + 1) * args.num_processes * args.num_steps,
                    eval_r[eval_disp_name]
                ])
                printout += eval_disp_name + ' ' + str(
                    np.mean(eval_r[eval_disp_name])) + ' '
            # summary_writer.add_scalars('eval_combined', eval_r, (j+1) * args.num_processes * args.num_steps)
            if revert:
                actor_critic.load_state_dict(prev_weights)
                agent.optimizer.load_state_dict(prev_opt_state)
                val_agent.optimizer.load_state_dict(prev_val_opt_state)
            else:
                print(printout)
                prev_eval_r = eval_r.copy()
            save_copy = True
            actor_critic.train()

    save_obj(log_dict, os.path.join(logdir, 'log_dict.pkl'))
    envs.close()
    val_envs.close()
    for eval_disp_name, eval_env_name in EVAL_ENVS.items():
        eval_envs_dic[eval_disp_name].close()
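
The snapshot-and-revert pattern used above can be isolated into a minimal, self-contained sketch. Names such as `model`, `optimizer` and `improved` below are placeholders standing in for actor_critic, agent.optimizer and the 'many_arms' improvement check; this is an illustration of the pattern, not the project's code.

import copy

import torch

# Placeholder model and optimizer for illustration only.
model = torch.nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Snapshot parameters and optimizer state before updating.
prev_weights = copy.deepcopy(model.state_dict())
prev_opt_state = copy.deepcopy(optimizer.state_dict())

# Run an update (dummy loss here).
loss = model(torch.randn(8, 4)).pow(2).mean()
optimizer.zero_grad()
loss.backward()
optimizer.step()

# Revert if the validation metric did not improve enough.
improved = False  # stand-in for the 'many_arms' improvement check above
if not improved:
    model.load_state_dict(prev_weights)
    optimizer.load_state_dict(prev_opt_state)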
Example #4
# Assumed imports for this fragment (argument parsing and the project-specific
# helpers Policy, make_vec_envs, utils, print_error and render_full are defined
# elsewhere in the repository and are not reproduced here).
import os

import gym
import torch

if not os.path.isfile(args.model_path):
    print_error('Model file does not exist')

torch.manual_seed(0)
torch.set_num_threads(1)
device = torch.device('cpu')

render_env = gym.make(args.env_name, args=args)
render_env.seed(0)

envs = make_vec_envs(args.env_name, 0, 4, 0.995, None, device, False, args=args)

actor_critic = Policy(
    envs.observation_space.shape,
    envs.action_space,
    base_kwargs={'recurrent': False})
actor_critic.to(device)

# The checkpoint contains both the trained policy module and the observation
# normalization statistics, replacing the freshly constructed Policy above.
actor_critic, ob_rms = torch.load(args.model_path)

actor_critic.eval()

envs.close()

render_full(render_env, actor_critic, ob_rms, deterministic=True, repeat=True)
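
For reference, here is a minimal sketch of how loaded ob_rms statistics are typically applied to raw observations at evaluation time; render_full presumably does something equivalent internally, and the clip value and epsilon mirror common VecNormalize defaults, so treat them as assumptions.

import numpy as np

def normalize_obs(obs, ob_rms, clip_obs=10.0, epsilon=1e-8):
    # Standardize with the running mean/variance, then clip to a fixed range.
    obs = (obs - ob_rms.mean) / np.sqrt(ob_rms.var + epsilon)
    return np.clip(obs, -clip_obs, clip_obs)

Example #5 below hints at the same pattern with its `if ob_rms is not None:` check.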


Example #5
                                        record_video_filename),
                           force=True)

policy = Policy(env.observation_space.shape,
                env.action_space,
                base_kwargs={
                    'recurrent': False,
                    'layernorm': args.layernorm
                },
                obj_num=env.obj_dim)

state_dict = torch.load(state_dict_path)
policy.load_state_dict(state_dict)
policy = policy.to(device)
policy.double()
policy.eval()
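# Note: policy.double() casts the network to float64, so observations fed to it
# below presumably need to be converted to double-precision tensors as well.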

ob_rms = None
if args.env_params is not None and os.path.exists(args.env_params):
    with open(args.env_params, 'rb') as fp:
        env_params = pickle.load(fp)
    ob_rms = env_params['ob_rms']

while True:
    obs = env.reset()
    obj = np.zeros(env.obj_dim)
    t = time()
    done = False
    iter = 0
    while not done:
        if ob_rms is not None: