Example No. 1
def main():

    realEval = True  #False

    gettrace = getattr(sys, 'gettrace', None)

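    # Script-specific command-line flags, layered on top of the shared PPO arguments.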
    parser = argparse.ArgumentParser(description='RL')
    parser.add_argument('--action-type',
                        type=int,
                        default=-1,
                        help='action type to play (default: -1)')

    parser.add_argument('--tasks-difficulty-from',
                        type=int,
                        default=0,
                        help='tasks_difficulty_from')

    parser.add_argument('--tasks-difficulty-to',
                        type=int,
                        default=100000,
                        help='tasks-difficulty-to')

    parser.add_argument('--verboseLevel',
                        type=int,
                        default=5,
                        help='verboseLevel')

    parser.add_argument('--filesNamesSuffix',
                        default="",
                        help='filesNamesSuffix')

    parser.add_argument('--nobest-exit',
                        type=int,
                        default=10000,
                        help='nobest_exit')

    args = get_args(parser)

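    # Hard-coded PPO hyperparameters for the quadruped environment; these override the CLI defaults.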
    args.algo = 'ppo'
    args.env_name = 'QuadruppedWalk-v1'  # alternative: 'RoboschoolAnt-v1'
    args.use_gae = True
    args.num_steps = 2048
    args.num_processes = 4
    if gettrace and gettrace():  # single process when a debugger is attached
        args.num_processes = 1
    args.lr = 0.0001
    args.entropy_coef = 0.0
    args.value_loss_coef = 0.5
    args.ppo_epoch = 4
    args.num_mini_batch = 256
    args.gamma = 0.99
    args.gae_lambda = 0.95
    args.clip_param = 0.2
    args.use_linear_lr_decay = True
    args.use_proper_time_limits = True
    args.save_dir = "./trained_models/" + args.env_name + "/"
    args.load_dir = "./trained_models/" + args.env_name + "/"
    args.log_dir = "./logs/robot"
    if gettrace and gettrace():  # use separate debug paths when a debugger is attached
        args.save_dir = "./trained_models/" + args.env_name + "debug/"
        args.load_dir = "./trained_models/" + args.env_name + "debug/"
        args.log_dir = "./logs/robot_d"
    args.log_interval = 30
    args.hidden_size = 64
    args.last_hidden_size = args.hidden_size
    args.recurrent_policy = False  #True
    args.save_interval = 20
    #args.seed = 1
    reward_shaping = 0.01
    allowMutate = False

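    # A seed of -1 means "derive the seed from the current wall-clock time".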
    if args.seed == -1:
        args.seed = time.clock_gettime_ns(time.CLOCK_REALTIME)

    quadruppedEnv.settings.tasks_difficulty_from = args.tasks_difficulty_from
    quadruppedEnv.settings.tasks_difficulty_to = args.tasks_difficulty_to

    # 0 = walk, 1 = balance, 2 = multitask, 3 = multitask experiments;
    # the remaining trainType values are configured in the branches below.
    trainType = 14
    filesNamesSuffix = ""
    if args.action_type >= 0:
        trainType = args.action_type

    makeEnvFunction = makeEnv.make_env_with_best_settings
    if trainType == 1:
        filesNamesSuffix = "balance_"
        makeEnvFunction = makeEnv.make_env_for_balance

    if trainType == 2:
        filesNamesSuffix = "analytical_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical

    if trainType == 3:
        filesNamesSuffix = "analytical2_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical2

    if trainType == 4:
        filesNamesSuffix = "frontback_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_front_back

    if trainType == 5:
        filesNamesSuffix = "leftright_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_left_right

    if trainType == 6:
        filesNamesSuffix = "all_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all

    if trainType == 7:
        filesNamesSuffix = "rotate_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_rotate

    if trainType == 8:
        filesNamesSuffix = "compound_"
        makeEnvFunction = make_env_multinetwork

    if trainType == 9:
        import pickle
        realEval = False
        allowMutate = False
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 5000000
        filesNamesSuffix = "test_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_test

    if trainType == 10:
        import pickle
        realEval = False
        allowMutate = False
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 5000000
        filesNamesSuffix = "zoo_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_test_zoo

    if trainType == 11:
        args.hidden_size = 128  #64 #128
        args.last_hidden_size = args.hidden_size

        import pickle
        if gettrace and gettrace():  # single process when a debugger is attached
            args.num_processes = 1
        else:
            args.num_processes = 8
        realEval = False
        allowMutate = False
        args.lr = 0.00001
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 10000000
        filesNamesSuffix = "zigote2_updown_"
        print("Samples preload")
        global samplesEnvData
        samplesEnvData = pickle.load(
            open("./QuadruppedWalk-v1_MoveNoPhys.samples", "rb"))
        # samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1.samples", "rb" ) )
        makeEnvFunction = makeSamplesEnv

    if trainType == 12:
        import pickle
        args.lr = 0.00001
        args.hidden_size = 64
        args.last_hidden_size = args.hidden_size
        filesNamesSuffix = "zigote2_front_back_"
        args.clip_param = 0.9
        args.value_loss_coef = 0.9
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_train
        #makeEnvFunction = makeEnv.make_env_with_best_settings_for_record
        #makeEnv.samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1_MoveNoPhys.samples", "rb" ) )

    if trainType == 13:
        filesNamesSuffix = "all_bytasks_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all

    if trainType == 14:
        #args.lr = 0.00001
        #args.num_env_steps = 000000
        #args.clip_param = 0.5
        #args.value_loss_coef  =0.8
        #random.seed(time.clock_gettime_ns(time.CLOCK_REALTIME))
        #args.num_steps = random.choice([256,512,1024,2048,4096])
        #args.num_mini_batch = random.choice([32,64,256,512])
        #args.ppo_epoch  = random.choice([2,4,8,10])
        #args.clip_param = random.choice([0.2,0.4,0.6,0.8])
        #args.value_loss_coef  =random.choice([0.4,0.5,0.6,0.8])
        #args.lr = random.choice([0.00001,0.0001,0.00005,0.0005])

        args.num_steps = 2048
        args.num_mini_batch = 64
        args.ppo_epoch = 8
        args.lr = 0.0001

        args.hidden_size = 64
        args.last_hidden_size = args.hidden_size
        filesNamesSuffix = args.filesNamesSuffix
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all
        '''
        num_steps: 1024 num_mini_batch 64 ppo_epoch 2
        clip_param: 0.2 value_loss_coef 0.6 lr 0.0001
        '''

    if trainType == 15:
        args.num_env_steps = 5000000
        filesNamesSuffix = "zigote_updown_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_train_analytic

    if trainType == 16:
        args.lr = 0.00001
        filesNamesSuffix = "compound_tasks_"
        makeEnvFunction = make_env_multinetwork

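    # Rewards are scaled by reward_shaping before being stored in the rollout buffer.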
    reward_shaper = DefaultRewardsShaper(scale_value=reward_shaping)

    print("ActionType ", trainType, " ", filesNamesSuffix, "seed", args.seed,
          "num env steps:", args.num_env_steps, " tasks_dif",
          args.tasks_difficulty_from, args.tasks_difficulty_to)

    print("Num processes:", args.num_processes)

    print("num_steps:", args.num_steps, "num_mini_batch", args.num_mini_batch,
          "ppo_epoch", args.ppo_epoch)
    print("clip_param:", args.clip_param, "value_loss_coef",
          args.value_loss_coef, "lr", args.lr)

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    args.log_dir = "/tmp/tensorboard/"  # TensorBoard output path (overrides the log dirs set above)
    # TensorboardX summary writer
    writer = SummaryWriter(log_dir=args.log_dir + 'runs/{}_PPO_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
        "ppo"))

    writer.add_scalar('options/num_steps', args.num_steps, 0)
    writer.add_scalar('options/num_mini_batch', args.num_mini_batch, 0)
    writer.add_scalar('options/ppo_epoch', args.ppo_epoch, 0)
    writer.add_scalar('options/clip_param', args.clip_param, 0)
    writer.add_scalar('options/value_loss_coef', args.value_loss_coef, 0)
    writer.add_scalar('options/lr', args.lr, 0)

    device = torch.device("cuda:0" if args.cuda else "cpu")
    torch.set_num_threads(1)

    load_dir = os.path.join(args.load_dir, args.algo)

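    # Compound modes (trainType 8 and 16) preload previously trained sub-policies.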
    multiNetworkName = ["frontback_", "all_", "leftright_", "rotate_"]
    if trainType == 8:
        for net in multiNetworkName:
            bestFilename = os.path.join(
                load_dir, "{}_{}{}_best.pt".format(args.env_name, net,
                                                   args.hidden_size))
            ac, _ = torch.load(bestFilename)
            policies.append(PPOPlayer(ac, device))
            print("Policy multi loaded: ", bestFilename)

    multiNetworkName2 = [
        "all_bytasks_0_",
        "all_bytasks_1_",
        "all_bytasks_2_",
        "all_bytasks_3_",
        "all_bytasks_4_",
        "all_bytasks_5_",
        "all_bytasks_6_",
        "all_bytasks_7_",
        "all_bytasks_8_",
        "all_bytasks_9_",
        "all_bytasks_10_",
        "all_bytasks_11_",
        "all_bytasks_12_",
    ]
    if trainType == 16:
        for net in multiNetworkName2:
            bestFilename = os.path.join(
                load_dir, "{}_{}{}_best.pt".format(args.env_name, net,
                                                   args.hidden_size))
            ac, _ = torch.load(bestFilename)
            policies.append(PPOPlayer(ac, device))
            print("Policy multi loaded: ", bestFilename)

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         None,
                         device,
                         False,
                         normalizeOb=False,
                         normalizeReturns=False,
                         max_episode_steps=args.num_steps,
                         makeEnvFunc=makeEnvFunction,
                         num_frame_stack=1,
                         info_keywords=(
                             'episode_steps',
                             'episode_reward',
                             'progress',
                             'servo',
                             'distToTarget',
                         ))
    #print(envs.observation_space.shape,envs.action_space)
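    # Actor-critic network with Tanh activations; hidden sizes were chosen by the trainType branches above.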
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'hidden_size': args.hidden_size,
                              'last_hidden_size': args.last_hidden_size,
                              'activation_layers_type': "Tanh"
                          })
    '''
#    if args.load_dir not None:
    load_path = os.path.join(args.load_dir, args.algo)
    actor_critic, ob_rms = torch.load(os.path.join(load_path, args.env_name + ".pt"))
    '''
    load_path = os.path.join(
        load_dir, "{}_{}{}_best.pt".format(args.env_name, filesNamesSuffix,
                                           args.hidden_size))
    #load_path = os.path.join(load_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
    pretrained_path = "../Train/trained_models/QuadruppedWalk-v1/Train_QuadruppedWalk-v1_256.pth"
    loadPretrained = False
    if loadPretrained and os.path.isfile(pretrained_path):
        print("Load pretrained")
        pretrained_state = torch.load(pretrained_path)
        print(pretrained_state)
        print(actor_critic.base)
        # Assumes the checkpoint holds a state_dict for the policy base network.
        actor_critic.base.load_state_dict(pretrained_state)
        actor_critic.base.eval()
    if os.path.isfile(load_path) and not loadPretrained:
        actor_critic, ob_rms = torch.load(load_path)
        actor_critic.eval()
        print("----NN loaded: ", load_path, " -----")
    else:
        bestFilename = os.path.join(
            load_dir,
            "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix,
                                              args.hidden_size))
        if os.path.isfile(bestFilename):
            actor_critic, ob_rms = torch.load(bestFilename)
            actor_critic.eval()
            print("----NN loaded: ", bestFilename, " -----")

    maxReward = -10000.0
    maxSteps = 0
    minDistance = 50000.0

    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

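    # Rolling windows of recent episode statistics used for console and TensorBoard logging.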
    deque_maxLen = 10

    episode_rewards = deque(maxlen=deque_maxLen)
    episode_steps = deque(maxlen=deque_maxLen)
    episode_rewards_alive = deque(maxlen=deque_maxLen)
    episode_rewards_progress = deque(maxlen=deque_maxLen)
    episode_rewards_servo = deque(maxlen=deque_maxLen)
    episode_dist_to_target = deque(maxlen=deque_maxLen)
    '''
    load_path = os.path.join(args.load_dir, args.algo)
    load_path = os.path.join(load_path, args.env_name + ".pt")
    actor_critic, ob_rms = torch.load(load_path)

    actor_critic.to(device)
    actor_critic.eval()
    #ob_rms.eval()
    '''
    '''
    args.use_gym_monitor = 1
    args.monitor_dir = "./results/"
    monitor_path = os.path.join(args.monitor_dir, args.algo)
    monitor_path = os.path.join(monitor_path, args.env_name)

    args.
    if args.use_gym_monitor:
        env = wrappers.Monitor(
            env, monitor_path, video_callable=False, force=True)
    '''
    i_episode = 0

    save_path = os.path.join(args.save_dir, args.algo)
    try:
        os.makedirs(save_path)
    except OSError:
        pass

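    # Optional supervised pre-training: imitate recorded actions from a samples file, save the result, and exit.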
    trainOnSamplesAndExit = False
    if trainOnSamplesAndExit:
        import pickle
        print("---------------------------------------")
        print("Samples preload")
        data = pickle.load(open("./QuadruppedWalk-v1_UpDown.samples", "rb"))
        #data = pickle.load( open( "../QuadruppedWalk-v1_NN.samples", "rb" ) )

        learning_rate = 0.0001
        max_episodes = 100
        max_timesteps = 4000
        betas = (0.9, 0.999)
        log_interval = 1

        envSamples = SamplesEnv(data)
        envSamples.numSteps = max_timesteps

        # create a stochastic gradient descent optimizer
        optimizer = torch.optim.Adam(actor_critic.base.actor.parameters(),
                                     lr=learning_rate,
                                     betas=betas)
        #optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)
        # create a loss function
        criterion = nn.MSELoss(reduction="sum")

        # run the main training loop
        for epoch in range(max_episodes):
            state = envSamples.reset()
            time_step = 0
            testReward = 0
            testSteps = 0
            loss_sum = 0
            loss_max = 0

            for t in range(max_timesteps):
                time_step += 1

                nn_state = torch.FloatTensor((state).reshape(1, -1)).to(device)

                optimizer.zero_grad()
                net_out = actor_critic.base.forwardActor(nn_state)
                net_out = actor_critic.dist.fc_mean(net_out)

                state, reward, done, info = envSamples.step(
                    net_out.detach().numpy())
                sim_action = envSamples.recordedActions

                sim_action_t = torch.FloatTensor([sim_action]).to(device)

                loss = criterion(net_out, sim_action_t)
                loss.backward()
                optimizer.step()
                # .item() keeps the accumulators as plain floats instead of graph-attached tensors
                loss_sum += loss.item()
                loss_max = max(loss_max, loss.item())

                testReward += reward
                testSteps += 1

                if done:
                    if epoch % log_interval == 0:
                        #print(best_action_t*scaleActions-net_out*scaleActions)
                        if args.verboseLevel > 0:
                            print(
                                'Train Episode: {} t:{} Reward:{} Loss: mean:{:.6f} max: {:.6f}'
                                .format(epoch, t, testReward, loss_sum / t,
                                        loss_max))
                            print(info)
                        reward = 0
                    break
        bestFilename = os.path.join(
            save_path,
            "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix,
                                              args.hidden_size))
        torch.save([
            actor_critic,
            getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
        ], bestFilename)
        exit(0)

    skipWriteBest = True

    if args.verboseLevel > 0:
        printNetwork(actor_critic.base.actor)

    lock(actor_critic, first=False, last=False)
    #if trainType==9:
    #allowMutate = False
    #lock(actor_critic,first=True,last=False)
    #mutate(actor_critic,power=0.00,powerLast=0.3)

    if args.verboseLevel > 0:
        printNetwork(actor_critic.base.actor)
    #from torchsummary import summary

    #summary(actor_critic.base.actor, (1, 48, 64))

    start = time.time()
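    # One PPO update consumes num_processes * num_steps environment steps.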
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    episodeBucketIndex = 0

    maxReward = -10000000000
    numEval = 10
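    # With realEval enabled, the best checkpoint is chosen by separate evaluation rollouts rather than training returns.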
    if realEval:
        envEval = makeEnvFunction(args.env_name)
        if hasattr(envEval.env, "tasks") and len(envEval.env.tasks):
            numEval = max(numEval, len(envEval.env.tasks))
        maxReward = evaluate_policy(envEval,
                                    actor_critic,
                                    numEval * 2,
                                    render=False,
                                    device=device,
                                    verbose=args.verboseLevel)
        print("MaxReward on start", maxReward)

    noMaxRewardCount = 0

    updateIndex = 0

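    # Main training loop: collect a rollout, optionally relabel rewards with GAIL, then run the PPO update.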
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        episode_r = 0.0
        stepsDone = 0

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            #envs.venv.venv.venv.envs[0].render()

            if args.verboseLevel > 0:
                index = 0
                for d in done:
                    if d:
                        print(infos[index], flush=True)
                    index += 1

            episodeDone = False
            '''
            index = 0
            for d in done:
                if d:
                    print("")
                    print(infos[index])
                index+=1
            '''

            for info in infos:
                if 'reward' in info.keys():
                    episodeDone = True
                    i_episode += 1
                    episode_rewards.append(info['reward'])
                    writer.add_scalar('reward/episode', info['reward'],
                                      i_episode)
                    #print("E:",i_episode," T:",info['episode_steps'], " R:", info['episode_reward'], " D:",info['distToTarget'])
                if 'steps' in info.keys():
                    episode_steps.append(info['steps'])
                    writer.add_scalar('reward/steps', info['steps'], i_episode)
                if 'alive' in info.keys():
                    episode_rewards_alive.append(info['alive'])
                    writer.add_scalar('reward/alive', info['alive'], i_episode)
                if 'prog' in info.keys():
                    episode_rewards_progress.append(info['prog'])
                    writer.add_scalar('reward/progress', info['prog'],
                                      i_episode)
                if 'servo' in info.keys():
                    episode_rewards_servo.append(info['servo'])
                    writer.add_scalar('reward/servo', info['servo'], i_episode)
                if 'd2T' in info.keys():
                    episode_dist_to_target.append(info['d2T'])
                    writer.add_scalar('reward/distToTarget', info['d2T'],
                                      i_episode)

                for val in info.keys():
                    if val not in [
                            "reward", "steps", "alive", "prog", "servo", "d2T",
                            'epos', 't'
                    ]:
                        writer.add_scalar('reward/' + val, info[val],
                                          i_episode)

            #if episodeDone and i_episode%10==0:
            #    print(i_episode,"({:.1f}/{}/{:.2f}) ".format(episode_rewards[-1],episode_steps[-1],episode_dist_to_target[-1]),end='',flush=True)

            if episodeDone:
                episodeBucketIndex += 1
                if args.verboseLevel > 0:
                    print("Mean:", Fore.WHITE, np.mean(episode_rewards),
                          Style.RESET_ALL, " Median:", Fore.WHITE,
                          np.median(episode_rewards), Style.RESET_ALL,
                          " max reward:", maxReward)

                #'''len(episode_rewards) and np.mean(episode_rewards)>maxReward and'''
                if realEval:
                    if episodeBucketIndex % args.log_interval == 0 and episodeBucketIndex > args.log_interval:
                        print("Step:",
                              (j + 1) * args.num_processes * args.num_steps)
                        if skipWriteBest == False:
                            evalReward = evaluate_policy(
                                envEval,
                                actor_critic,
                                numEval,
                                device=device,
                                verbose=args.verboseLevel)

                            writer.add_scalar('reward/eval', evalReward,
                                              i_episode)

                            if evalReward > maxReward:
                                maxReward = evalReward
                                #maxReward = np.mean(episode_rewards)

                                bestFilename = os.path.join(
                                    save_path, "{}_{}{}_best.pt".format(
                                        args.env_name, filesNamesSuffix,
                                        args.hidden_size))
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{:.1f}/{}/{:.2f}) ".format(
                                        maxReward, np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps),
                                        episode_dist_to_target[-1]),
                                    Style.RESET_ALL, bestFilename)
                                torch.save([
                                    actor_critic,
                                    getattr(utils.get_vec_normalize(envs),
                                            'ob_rms', None)
                                ], bestFilename)
                                noMaxRewardCount = 0
                            else:
                                noMaxRewardCount += 1
                                if allowMutate:
                                    if noMaxRewardCount == 5:
                                        print("Mutation low last layer")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.00,
                                               powerLast=0.01)
                                    if noMaxRewardCount == 8:
                                        print("Mutation low non last")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.01,
                                               powerLast=0.0)
                                    if noMaxRewardCount == 11:
                                        print("Mutation low all")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.02,
                                               powerLast=0.2)
                                    if noMaxRewardCount == 14:
                                        print("Mutation hi all")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.03,
                                               powerLast=0.03)
                                        noMaxRewardCount = 0
                                if noMaxRewardCount == args.nobest_exit:
                                    exit(0)
                        else:
                            skipWriteBest = False
                else:
                    if len(episode_rewards) and np.mean(
                            episode_rewards
                    ) > maxReward and j > args.log_interval:
                        if skipWriteBest == False:
                            maxReward = np.mean(episode_rewards)
                            writer.add_scalar('reward/maxReward', maxReward,
                                              i_episode)

                            bestFilename = os.path.join(
                                save_path, "{}_{}{}_best.pt".format(
                                    args.env_name, filesNamesSuffix,
                                    args.hidden_size))
                            if len(episode_dist_to_target):
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{}/{:.2f}) ".format(
                                        np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps),
                                        episode_dist_to_target[-1]),
                                    Style.RESET_ALL, bestFilename)
                            else:
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{}) ".format(
                                        np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps)),
                                    Style.RESET_ALL, bestFilename)

                            torch.save([
                                actor_critic,
                                getattr(utils.get_vec_normalize(envs),
                                        'ob_rms', None)
                            ], bestFilename)
                        else:
                            skipWriteBest = False
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            shaped_reward = reward_shaper(reward)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, shaped_reward, masks,
                            bad_masks)

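        # Bootstrap the value of the last observation to compute returns and GAE.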
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        writer.add_scalar('reward/value_loss', value_loss, updateIndex)
        writer.add_scalar('reward/action_loss', action_loss, updateIndex)
        writer.add_scalar('reward/dist_entropy', dist_entropy, updateIndex)

        updateIndex += 1

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
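            # Periodic checkpoint writing is commented out below; only the "_best" checkpoints above are saved.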
            '''
            fileName = os.path.join(save_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], fileName)
            print("Saved:",fileName, " cur avg rewards:",np.mean(episode_rewards))

            fileName = os.path.join(save_path, "{}_{}{}_actor.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
            torch.save(actor_critic.state_dict, fileName)
            print("Saved:",fileName)
            '''
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            if args.verboseLevel > 0:
                print("")
                print("Updates {}, num timesteps {}, FPS {}".format(
                    j, total_num_steps, int(total_num_steps / (end - start))))
                print(" Last {} training episodes:".format(
                    len(episode_rewards)))

                print(
                    " reward mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}".
                    format(np.mean(episode_rewards),
                           np.median(episode_rewards), np.min(episode_rewards),
                           np.max(episode_rewards)))

                print(" steps mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}".
                      format(np.mean(episode_steps), np.median(episode_steps),
                             np.min(episode_steps), np.max(episode_steps)))

                if len(episode_rewards_alive):
                    print(
                        " alive mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_alive),
                                np.median(episode_rewards_alive),
                                np.min(episode_rewards_alive),
                                np.max(episode_rewards_alive)))

                if len(episode_rewards_progress):
                    print(
                        " progress mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_progress),
                                np.median(episode_rewards_progress),
                                np.min(episode_rewards_progress),
                                np.max(episode_rewards_progress)))

                if len(episode_rewards_servo):
                    print(
                        " servo mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_servo),
                                np.median(episode_rewards_servo),
                                np.min(episode_rewards_servo),
                                np.max(episode_rewards_servo)))

                if len(episode_dist_to_target):
                    print(
                        " dist to target mean/median {:.3f}/{:.3f} min/max {:.3f}/{:.3f}"
                        .format(np.mean(episode_dist_to_target),
                                np.median(episode_dist_to_target),
                                np.min(episode_dist_to_target),
                                np.max(episode_dist_to_target)))

                print(
                    " Reward/Steps {:.3f} Progress/Steps: {:.3f} entropy {:.1f} value_loss {:.5f} action_loss {:.5f}\n"
                    .format(
                        np.mean(episode_rewards) / np.mean(episode_steps),
                        (0 if len(episode_rewards_progress) == 0 else
                         np.mean(episode_rewards_progress) /
                         np.mean(episode_steps)), dist_entropy, value_loss,
                        action_loss))
Example No. 2
    def __init__(self,
                 env_def,
                 processes=1,
                 dir='.',
                 version=0,
                 lr=2e-4,
                 architecture='base',
                 dropout=0,
                 reconstruct=None,
                 r_weight=.05):
        self.env_def = env_def
        self.num_processes = processes  #cpu processes
        self.lr = lr
        self.version = version
        self.save_dir = dir + '/trained_models/'

        #Setup
        pathlib.Path(self.save_dir).mkdir(parents=True, exist_ok=True)
        if self.num_mini_batch > processes:
            self.num_mini_batch = processes

        self.writer = SummaryWriter()
        self.total_steps = 0

        #State
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        if not self.no_cuda and torch.cuda.is_available(
        ) and self.cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True

        utils.cleanup_log_dir(self.log_dir)
        utils.cleanup_log_dir(self.eval_log_dir)

        torch.set_num_threads(1)

        self.level_path = None
        self.envs = None
        self.num_envs = -1
        self.set_envs(num_envs=1)

        if version > 0:
            # NOTE: `path` is expected to be defined elsewhere (e.g. the checkpoint
            # directory); it is not set in this constructor.
            self.actor_critic = self.load(path, version)
        else:
            self.actor_critic = Policy(
                self.envs.observation_space.shape,
                self.envs.action_space,
                base_kwargs={
                    'recurrent': self.recurrent_policy,
                    'shapes': list(reversed(self.env_def.model_shape)),
                    'dropout': dropout
                },
                model=architecture)
        self.actor_critic.to(self.device)

        #Reconstruction
        self.reconstruct = reconstruct is not None
        if self.reconstruct:
            #layers = self.envs.observation_space.shape[0]
            #shapes = list(self.env_def.model_shape)
            #self.r_model = Decoder(layers, shapes=shapes).to(self.device)
            reconstruct.to(self.device)
            self.r_model = lambda x: reconstruct.adapter(reconstruct(x))
            #self.r_model = lambda x: reconstruct.adapter(reconstruct(x)).clamp(min=1e-6).log()
            #self.r_loss = nn.L1Loss() #nn.NLLLoss() #nn.MSELoss()
            self.r_loss = lambda pred, true: -r_weight * (true * torch.log(
                pred.clamp(min=1e-7, max=1 - 1e-7))).sum(dim=1).mean()
            self.r_optimizer = reconstruct.optimizer  #optim.Adam(reconstruct.parameters(), lr = .0001)

        if self.algo == 'a2c':
            self.agent = A2C_ACKTR(self.actor_critic,
                                   self.value_loss_coef,
                                   self.entropy_coef,
                                   lr=self.lr,
                                   eps=self.eps,
                                   alpha=self.alpha,
                                   max_grad_norm=self.max_grad_norm)
        elif self.algo == 'ppo':
            self.agent = PPO(self.actor_critic,
                             self.clip_param,
                             self.ppo_epoch,
                             self.num_mini_batch,
                             self.value_loss_coef,
                             self.entropy_coef,
                             lr=self.lr,
                             eps=self.eps,
                             max_grad_norm=self.max_grad_norm,
                             use_clipped_value_loss=False)
        elif self.algo == 'acktr':
            self.agent = algo.A2C_ACKTR(self.actor_critic,
                                        self.value_loss_coef,
                                        self.entropy_coef,
                                        acktr=True)

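        # GAIL is disabled by default; note that `env_name` in the block below is not defined in this scope.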
        self.gail = False
        self.gail_experts_dir = './gail_experts'
        if self.gail:
            assert len(self.envs.observation_space.shape) == 1
            self.gail_discr = gail.Discriminator(
                self.envs.observation_space.shape[0] +
                self.envs.action_space.shape[0], 100, self.device)
            file_name = os.path.join(
                self.gail_experts_dir,
                "trajs_{}.pt".format(env_name.split('-')[0].lower()))

            self.gail_train_loader = torch.utils.data.DataLoader(
                gail.ExpertDataset(file_name,
                                   num_trajectories=4,
                                   subsample_frequency=20),
                batch_size=self.gail_batch_size,
                shuffle=True,
                drop_last=True)

        self.rollouts = RolloutStorage(
            self.num_steps, self.num_processes,
            self.envs.observation_space.shape, self.envs.action_space,
            self.actor_critic.recurrent_hidden_state_size)
Example No. 3
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:" + str(args.cuda_id) if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    ########## file related
    filename = args.env_name + "_" + args.algo + "_n" + str(args.max_episodes)
    if args.attack:
        filename += "_" + args.type + "_" + args.aim
        filename += "_s" + str(args.stepsize) + "_m" + str(
            args.maxiter) + "_r" + str(args.radius) + "_f" + str(args.frac)
    if args.run >= 0:
        filename += "_run" + str(args.run)

    logger = get_log(args.logdir + filename + "_" + current_time)
    logger.info(args)

    rew_file = open(args.resdir + filename + ".txt", "w")

    if args.compute:
        radius_file = open(
            args.resdir + filename + "_radius" + "_s" + str(args.stepsize) +
            "_m" + str(args.maxiter) + "_th" + str(args.dist_thres) + ".txt",
            "w")
    if args.type == "targ" or args.type == "fgsm":
        targ_file = open(args.resdir + filename + "_targ.txt", "w")

    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

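    # Build the attacker selected by --type: white-box, black-box, random, semi-random, targeted, or FGSM.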
    if args.type == "wb":
        attack_net = WbAttacker(agent,
                                envs,
                                int(args.frac * num_updates),
                                num_updates,
                                args,
                                device=device)
    elif args.type == "bb":
        attack_net = BbAttacker(agent,
                                envs,
                                int(args.frac * num_updates),
                                num_updates,
                                args,
                                device=device)
    elif args.type == "rand":
        attack_net = RandAttacker(envs,
                                  radius=args.radius,
                                  frac=args.frac,
                                  maxat=int(args.frac * num_updates),
                                  device=device)
    elif args.type == "semirand":
        attack_net = WbAttacker(agent,
                                envs,
                                int(args.frac * num_updates),
                                num_updates,
                                args,
                                device,
                                rand_select=True)
    elif args.type == "targ":
        if isinstance(envs.action_space, Discrete):
            action_dim = envs.action_space.n
            target_policy = action_dim - 1
        elif isinstance(envs.action_space, Box):
            action_dim = envs.action_space.shape[0]
            target_policy = torch.zeros(action_dim)
#            target_policy[-1] = 1
        print("target policy is", target_policy)
        attack_net = TargAttacker(agent,
                                  envs,
                                  int(args.frac * num_updates),
                                  num_updates,
                                  target_policy,
                                  args,
                                  device=device)
    elif args.type == "fgsm":
        if isinstance(envs.action_space, Discrete):
            action_dim = envs.action_space.n
            target_policy = action_dim - 1
        elif isinstance(envs.action_space, Box):
            action_dim = envs.action_space.shape[0]
            target_policy = torch.zeros(action_dim)

        def targ_policy(obs):
            return target_policy

        attack_net = FGSMAttacker(envs,
                                  agent,
                                  targ_policy,
                                  radius=args.radius,
                                  frac=args.frac,
                                  maxat=int(args.frac * num_updates),
                                  device=device)
#    if args.aim == "obs" or aim == "hybrid":
#        obs_space = gym.make(args.env_name).observation_space
#        attack_net.set_obs_range(obs_space.low, obs_space.high)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    episode = 0

    start = time.time()

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            if args.type == "fgsm":
                #                print("before", rollouts.obs[step])
                rollouts.obs[step] = attack_net.attack(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step]).clone()
#                print("after", rollouts.obs[step])
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            if args.type == "targ" or args.type == "fgsm":
                if isinstance(envs.action_space, Discrete):
                    num_target = (
                        action == target_policy).nonzero()[:, 0].size()[0]
                    targ_file.write(
                        str(num_target / args.num_processes) + "\n")
                    print("percentage of target:",
                          num_target / args.num_processes)
                elif isinstance(envs.action_space, Box):
                    target_action = target_policy.repeat(action.size()[0], 1)
                    targ_file.write(
                        str(
                            torch.norm(action - target_action).item() /
                            args.num_processes) + "\n")
                    print("percentage of target:",
                          torch.sum(action).item() / args.num_processes)
            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action.cpu())
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    #                    rew_file.write("episode: {}, total reward: {}\n".format(episode, info['episode']['r']))
                    episode += 1

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

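        # For non-FGSM attacks, poison the collected rollouts (rewards, observations, or actions) before computing returns.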
        if args.attack and args.type != "fgsm":
            if args.aim == "reward":
                logger.info(rollouts.rewards.flatten())
                rollouts.rewards = attack_net.attack_r_general(
                    rollouts, next_value).clone().detach()
                logger.info("after attack")
                logger.info(rollouts.rewards.flatten())
            elif args.aim == "obs":
                origin = rollouts.obs.clone()
                rollouts.obs = attack_net.attack_s_general(
                    rollouts, next_value).clone().detach()
                logger.info(origin)
                logger.info("after")
                logger.info(rollouts.obs)
            elif args.aim == "action":
                origin = torch.flatten(rollouts.actions).clone()
                rollouts.actions = attack_net.attack_a_general(
                    rollouts, next_value).clone().detach()
                logger.info("attack value")
                logger.info(torch.flatten(rollouts.actions) - origin)
            elif args.aim == "hybrid":
                res_aim, attack = attack_net.attack_hybrid(
                    rollouts, next_value, args.radius_s, args.radius_a,
                    args.radius_r)
                print("attack ", res_aim)
                if res_aim == "obs":
                    origin = rollouts.obs.clone()
                    rollouts.obs = attack.clone().detach()
                    logger.info(origin)
                    logger.info("attack obs")
                    logger.info(rollouts.obs)
                elif res_aim == "action":
                    origin = torch.flatten(rollouts.actions).clone()
                    rollouts.actions = attack.clone().detach()
                    logger.info("attack action")
                    logger.info(torch.flatten(rollouts.actions) - origin)
                elif res_aim == "reward":
                    logger.info(rollouts.rewards.flatten())
                    rollouts.rewards = attack.clone().detach()
                    logger.info("attack reward")
                    logger.info(rollouts.rewards.flatten())
        if args.compute:
            stable_radius = attack_net.compute_radius(rollouts, next_value)
            print("stable radius:", stable_radius)
            radius_file.write("update: {}, radius: {}\n".format(
                j, np.round(stable_radius, decimals=3)))
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        if args.attack and args.type == "bb":
            attack_net.learning(rollouts)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) >= 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            rew_file.write("updates: {}, mean reward: {}\n".format(
                j, np.mean(episode_rewards)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)


#        if episode > args.max_episodes:
#            print("reach episodes limit")
#            break

    if args.attack:
        logger.info("total attacks: {}\n".format(attack_net.attack_num))
        print("total attacks: {}\n".format(attack_net.attack_num))

    rew_file.close()
    if args.compute:
        radius_file.close()
    if args.type == "targ" or args.type == "fgsm":
        targ_file.close()
Example No. 4
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # coinrun environments need to be treated differently.
    coinrun_envs = {
        'CoinRun': 'standard',
        'CoinRun-Platforms': 'platform',
        'Random-Mazes': 'maze'
    }

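    # Build the vectorized environments; CoinRun-specific settings are passed through explicitly.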
    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         device,
                         False,
                         coin_run_level=args.num_levels,
                         difficulty=args.high_difficulty,
                         coin_run_seed=args.seed)
    if args.env_name in coinrun_envs.keys():
        observation_space_shape = (3, 64, 64)
        args.save_dir = args.save_dir + "/NUM_LEVELS_{}".format(
            args.num_levels)  # Save the level info in the trained model name

    else:
        observation_space_shape = envs.observation_space.shape

    if args.continue_ppo_training:
        actor_critic, _ = torch.load(os.path.join(args.check_point,
                                                  args.env_name + ".pt"),
                                     map_location=torch.device(device))
    elif args.cor_gail:
        embed_size = args.embed_size
        actor_critic = Policy(observation_space_shape,
                              envs.action_space,
                              hidden_size=args.hidden_size,
                              embed_size=embed_size,
                              base_kwargs={'recurrent': args.recurrent_policy})
        actor_critic.to(device)
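        # cor_gail: a correlator proposes embeddings that condition both the
        # policy and the discriminator.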
        correlator = Correlator(observation_space_shape,
                                envs.action_space,
                                hidden_dim=args.hidden_size,
                                embed_dim=embed_size,
                                lr=args.lr,
                                device=device)

        correlator.to(device)
        embeds = torch.zeros(1, embed_size)
    else:
        embed_size = 0
        actor_critic = Policy(observation_space_shape,
                              envs.action_space,
                              hidden_size=args.hidden_size,
                              base_kwargs={'recurrent': args.recurrent_policy})
        actor_critic.to(device)
        embeds = None

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         use_clipped_value_loss=True,
                         ftrl_mode=args.cor_gail or args.no_regret_gail,
                         correlated_mode=args.cor_gail)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

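    # Imitation-learning setup: plain GAIL, no-regret GAIL and correlated GAIL
    # share the same expert data loader; they differ in the discriminator used.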
    if args.gail or args.no_regret_gail or args.cor_gail:
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(
            file_name, num_trajectories=50,
            subsample_frequency=1)  # if subsample_frequency is set to a different
        # number, grad_pen might need adjustment
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)
        if args.gail:
            discr = gail.Discriminator(observation_space_shape,
                                       envs.action_space,
                                       device=device)
        if args.no_regret_gail or args.cor_gail:
            queue = deque(
                maxlen=args.queue_size
            )  # Strategy Queues: Each element of a queue is a discr strategy
            agent_queue = deque(
                maxlen=args.queue_size
            )  # Strategy Queues: Each element of a queue is an agent strategy
            pruning_frequency = 1
        if args.no_regret_gail:
            discr = regret_gail.NoRegretDiscriminator(observation_space_shape,
                                                      envs.action_space,
                                                      device=device)
        if args.cor_gail:
            discr = cor_gail.CorDiscriminator(observation_space_shape,
                                              envs.action_space,
                                              hidden_size=args.hidden_size,
                                              embed_size=embed_size,
                                              device=device)
        discr.to(device)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              observation_space_shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              embed_size)

    obs = envs.reset()

    rollouts.obs[0].copy_(obs)
    if args.cor_gail:
        rollouts.embeds[0].copy_(embeds)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
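    # One update consumes num_steps * num_processes environment steps.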
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions (roll-out)
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step], rollouts.embeds[step])

            obs, reward, done, infos = envs.step(action.to('cpu'))
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)
            # Sample mediating/correlating actions (correlated roll-out)
            if args.cor_gail:
                embeds, embeds_log_prob, mean = correlator.act(
                    rollouts.obs[step], rollouts.actions[step])
                rollouts.insert_embedding(embeds, embeds_log_prob)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1], rollouts.embeds[-1]).detach()

        if args.gail or args.no_regret_gail or args.cor_gail:
            if args.env_name not in {'CoinRun', 'Random-Mazes'}:
                if j >= 10:
                    envs.venv.eval()

            gail_epoch = args.gail_epoch
            if args.gail:
                if j < 10:
                    gail_epoch = 100  # Warm up

            # no gail-epoch warm-up is needed in the no-regret and cor_gail cases.
            for _ in range(gail_epoch):
                if utils.get_vec_normalize(envs):
                    obfilt = utils.get_vec_normalize(envs)._obfilt
                else:
                    obfilt = None

                if args.gail:
                    discr.update(gail_train_loader, rollouts, obfilt)

                if args.no_regret_gail or args.cor_gail:
                    last_strategy = discr.update(gail_train_loader, rollouts,
                                                 queue, args.max_grad_norm,
                                                 obfilt, j)

            for step in range(args.num_steps):
                if args.gail:
                    rollouts.rewards[step] = discr.predict_reward(
                        rollouts.obs[step], rollouts.actions[step], args.gamma,
                        rollouts.masks[step])
                if args.no_regret_gail:
                    rollouts.rewards[step] = discr.predict_reward(
                        rollouts.obs[step], rollouts.actions[step], args.gamma,
                        rollouts.masks[step], queue)
                if args.cor_gail:
                    rollouts.rewards[
                        step], correlator_reward = discr.predict_reward(
                            rollouts.obs[step], rollouts.actions[step],
                            rollouts.embeds[step], args.gamma,
                            rollouts.masks[step], queue)

                    rollouts.correlated_reward[step] = correlator_reward

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        if args.gail:
            value_loss, action_loss, dist_entropy = agent.update(rollouts, j)

        elif args.no_regret_gail or args.cor_gail:
            value_loss, action_loss, dist_entropy, agent_gains, agent_strategy = \
                agent.mixed_update(rollouts, agent_queue, j)

        if args.cor_gail:
            correlator.update(rollouts, agent_gains, args.max_grad_norm)

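        # Keep bounded queues of past discriminator and agent strategies,
        # pruning them as training progresses (used by the mixed updates above).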
        if args.no_regret_gail or args.cor_gail:
            queue, _ = utils.queue_update(queue, pruning_frequency,
                                          args.queue_size, j, last_strategy)
            agent_queue, pruning_frequency = utils.queue_update(
                agent_queue, pruning_frequency, args.queue_size, j,
                agent_strategy)

        rollouts.after_update()
        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            if not args.cor_gail:
                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ], os.path.join(save_path, args.env_name + ".pt"))

            else:
                print("saving models in {}".format(
                    os.path.join(save_path, args.env_name)))
                torch.save(
                    correlator.state_dict(),
                    os.path.join(save_path, args.env_name + "correlator.pt"))
                torch.save([
                    actor_critic.state_dict(),
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ], os.path.join(save_path, args.env_name + "actor.pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f},"
                " value loss/action loss {:.1f}/{}".format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
Exemplo n.º 5
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.env_name.startswith("lab_"):
        gym_name, flow_json = make_lab_env(args.env_name)

        args.env_name = gym_name

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(
            actor_critic,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(
            actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)

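    # Optional GAIL: an (obs, action) discriminator trained against expert
    # trajectories loaded from args.gail_experts_dir.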
    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir, "trajs_{}.pt".format(
                args.env_name.split('-')[0].lower()))
        
        expert_dataset = gail.ExpertDataset(
            file_name, num_trajectories=4, subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
Exemplo n.º 6
0
def main(arglist):

    FLAGS = flags.FLAGS
    FLAGS([""])

    # get all arguments from command line - there are lots!
    args = get_args(arglist)

    tb_log = LogRLAlgorithm(num_processes=args.num_processes)

    # set manual seeds for reproducibility
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # set up logging directories
    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    # set single-threaded + GPU if applicable.
    # Not sure how this interacts with num_processes.
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # make the gym environment
    envs = make_vec_envs(
        args.env_name,
        args.seed,
        args.num_processes,
        args.gamma,
        args.log_dir,
        device,
        False,
        1,
    )

    # Creates the policy network. Of note are the observation and action spaces,
    # which set the input and output sizes.
    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={"recurrent": args.recurrent_policy},
    )
    actor_critic.to(device)

    # Set up agent details
    if args.algo == "a2c":
        agent = algo.A2C_ACKTR(
            actor_critic,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm,
        )
    elif args.algo == "ppo":
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
        )
    elif args.algo == "acktr":
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    # set up imitation learning if relevant
    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split("-")[0].lower()),
        )

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True,
        )

    # setup experience buffer
    rollouts = RolloutStorage(
        args.num_steps,
        args.num_processes,
        envs.observation_space.shape,
        envs.action_space,
        actor_critic.recurrent_hidden_state_size,
    )

    # initialise environment
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    # perform main learning loop for fixed duration.
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer,
                j,
                num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr,
            )

        # for each episode
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                )

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            tb_log.expl_data_collector.add_step(action, action_log_prob,
                                                reward, done, value)
            # for each thread
            for info in infos:
                if "episode" in info.keys():  # if episode ended
                    episode_rewards.append(info["episode"]["r"])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if "bad_transition" in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(
                obs,
                recurrent_hidden_states,
                action,
                action_log_prob,
                value,
                reward,
                masks,
                bad_masks,
            )

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1],
            ).detach()

        # do imitation learning
        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step],
                    rollouts.actions[step],
                    args.gamma,
                    rollouts.masks[step],
                )

        # compute returns
        rollouts.compute_returns(
            next_value,
            args.use_gae,
            args.gamma,
            args.gae_lambda,
            args.use_proper_time_limits,
        )
        # update agent
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        # re-initialise experience buffer with current state
        rollouts.after_update()

        # save model for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save(
                [
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), "ob_rms", None)
                ],
                os.path.join(save_path, args.env_name + ".pt"),
            )

        # log each log interval & print results to cmd.
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(
                    j,
                    total_num_steps,
                    int(total_num_steps / (end - start)),
                    len(episode_rewards),
                    np.mean(episode_rewards),
                    np.median(episode_rewards),
                    np.min(episode_rewards),
                    np.max(episode_rewards),
                    dist_entropy,
                    value_loss,
                    action_loss,
                ))

            # histogram parameters
            # layer = 0
            # for l in actor_critic.base.main:
            #     try:
            #         b = l.bias
            #         w = l.weight
            #         logger.record_histogram_dict(
            #             {f"{layer}/bias": b, f"{layer}/weight": w}, prefix="layer"
            #         )
            #         layer += 1
            #     except AttributeError:
            #         pass
            # tb_log._log_stats(int(j / args.log_interval))
            # tb_log.expl_data_collector.end_epoch(int(j / args.log_interval))

        # evaluate if requested
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(
                actor_critic,
                ob_rms,
                args.env_name,
                args.seed,
                args.num_processes,
                eval_log_dir,
                device,
            )
Exemplo n.º 7
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy},
                          dimh=args.dimh)
    actor_critic.to(device)

    exp_name = "%s_%s_seed%d_dimh%d_" % (args.env_name, args.algo, args.seed,
                                         args.dimh)
    if args.gail:
        exp_name += '_gail_'

    if args.split:
        exp_name += 'splitevery' + str(args.split_every)
        if args.random_split:
            exp_name += '_rsplit'
    else:
        exp_name += 'baseline'

    writer = SummaryWriter('./runs/' + exp_name)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print(num_updates)
    stats = {
        'seed': args.seed,
        'experiment': exp_name,
        'env': args.env_name,
        'dimh': args.dimh,
        'split every': args.split_every,
        'random split': args.random_split,
        'steps': [],
        'mean reward': [],
        'actor neurons': [],
        'critic neurons': [],
    }
    save_dir = './experiment_results/%s/' % args.env_name
    stats_save_path = save_dir + exp_name
    check_path(save_dir)
    print('start')
    count = -1
    num_updates = 488 * 2
    meanreward = []
    for j in range(num_updates):
        #if j % 50 == 0:
        #    print('STEP', j)
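        # With splitting enabled, the linear LR decay is restarted every 488
        # updates (matching the hard-coded 488 * 2 total updates above).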
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            count += 1
            if j % 488 == 0:
                count = 0
                total = 488 * 2
            else:
                total = 488 * 2
            if args.split:
                utils.update_linear_schedule(
                    agent.optimizer, count, total,
                    agent.optimizer.lr if args.algo == "acktr" else args.lr)
            else:
                utils.update_linear_schedule(
                    agent.optimizer, j, num_updates,
                    agent.optimizer.lr if args.algo == "acktr" else args.lr)
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        # splitting
        if args.split and (j + 1) % args.split_every == 0 and j < 200:
            print("[INFO] split on iteration %d..." % j)
            agent.split(rollouts, args.random_split)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))
        meanreward.append(np.mean(episode_rewards))
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()

            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            stats['mean reward'].append(np.mean(episode_rewards))
            stats['steps'].append(j)
            if args.split:
                a, c = agent.actor_critic.get_num_params()
                stats['actor neurons'].append(a)
                stats['critic neurons'].append(c)

        if j % 10 == 0:
            print("[INFO] saving to ", stats_save_path)
            np.save(stats_save_path, stats)

        if j % 5 == 0:
            s = (j + 1) * args.num_processes * args.num_steps
            if args.split:
                a, c = agent.actor_critic.get_num_params()
                writer.add_scalar('A neurons', a, s)
                writer.add_scalar('C neurons', c, s)
            writer.add_scalar('mean reward', np.mean(episode_rewards), s)
            writer.add_scalar('entropy loss', dist_entropy, s)
            writer.add_scalar('value loss', value_loss, s)
            writer.add_scalar('action loss', action_loss, s)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

    writer.close()
    import pickle
    pickle.dump(meanreward, open(stats_save_path + '.pkl', 'wb'))
Exemplo n.º 8
0
def main():
    all_episode_rewards = []  ### record 6/29
    all_temp_rewards = []  ### record 6/29
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print('num_updates ', num_updates)
    print('num_steps ', args.num_steps)
    count = 0
    h5_path = './data/' + args.env_name
    if not os.path.exists(h5_path):
        os.makedirs(h5_path)
    h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % (count)
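    # Buffers for dumping rollout trajectories (states, actions, rewards, done,
    # lengths) to HDF5; a new file is started once 100 rollouts are stored.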
    data = {}
    data['states'] = []
    data['actions'] = []
    data['rewards'] = []
    data['done'] = []
    data['lengths'] = []

    episode_step = 0

    for j in range(num_updates):  ### num-steps

        temp_states = []
        temp_actions = []
        temp_rewards = []
        temp_done = []
        temp_lengths = []

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            if j == 0 and step == 0:
                print('obs ', type(rollouts.obs[step]),
                      rollouts.obs[step].shape)
                print('hidden_states ',
                      type(rollouts.recurrent_hidden_states[step]),
                      rollouts.recurrent_hidden_states[step].shape)
                print('action ', type(action), action.shape)
                print('action prob ', type(action_log_prob),
                      action_log_prob.shape)
                print('-' * 20)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            #print(infos)
            #print(reward)
            temp_states += [np.array(rollouts.obs[step].cpu())]
            temp_actions += [np.array(action.cpu())]
            #temp_rewards += [np.array(reward.cpu())]
            temp_rewards += [np.array([infos[0]['myrewards']])
                             ]  ### for halfcheetah the raw reward can't be used directly !! 6/29
            temp_done += [np.array(done)]

            if j == 0 and step == 0:
                print('obs ', type(obs), obs.shape)
                print('reward ', type(reward), reward.shape)
                print('done ', type(done), done.shape)
                print('infos ', len(infos))
                for k, v in infos[0].items():
                    print(k, v.shape)
                print()

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    all_episode_rewards += [info['episode']['r']]  ### record 6/29

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        temp_lengths = len(temp_states)
        temp_states = np.concatenate(temp_states)
        temp_actions = np.concatenate(temp_actions)
        temp_rewards = np.concatenate(temp_rewards)
        temp_done = np.concatenate(temp_done)
        #print('temp_lengths',temp_lengths)
        #print('temp_states', temp_states.shape)
        #print('temp_actions', temp_actions.shape)
        #print('temp_rewards', temp_rewards.shape)
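        # Only keep trajectories from the last 60% of training, when the policy
        # has presumably improved enough to serve as expert data.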
        if j > int(0.4 * num_updates):
            data['states'] += [temp_states]
            data['actions'] += [temp_actions]
            data['rewards'] += [temp_rewards]
            data['lengths'] += [temp_lengths]
            data['done'] += [temp_done]
            #print('temp_lengths',data['lengths'].shape)
            #print('temp_states', data['states'].shape)
            #print('temp_actions', data['actions'].shape)
            #print('temp_rewards', data['rewards'].shape)

            if args.save_expert and len(data['states']) >= 100:
                with h5py.File(h5_filename, 'w') as f:
                    f['states'] = np.array(data['states'])
                    f['actions'] = np.array(data['actions'])
                    f['rewards'] = np.array(data['rewards'])
                    f['done'] = np.array(data['done'])
                    f['lengths'] = np.array(data['lengths'])
                    #print('f_lengths',f['lengths'].shape)
                    #print('f_states', f['states'].shape)
                    #print('f_actions', f['actions'].shape)
                    #print('f_rewards', f['rewards'].shape)

                count += 1
                h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % (
                    count)
                data['states'] = []
                data['actions'] = []
                data['rewards'] = []
                data['done'] = []
                data['lengths'] = []

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + "_%d.pt" % (args.seed)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            #np.save(os.path.join(save_path, args.env_name+"_%d"%(args.seed)), all_episode_rewards)  ### save record 6/29
            #print(temp_rewards)
            print("temp rewards size", temp_rewards.shape, "mean",
                  np.mean(temp_rewards), "min", np.min(temp_rewards), "max",
                  np.max(temp_rewards))
            all_temp_rewards += [temp_rewards]
            np.savez(os.path.join(save_path,
                                  args.env_name + "_%d" % (args.seed)),
                     episode=all_episode_rewards,
                     timestep=all_temp_rewards)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
    # data['states'] = np.array(data['states'])
Exemplo n.º 9
0
def main():
    args = get_args()

    # Record trajectories
    if args.record_trajectories:
        record_trajectories()
        return

    print(args)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # Append the model name
    log_dir = os.path.expanduser(args.log_dir)
    log_dir = os.path.join(log_dir, args.model_name, str(args.seed))

    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, log_dir, device, False)
    obs_shape = len(envs.observation_space.shape)

    # Use a tanh output activation for CarRacing, if requested
    print("Loaded env...")
    activation = None
    if args.env_name == 'CarRacing-v0' and args.use_activation:
        activation = torch.tanh
    print(activation)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'env': args.env_name
                          },
                          activation=activation)
    actor_critic.to(device)
    # Load from previous model
    if args.load_model_name:
        state = torch.load(
            os.path.join(args.save_dir, args.load_model_name,
                         args.load_model_name + '_{}.pt'.format(args.seed)))[0]
        try:
            actor_critic.load_state_dict(state)
        except:
            actor_critic = state

    # If BCGAIL, then decay factor and gamma should be float
    if args.bcgail:
        assert type(args.decay) == float
        assert type(args.gailgamma) == float
        if args.decay < 0:
            args.decay = 1
        elif args.decay > 1:
            args.decay = 0.5**(1. / args.decay)

        print('Gamma: {}, decay: {}'.format(args.gailgamma, args.decay))
        print('BCGAIL used')

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         gamma=args.gailgamma,
                         decay=args.decay,
                         act_space=envs.action_space,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

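    # GAIL setup: an MLP discriminator (optionally with RED/SAIL) for flat
    # observations, or a CNN discriminator for image observations.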
    if args.gail:
        if len(envs.observation_space.shape) == 1:

            # Load RED here
            red = None
            if args.red:
                red = gail.RED(
                    envs.observation_space.shape[0] +
                    envs.action_space.shape[0], 100, device, args.redsigma,
                    args.rediters)

            discr = gail.Discriminator(envs.observation_space.shape[0] +
                                       envs.action_space.shape[0],
                                       100,
                                       device,
                                       red=red,
                                       sail=args.sail,
                                       learn=args.learn)
            file_name = os.path.join(
                args.gail_experts_dir,
                "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

            expert_dataset = gail.ExpertDataset(file_name,
                                                num_trajectories=args.num_traj,
                                                subsample_frequency=1)
            args.gail_batch_size = min(args.gail_batch_size,
                                       len(expert_dataset))
            drop_last = len(expert_dataset) > args.gail_batch_size
            gail_train_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset,
                batch_size=args.gail_batch_size,
                shuffle=True,
                drop_last=drop_last)
            print("Data loader size", len(expert_dataset))
        else:
            # env observation shape is 3 => its an image
            assert len(envs.observation_space.shape) == 3
            discr = gail.CNNDiscriminator(envs.observation_space.shape,
                                          envs.action_space, 100, device)
            file_name = os.path.join(args.gail_experts_dir, 'expert_data.pkl')

            expert_dataset = gail.ExpertImageDataset(file_name,
                                                     act=envs.action_space)
            gail_train_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset,
                batch_size=args.gail_batch_size,
                shuffle=True,
                drop_last=len(expert_dataset) > args.gail_batch_size,
            )
            print('Dataloader size', len(gail_train_loader))

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print(num_updates)
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                try:
                    envs.venv.eval()
                except AttributeError:
                    # the wrapped venv may not expose eval(); skip if unavailable
                    pass

            gail_epoch = args.gail_epoch
            if j < 10 and obs_shape == 1:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                if obs_shape == 1:
                    discr.update(gail_train_loader, rollouts,
                                 utils.get_vec_normalize(envs)._obfilt)
                else:
                    discr.update(gail_train_loader, rollouts, None)
            if obs_shape == 3:
                obfilt = None
            else:
                obfilt = utils.get_vec_normalize(envs)._rev_obfilt

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step], obfilt
                )  # The reverse function is passed down for RED to receive unnormalized obs which it is trained on
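            # GAIL replaces the environment reward with a discriminator-based surrogate;
            # typical choices are r_t = -log(1 - D(s_t, a_t)) or log D(s_t, a_t) - log(1 - D(s_t, a_t)),
            # so from here on rollouts.rewards holds imitation rewards rather than env rewards.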

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)
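        # With use_gae=True this presumably computes GAE(gamma, gae_lambda):
        #   delta_t = r_t + gamma * V(s_{t+1}) * mask_{t+1} - V(s_t)
        #   A_t     = delta_t + gamma * gae_lambda * mask_{t+1} * A_{t+1}
        # and stores returns_t = A_t + V(s_t) for the policy/value update below.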

        if args.bcgail:
            if obs_shape == 3:
                obfilt = None
            else:
                obfilt = utils.get_vec_normalize(envs)._obfilt
            value_loss, action_loss, dist_entropy = agent.update(
                rollouts, gail_train_loader, obfilt)
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.model_name)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic.state_dict(),
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None),
                getattr(utils.get_vec_normalize(envs), 'ret_rms', None)
            ],
                       os.path.join(
                           save_path,
                           args.model_name + "_{}.pt".format(args.seed)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
Exemplo n.º 10
0
def main():
    args = get_args()

    if args.state:
        assert args.algo == "ppo"

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    if args.load_dir:

        actor_critic, obsrms = torch.load(args.load_dir)
        vec_norm = utils.get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.train()
            vec_norm.ob_rms = obsrms
        actor_critic.base.deterministic = args.deterministic
        actor_critic.base.humanoid = args.env_name.startswith("SH")

    else:
        if args.state:
            actor_critic = StatePolicy(envs.observation_space.shape,
                                       envs.action_space,
                                       base_kwargs={
                                           'recurrent':
                                           args.recurrent_policy,
                                           'deterministic':
                                           args.deterministic,
                                           'hidden_size':
                                           args.code_size,
                                           'humanoid':
                                           args.env_name.startswith("SH")
                                       })
        else:
            actor_critic = Policy(
                envs.observation_space.shape,
                envs.action_space,
                base_kwargs={'recurrent': args.recurrent_policy},
            )
        actor_critic.to(device)

    if args.state:
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         beta=args.beta,
                         beta_end=args.beta_end,
                         state=True)
    elif args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
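        # With num_trajectories=4 and subsample_frequency=20 the expert dataset presumably
        # keeps every 20th (state, action) pair from 4 stored trajectories, which keeps the
        # discriminator's training set small and less temporally correlated.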
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    # A bunch of tensors; circular buffer

    if args.state:

        rollouts = RolloutStorage(args.num_steps,
                                  args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size,
                                  code_size=args.code_size)

        mis = []

    else:
        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)

    # Populate the first observation in rollouts
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # episode_rewards is a deque that keeps only the last 10 episode returns
    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):
        # print(j)
        agent.ratio = j / num_updates
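        # agent.ratio ramps linearly from 0 toward 1 over training; presumably agent.update
        # uses it to anneal a coefficient (e.g. from beta toward beta_end).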
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                if args.state:
                    value, action, action_log_prob, recurrent_hidden_states, eps, code = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

                    # import ipdb; ipdb.set_trace()

                else:
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            if args.state:
                rollouts.insert(obs,
                                recurrent_hidden_states,
                                action,
                                action_log_prob,
                                value,
                                reward,
                                masks,
                                bad_masks,
                                eps=eps,
                                code=code)
            else:
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_prob, value, reward, masks,
                                bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy, mi_loss = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + "_" + str(j) + ".pt"))

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()

            if args.state:
                print("DKL loss " + str(mi_loss))
                mis.append(mi_loss)

            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms

            if args.env_name.startswith("SH"):
                masses = [
                    0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.15, 1.25, 1.35, 1.45, 1.55
                ]
                damps = [
                    0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.15, 1.25, 1.35, 1.45, 1.55
                ]
                means = np.zeros((len(masses), len(damps)))
                stds = np.zeros((len(masses), len(damps)))
                for m_i in range(len(masses)):
                    for d_i in range(len(damps)):
                        m = masses[m_i]
                        d = damps[d_i]
                        u, s = evaluate(
                            actor_critic, ob_rms,
                            'OracleSHTest' + str(m) + "_" + str(d) + '-v0',
                            args.seed, args.num_processes, eval_log_dir,
                            device)
                        means[m_i, d_i] = u
                        stds[m_i, d_i] = s
                a, _ = os.path.splitext(args.load_dir)
                a = a.split("_")[-1]
                with open("sh_means_" + str(a) + ".npz", "wb") as f:
                    np.save(f, means)
                with open("sh_stds_" + str(a) + ".npz", "wb") as f:
                    np.save(f, stds)
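                # np.save on an open handle writes plain .npy data despite the ".npz"
                # suffix, so the arrays can be read back with, e.g.,
                #   means = np.load("sh_means_" + str(a) + ".npz")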
            elif args.env_name.startswith("Oracle"):
                fs = [
                    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
                    18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
                    33, 34, 35, 36, 37, 38, 39, 40
                ]
                ls = [
                    0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6,
                    0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.00, 1.05, 1.10,
                    1.15, 1.20, 1.25, 1.30, 1.35, 1.40, 1.45, 1.50, 1.55, 1.60,
                    1.65, 1.70
                ]
                a, _ = os.path.splitext(args.load_dir)
                a = a.split("_")[-1]

                means = np.zeros((len(fs), len(ls)))
                stds = np.zeros((len(fs), len(ls)))
                for f_i in range(len(fs)):
                    for l_i in range(len(ls)):
                        f = fs[f_i]
                        l = ls[l_i]
                        u, s = evaluate(
                            actor_critic, ob_rms, 'OracleCartpoleTest' +
                            str(f) + "_" + str(l) + '-v0', args.seed,
                            args.num_processes, eval_log_dir, device)
                        means[f_i, l_i] = u
                        stds[f_i, l_i] = s
                with open("cp_means" + str(a) + ".npz", "wb") as f:
                    np.save(f, means)
                with open("cp_stds" + str(a) + ".npz", "wb") as f:
                    np.save(f, stds)
            elif args.env_name.startswith("HC"):
                ds = [
                    50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600,
                    650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150,
                    1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650,
                    1700, 1750, 1800, 1850, 1900, 1950, 2000
                ]
                us = np.zeros(len(ds), dtype=float)  # float storage; np.zeros_like(ds) would give an int array
                ss = np.zeros(len(ds), dtype=float)
                for i in range(len(ds)):
                    d = ds[i]
                    u, s = evaluate(actor_critic, ob_rms,
                                    "OracleHalfCheetahTest_" + str(d) + "-v0",
                                    args.seed, args.num_processes,
                                    eval_log_dir, device)
                    us[i] = u
                    ss[i] = s
                a, _ = os.path.splitext(args.load_dir)
                a = a.split("_")[-1]
                with open("hc_means" + str(a) + ".npz", "wb") as f:
                    np.save(f, us)
                with open("hc_stds" + str(a) + ".npz", "wb") as f:
                    np.save(f, ss)
            assert False, "Evaluation Ended"
Exemplo n.º 11
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    # Fix the random seed (args.seed) so results are reproducible:
    # torch.manual_seed seeds the CPU RNG, torch.cuda.manual_seed_all seeds
    # every GPU, and np.random.seed fixes NumPy's generator.
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False  # don't auto-tune per-layer conv algorithms
        torch.backends.cudnn.deterministic = True  # always use the same deterministic conv algorithms

    log_dir = os.path.expanduser(args.log_dir)  # expand '~' in the path
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)  # create the log directory; if it already exists, clear and recreate it
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)  # keep PyTorch from grabbing too many CPU threads
    device = torch.device("cuda:0" if args.cuda else "cpu")  # pick GPU or CPU

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    # The policy's 'main' attribute is a conv network that encodes the input images;
    # its 'critic_linear' attribute is a fully connected head, presumably producing the value estimate.
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))
        # How file_name is assembled, e.g. for HalfCheetah-v2:
        #   args.gail_experts_dir                 -> ./gail_experts
        #   args.env_name.split('-')              -> ['HalfCheetah', 'v2']
        #   args.env_name.split('-')[0].lower()   -> halfcheetah
        #   file_name                             -> ./gail_experts/trajs_halfcheetah.pt

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    episode_len = 0
    episode_return = 0
    episode_num = 0
    total_steps = 0

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            #print("value:", value, "action:", action, "action_log_prob:", action_log_prob, "recurrent_hidden_stes:", recurrent_hidden_states)
            # value: tensor([[-1.6006]], device='cuda:0')
            # action: tensor([[0.2846, 0.4442, 0.1657, -1.0094, -1.7039, 0.6084]],
            #                device='cuda:0')
            # action_log_prob: tensor([[-7.2011]], device='cuda:0')
            # recurrent_hidden_stes: tensor([[0.]], device='cuda:0')
            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)
            episode_len += 1
            episode_return += reward
            total_steps += 1

            if done:  # assumes num_processes == 1, so done is a single-element array
                data = [episode_return, episode_len, total_steps]
                csv_path = "data/csv/GAIL_pytorch_Ant.csv"
                with open(csv_path, "a+", newline='') as f:
                    # print("{} got a new row".format(csv_path))
                    csv_writer = csv.writer(f)
                    csv_writer.writerow(data)
                episode_return, episode_len = 0, 0
                episode_num += 1
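                # The rows appended above are [episode_return, episode_len, total_steps] with
                # no header; note that episode_return is accumulated as a tensor here, so the
                # first column is written as a tensor repr unless it is converted with .item().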

            #print("obs:", obs,"rewards:", reward,"donne:", done,"infos:", infos)


#           obs: tensor([[-0.2471,  0.5996, -0.4484,  1.2435,  0.1895,  1.3830, -0.6217,  0.7217,
#                         -0.6454,  2.1233, -0.8465, -0.5543,  1.2418, -0.9192,  2.0461, -0.7358,
#                         0.9339]], device='cuda:0')
#            rewards: tensor([[-0.0649]]) donne: [False]
#            infos: [{'reward_run': -0.6749512241805, 'reward_ctrl': -0.7481081485748291}]
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
Exemplo n.º 12
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)
    # import pdb; pdb.set_trace()
    save_path = os.path.join(args.save_dir, args.algo)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)
    # import pdb; pdb.set_trace()
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})

    # transforms = [ hl.transforms.Prune('Constant') ] # Removes Constant nodes from graph.

    # graph = hl.build_graph(actor_critic, torch.zeros([1, 1, 64, 64]), transforms=transforms)
    # graph.theme = hl.graph.THEMES['blue'].copy()
    # graph.save('rnn_hiddenlayer2', format='png')
    # print(args.re)
    # import pdb; pdb.set_trace()
    my_model_state_dict = actor_critic.state_dict()
    count = 0
    pretrained_weights = torch.load('net_main_4rh_v2_64.pth')
    # pretrained_weights = torch.load(os.path.join(save_path, args.env_name + "_ft.pt"))
    # pretrained_weights['']

    old_names = list(pretrained_weights.items())
    pretrained_weights_items = list(pretrained_weights.items())
    for key, value in my_model_state_dict.items():
        layer_name, weights = pretrained_weights_items[count]
        my_model_state_dict[key] = weights
        print(count)
        print(layer_name)
        count += 1
        if layer_name == 'enc_dense.bias':
            break
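    # The loop above copies pretrained weights into the policy purely by position,
    # stopping once the encoder's last parameter ('enc_dense.bias') has been copied,
    # so only the encoder portion of net_main_4rh_v2_64.pth is transferred; parameter
    # order and shapes are assumed to line up one-to-one up to that point.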
    # pretrained_weights = torch.load(os.path.join(save_path, args.env_name + "_random.pt"))[1]
    actor_critic.load_state_dict(my_model_state_dict)
    start_epoch = 0
    ka = 0

    # for param in actor_critic.parameters():
    #     ka += 1
    #     # import pdb; pdb.set_trace()
    #     param.requires_grad = False
    #     if ka == 14:
    #         break
    count = 0
    # import pdb; pdb.set_trace()

    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    rewards_mean = []
    rewards_median = []
    val_loss = []
    act_loss = []
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(start_epoch, num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                actor_critic.state_dict(),
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + "_finetune.pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            rewards_mean.append(np.mean(episode_rewards))
            rewards_median.append(np.median(episode_rewards))
            val_loss.append(value_loss)
            act_loss.append(action_loss)
            torch.save(
                rewards_mean,
                "./plot_data/" + args.env_name + "_avg_rewards_finetune.pt")
            torch.save(
                rewards_median,
                "./plot_data/" + args.env_name + "_median_rewards_finetune.pt")
            # torch.save(val_loss, "./plot_data/"+args.env_name+"_val_loss_enc_weights.pt")
            # torch.save(act_loss, "./plot_data/"+args.env_name+"_act_loss_enc_weights.pt")

            plt.plot(rewards_mean)
            # print(plt_points2)
            plt.savefig("./imgs/" + args.env_name + "avg_reward_finetune.png")
            # plt.show(block = False)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
Exemplo n.º 13
0
def run(args):
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.model.recurrent:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    use_wandb = args.use_wandb
    eval_eps = args.eval_eps

    if use_wandb:
        experiment_name = f"{args.full_title}_{args.run_id}"
        from wandb_key import WANDB_API_KEY

        os.environ['WANDB_API_KEY'] = WANDB_API_KEY

        wandb.init(project="atari_ppo", name=experiment_name)
        wandb.config.update(dict(flatten_cfg(args)))

    if args.seed == 0:
        args.seed = args.run_id + 1

    print(f"SEED: {args.seed}")
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = args.out_dir

    os.environ['OPENAI_LOGDIR'] = args.out_dir

    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    flog = open(log_dir + "/logs.csv", 'w')
    log_writer = csv.DictWriter(flog, LOG_HEADER.keys())
    log_writer.writeheader()

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, log_dir, device, False)

    base_model = get_model(args.model, envs.observation_space.shape,
                           envs.action_space)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_model,
                          base_kwargs=args.model)
    actor_critic.to(device)

    print("Neural Network:")
    print(actor_critic)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    eval_episodes = args.eval_episodes
    repeat_eval_eps = getattr(args, "repeat_eval_eps", 1)
    eval_env_max_steps = 6000 * (eval_episodes // args.num_processes + 1)

    # load eval envs checkpoints
    num_batch_eval_ckpt_envs = 10
    eval_checkpoint_envs = np.load("env_test_states.npy",
                                   allow_pickle=True).item()[args.env_name]
    eval_checkpoint_envs = eval_checkpoint_envs[:(num_batch_eval_ckpt_envs *
                                                  args.num_processes)]  # 80

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {} | num timesteps {} | FPS {} | Last {} training episodes: mean/median "
                "reward {:.1f}/{:.1f} | min/max reward {:.1f}/{:.1f}".format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards), dist_entropy, value_loss,
                    action_loss))

            data_plot = {
                "update": j,
                "timesteps": total_num_steps,
                "reward": np.mean(episode_rewards),
                "median": np.median(episode_rewards),
                "min": np.min(episode_rewards),
                "max": np.max(episode_rewards),
                "dist_entropy": dist_entropy,
                "value_loss": value_loss,
                "action_loss": action_loss,
            }

            log_writer.writerow(data_plot)
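            # csv.DictWriter rejects keys that are not in its fieldnames, so the data_plot
            # keys above are assumed to match the columns declared in LOG_HEADER.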

            if use_wandb:
                wandb.log(data_plot, step=total_num_steps)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            total_num_steps = (j + 1) * args.num_processes * args.num_steps

            ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)

            deterministic = args.eval_determinitistic

            make_env_args = (args.env_name, args.seed + args.num_processes,
                             args.num_processes, args.gamma, eval_log_dir,
                             device, True)

            eval_args = (actor_critic, ob_rms, args.num_processes, device)
            eval_kwargs = dict({
                "deterministic": determinitistic,
                "eval_ep": eval_episodes,
                "max_steps": eval_env_max_steps,
                "eval_envs": None,
            })
            # --------------------------------------------------------------------------------------
            # Evaluate simple 1 step eps greedy

            eval_inf = dict()
            base_score = 0

            evaluations = [
                ("train",
                 dict({
                     "eps": 0.,
                     "repeat_eps": 1,
                     "use_rand_actions": True
                 })),
                ("eps",
                 dict({
                     "eps": eval_eps,
                     "repeat_eps": 1,
                     "use_rand_actions": True
                 })),
                ("eps_rp_10",
                 dict({
                     "eps": eval_eps,
                     "repeat_eps": 10,
                     "use_rand_actions": True
                 })),
                ("eps_rp_10_la",
                 dict({
                     "eps": eval_eps,
                     "repeat_eps": 10,
                     "use_rand_actions": False
                 })),
                ("eps10_rp_20",
                 dict({
                     "eps": eval_eps * 2,
                     "repeat_eps": 20,
                     "use_rand_actions": True
                 })),
            ]
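            # The five settings above presumably vary an epsilon-greedy perturbation at
            # evaluation time: "train" is the unperturbed policy (eps=0), the "eps*" settings
            # inject random actions (or, with use_rand_actions=False, repeat the last action)
            # with the given probability for repeat_eps consecutive steps, and the *_gap
            # entries computed below measure the drop relative to the unperturbed score.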

            for eval_name, eval_custom_args in evaluations:
                eval_envs = make_vec_envs(*make_env_args)
                eval_kwargs["eval_envs"] = eval_envs
                eval_info = evaluate_same_env(*eval_args, **eval_kwargs,
                                              **eval_custom_args)
                eval_envs.close()

                if eval_info is not None:
                    for k, v in eval_info.items():
                        eval_inf[f"{eval_name}_{k}"] = v

                if eval_name == "train":
                    if eval_info is None:
                        base_score = None
                    else:
                        base_score = eval_info["eval_reward"]
                elif base_score is not None:
                    eval_inf[f"{eval_name}_gap"] = base_score - eval_info[
                        "eval_reward"]

            # --------------------------------------------------------------------------------------
            # Eval from checkpoint
            ckpt_r = []
            for batch_i in range(0, len(eval_checkpoint_envs),
                                 args.num_processes):
                eval_states = eval_checkpoint_envs[batch_i:batch_i +
                                                   args.num_processes]

                eval_envs = make_vec_envs_state(args.env_name,
                                                args.seed + args.num_processes,
                                                args.num_processes, args.gamma,
                                                eval_log_dir, device, True,
                                                eval_states)

                eval_kwargs["eval_envs"] = eval_envs
                eval_info = evaluate_first_ep(actor_critic,
                                              ob_rms,
                                              args.num_processes,
                                              device,
                                              eval_envs=eval_envs)
                eval_envs.close()

                ckpt_r += eval_info["eval_reward"]

            eval_inf[f"eval_ckpt_reward"] = np.mean(ckpt_r)
            if base_score != 0 and base_score is not None:
                eval_inf[f"eval_ckpt_reward_gap"] = base_score - np.mean(
                    ckpt_r)

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(ckpt_r), np.mean(ckpt_r)))

            # --------------------------------------------------------------------------------------

            if use_wandb and len(eval_inf) > 0:
                wandb.log(eval_inf, step=total_num_steps)
Exemplo n.º 14
0
        lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm,
    )
elif args.algo == "random":
    agent = algo.RANDOM_AGENT(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)

    actor_critic = RandomPolicy(
        obs_shape, envs.action_space, base_kwargs={"recurrent": args.recurrent_policy}, navi=args.navi, base=base,
    )
elif args.algo == "acktr":
    agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)

if args.gail:
    assert len(envs.observation_space.shape) == 1
    discr = gail.Discriminator(envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device)
    file_name = os.path.join(args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split("-")[0].lower()))

    gail_train_loader = torch.utils.data.DataLoader(
        gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20),
        batch_size=args.gail_batch_size,
        shuffle=True,
        drop_last=True,
    )

rollouts = RolloutStorage(
    args.num_steps,
    args.num_processes,
    envs.observation_space.shape,
    envs.action_space,
    actor_critic.recurrent_hidden_state_size,
Exemplo n.º 15
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    save_name = '%s_%s' % (args.env_name, args.algo)
    if args.postfix != '':
        save_name += ('_' + args.postfix)

    logger_filename = os.path.join(log_dir, save_name)
    logger = utils.create_logger(logger_filename)

    torch.set_num_threads(1)
    device = torch.device("cuda:%d" % args.gpu if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         device,
                         False,
                         4,
                         obs_type="grid" if args.grid else "image",
                         skip_frames=args.num_skip_frames)

    if args.load_dir != None:
        actor_critic, ob_rms = \
                torch.load(os.path.join(args.load_dir), map_location=lambda storage, loc: storage)
        vec_norm = utils.get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.ob_rms = ob_rms
        print("load pretrained...")
    else:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base="grid" if args.grid else None,
                              base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    lines = deque(maxlen=10)
    start = time.time()
    kk = 0
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    # learning_start = 1000
    learning_start = 0
    best_reward = -100
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)
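            # In the reference pytorch-a2c-ppo-acktr utils, update_linear_schedule
            # presumably anneals the step size as lr_j = lr * (1 - j / num_updates),
            # so it reaches roughly zero by the final update.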

        explore = exploration_rate(j - learning_start, 'exp')
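        # exploration_rate presumably returns an exponentially decaying epsilon;
        # `explore` only feeds the epsilon-greedy block commented out below and
        # appears otherwise unused in this loop.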
        # print(j)
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # if j < learning_start:
            #     action[0, 0] = random.randint(0, envs.action_space.n - 1)
            # elif random.uniform(0, 1) < explore:
            #     action[0, 0] = random.randint(0, envs.action_space.n - 1)
            # else:
            #     pass

            # Observe reward and next obs
            # action[0, 0] = 1
            # envs.take_turns()
            obs, reward, done, infos = envs.step(action)
            # print(obs)

            # im = Image.fromarray(obs[0].reshape(224 * 4, -1).cpu().numpy().astype(np.uint8))
            # im.save("samples/%d.png" % kk)
            # kk += 1
            # info = infos[0]
            # if len(info) > 0:
            #     print(info)
            # print(done)
            # print(infos)
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                if 'sent' in info.keys():
                    lines.append(info['sent'])

            # kk += 1
            # print(action.shape)
            # print(obs.shape)
            # print(done.shape)
            # if done[0]:
            #     print(time.time() - start)
            #     print(kk)
            #     exit()

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
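            # bad_masks flags time-limit truncations ('bad_transition'); with
            # use_proper_time_limits the return at such a step is presumably
            # bootstrapped from the value estimate instead of being treated as a
            # true terminal.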
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])
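            # The loop above swaps the environment reward for the discriminator-based
            # GAIL reward (something like log D - log(1 - D) in the reference
            # implementation) before returns are computed.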

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)
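        # With use_gae=True, compute_returns presumably applies the standard
        # GAE(lambda) recursion:
        #   delta_t = r_t + gamma * V_{t+1} * mask_{t+1} - V_t
        #   A_t     = delta_t + gamma * lambda * mask_{t+1} * A_{t+1}
        # and stores returns_t = A_t + V_t.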

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "" \
                and np.mean(episode_rewards) > best_reward:
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            best_reward = np.mean(episode_rewards)
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, save_name + ".pt"))
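            # The checkpoint is a two-element list [policy, ob_rms], matching the
            # torch.load unpacking at the top of this example so evaluation can
            # restore the same observation normalization.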

        # print(episode_rewards)

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            if j < learning_start:
                logger.info("random action")
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            logger.info(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

            logger.info(
                ' lines sent: mean/median lines {:.1f}/{:.1f}, min/max lines {:.1f}/{:.1f}\n'
                .format(np.mean(lines), np.median(lines), np.min(lines),
                        np.max(lines)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
Exemplo n.º 16
0
def main():
    args = get_args()

    # Record trajectories
    if args.record_trajectories:
        record_trajectories()
        return

    print(args)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # Append the model name
    log_dir = os.path.expanduser(args.log_dir)
    log_dir = os.path.join(log_dir, args.model_name, str(args.seed))

    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, log_dir, device, False)

    # Take activation for carracing
    print("Loaded env...")
    activation = None
    if args.env_name == 'CarRacing-v0' and args.use_activation:
        activation = torch.tanh
    print(activation)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'env': args.env_name
                          },
                          activation=activation)
    actor_critic.to(device)
    # Load from previous model
    if args.load_model_name:
        state = torch.load(
            os.path.join(args.save_dir, args.load_model_name,
                         args.load_model_name + '_{}.pt'.format(args.seed)))[0]
        try:
            actor_critic.load_state_dict(state)
        except Exception:
            # Fall back to checkpoints that stored the full model object
            actor_critic = state

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        if len(envs.observation_space.shape) == 1:
            discr = gail.Discriminator(
                envs.observation_space.shape[0] + envs.action_space.shape[0],
                100, device)
            file_name = os.path.join(
                args.gail_experts_dir,
                "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

            expert_dataset = gail.ExpertDataset(file_name,
                                                num_trajectories=3,
                                                subsample_frequency=1)
            expert_dataset_test = gail.ExpertDataset(file_name,
                                                     num_trajectories=1,
                                                     start=3,
                                                     subsample_frequency=1)
            drop_last = len(expert_dataset) > args.gail_batch_size
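            # Drop the incomplete last batch only when at least one full batch
            # exists; otherwise keep it so the loader is never empty.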
            gail_train_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset,
                batch_size=args.gail_batch_size,
                shuffle=True,
                drop_last=drop_last)
            gail_test_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset_test,
                batch_size=args.gail_batch_size,
                shuffle=False,
                drop_last=False)
            print(len(expert_dataset), len(expert_dataset_test))
        else:
            # env observation shape is 3 => it's an image
            assert len(envs.observation_space.shape) == 3
            discr = gail.CNNDiscriminator(envs.observation_space.shape,
                                          envs.action_space, 100, device)
            file_name = os.path.join(args.gail_experts_dir, 'expert_data.pkl')

            expert_dataset = gail.ExpertImageDataset(file_name, train=True)
            test_dataset = gail.ExpertImageDataset(file_name, train=False)
            gail_train_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset,
                batch_size=args.gail_batch_size,
                shuffle=True,
                drop_last=len(expert_dataset) > args.gail_batch_size,
            )
            gail_test_loader = torch.utils.data.DataLoader(
                dataset=test_dataset,
                batch_size=args.gail_batch_size,
                shuffle=False,
                drop_last=len(test_dataset) > args.gail_batch_size,
            )
            print('Dataloader size', len(gail_train_loader))

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()
    #num_updates = int(
    #args.num_env_steps) // args.num_steps // args.num_processes
    num_updates = args.num_steps
    print(num_updates)

    # count the number of times validation loss increases
    val_loss_increase = 0
    prev_val_action = np.inf
    best_val_loss = np.inf

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                try:
                    envs.venv.eval()
                except:
                    pass

            gail_epoch = args.gail_epoch
            #if j < 10:
            #gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                #discr.update(gail_train_loader, rollouts,
                #None)
                pass

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        #value_loss, action_loss, dist_entropy = agent.update(rollouts)
        value_loss = 0
        dist_entropy = 0
        for data in gail_train_loader:
            expert_states, expert_actions = data
            expert_states = Variable(expert_states).to(device)
            expert_actions = Variable(expert_actions).to(device)
            loss = agent.update_bc(expert_states, expert_actions)
            action_loss = loss.data.cpu().numpy()
        print("Epoch: {}, Loss: {}".format(j, action_loss))

        with torch.no_grad():
            cnt = 0
            val_action_loss = 0
            for data in gail_test_loader:
                expert_states, expert_actions = data
                expert_states = Variable(expert_states).to(device)
                expert_actions = Variable(expert_actions).to(device)
                loss = agent.get_action_loss(expert_states, expert_actions)
                val_action_loss += loss.data.cpu().numpy()
                cnt += 1
            val_action_loss /= cnt
            print("Val Loss: {}".format(val_action_loss))

        #rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":

            if val_action_loss < best_val_loss:
                val_loss_increase = 0
                best_val_loss = val_action_loss
                save_path = os.path.join(args.save_dir, args.model_name)
                try:
                    os.makedirs(save_path)
                except OSError:
                    pass

                torch.save([
                    actor_critic.state_dict(),
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None),
                    getattr(utils.get_vec_normalize(envs), 'ret_rms', None)
                ],
                           os.path.join(
                               save_path,
                               args.model_name + "_{}.pt".format(args.seed)))
            elif val_action_loss > prev_val_action:
                val_loss_increase += 1
                if val_loss_increase == 10:
                    print("Val loss increasing too much, breaking here...")
                    break
            elif val_action_loss < prev_val_action:
                val_loss_increase = 0

            # Update prev val action
            prev_val_action = val_action_loss
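            # Early stopping: training breaks once the validation action loss has
            # increased 10 times without an intervening improvement.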

        # log interval
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
Exemplo n.º 17
0
def main():
    #wandb.run = config.tensorboard.run
    wandb.init(settings=wandb.Settings(start_method="fork"),
               project='growspaceenv_baselines',
               entity='growspace')
    #torch.manual_seed(config.seed)
    #torch.cuda.manual_seed_all(config.seed)

    if config.cuda and torch.cuda.is_available() and config.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(config.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if config.cuda else "cpu")

    envs = make_vec_envs(config.env_name, config.seed, config.num_processes,
                         config.gamma, config.log_dir, device, False,
                         config.custom_gym)

    if "Mnist" in config.env_name:
        base = 'Mnist'
    else:
        base = None

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base,
                          base_kwargs={'recurrent': config.recurrent_policy})
    actor_critic.to(device)

    if config.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               config.value_loss_coef,
                               config.entropy_coef,
                               lr=config.lr,
                               eps=config.eps,
                               alpha=config.alpha,
                               max_grad_norm=config.max_grad_norm)
    elif config.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         config.clip_param,
                         config.ppo_epoch,
                         config.num_mini_batch,
                         config.value_loss_coef,
                         config.entropy_coef,
                         lr=config.lr,
                         eps=config.eps,
                         max_grad_norm=config.max_grad_norm,
                         optimizer=config.optimizer,
                         momentum=config.momentum)
    elif config.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               config.value_loss_coef,
                               config.entropy_coef,
                               acktr=True)

    if config.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            config.gail_experts_dir,
            "trajs_{}.pt".format(config.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        drop_last = len(expert_dataset) > config.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=config.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    rollouts = RolloutStorage(config.num_steps, config.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = []
    episode_length = []
    episode_branches = []
    episode_branch1 = []
    episode_branch2 = []
    episode_light_width = []
    episode_light_move = []
    episode_success = []
    episode_plantpixel = []

    start = time.time()
    num_updates = int(
        config.num_env_steps) // config.num_steps // config.num_processes
    x = 0
    action_space_type = envs.action_space

    for j in range(num_updates):

        if isinstance(action_space_type, Discrete):
            action_dist = np.zeros(envs.action_space.n)

        total_num_steps = (j + 1) * config.num_processes * config.num_steps

        if config.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if config.algo == "acktr" else config.lr)
        #new_branches = []
        for step in range(config.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            if isinstance(action_space_type, Discrete):
                action_dist[action] += 1

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_length.append(info['episode']['l'])
                    wandb.log({"Episode_Reward": info['episode']['r']},
                              step=total_num_steps)

                if 'new_branches' in info.keys():
                    episode_branches.append(info['new_branches'])

                if 'new_b1' in info.keys():
                    episode_branch1.append(info['new_b1'])

                if 'new_b2' in info.keys():
                    episode_branch2.append(info['new_b2'])

                if 'light_width' in info.keys():
                    episode_light_width.append(info['light_width'])

                if 'light_move' in info.keys():
                    episode_light_move.append(info['light_move'])

                if 'success' in info.keys():
                    episode_success.append(info['success'])

                if 'plant_pixel' in info.keys():
                    episode_plantpixel.append(info['plant_pixel'])

                if j == x:
                    if 'img' in info.keys():
                        img = info['img']
                        path = './hittiyas/growspaceenv_braselines/scripts/imgs/'
                        cv2.imwrite(
                            os.path.join(path, 'step' + str(step) + '.png'),
                            img)
                    x += 1000
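                    # x jumps by 1000 on the first info of the first step, so at
                    # most one frame is written every 1000 updates.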

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if config.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = config.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(config.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], config.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, config.use_gae, config.gamma,
                                 config.gae_lambda,
                                 config.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % config.save_interval == 0
                or j == num_updates - 1) and config.save_dir != "":
            save_path = os.path.join(config.save_dir, config.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, config.env_name + ".pt"))

        if j % config.log_interval == 0 and len(episode_rewards) > 1:

            if isinstance(action_space_type, Discrete):
                np_hist = np.histogram(np.arange(action_dist.shape[0]),
                                       weights=action_dist)
                wandb.log(
                    {
                        "Discrete Actions":
                        wandb.Histogram(np_histogram=np_hist)
                    },
                    step=total_num_steps)
            wandb.log({"Reward Min": np.min(episode_rewards)},
                      step=total_num_steps)
            wandb.log({"Summed Reward": np.sum(episode_rewards)},
                      step=total_num_steps)
            wandb.log({"Reward Mean": np.mean(episode_rewards)},
                      step=total_num_steps)
            wandb.log({"Reward Max": np.max(episode_rewards)},
                      step=total_num_steps)
            wandb.log(
                {"Number of Mean New Branches": np.mean(episode_branches)},
                step=total_num_steps)
            wandb.log({"Number of Max New Branches": np.max(episode_branches)},
                      step=total_num_steps)
            wandb.log({"Number of Min New Branches": np.min(episode_branches)},
                      step=total_num_steps)
            wandb.log(
                {
                    "Number of Mean New Branches of Plant 1":
                    np.mean(episode_branch1)
                },
                step=total_num_steps)
            wandb.log(
                {
                    "Number of Mean New Branches of Plant 2":
                    np.mean(episode_branch2)
                },
                step=total_num_steps)
            wandb.log(
                {
                    "Number of Total Displacement of Light":
                    np.sum(episode_light_move)
                },
                step=total_num_steps)
            wandb.log({"Mean Light Displacement": episode_light_move},
                      step=total_num_steps)
            wandb.log({"Mean Light Width": episode_light_width},
                      step=total_num_steps)
            wandb.log(
                {
                    "Number of Steps in Episode with Tree is as close as possible":
                    np.sum(episode_success)
                },
                step=total_num_steps)
            wandb.log({"Entropy": dist_entropy}, step=total_num_steps)
            wandb.log(
                {
                    "Displacement of Light Position":
                    wandb.Histogram(episode_light_move)
                },
                step=total_num_steps)
            wandb.log(
                {
                    "Displacement of Beam Width":
                    wandb.Histogram(episode_light_width)
                },
                step=total_num_steps)
            wandb.log({"Mean Plant Pixel": np.mean(episode_plantpixel)},
                      step=total_num_steps)
            wandb.log({"Summed Plant Pixel": np.sum(episode_plantpixel)},
                      step=total_num_steps)
            wandb.log(
                {"Plant Pixel Histogram": wandb.Histogram(episode_plantpixel)},
                step=total_num_steps)

            episode_rewards.clear()
            episode_length.clear()
            episode_branches.clear()
            episode_branch2.clear()
            episode_branch1.clear()
            episode_light_move.clear()
            episode_light_width.clear()
            episode_success.clear()
            episode_plantpixel.clear()

        if (config.eval_interval is not None and len(episode_rewards) > 1
                and j % config.eval_interval == 0):
            ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            evaluate(actor_critic, ob_rms, config.env_name, config.seed,
                     config.num_processes, eval_log_dir, device,
                     config.custom_gym)

    ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
    evaluate(actor_critic,
             ob_rms,
             config.env_name,
             config.seed,
             config.num_processes,
             eval_log_dir,
             device,
             config.custom_gym,
             gif=True)
Exemplo n.º 18
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    #envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
    #                    args.gamma, args.log_dir, device, False)

    envs = make_parallel_env(args.env_name, args.num_processes, args.seed, True)

    '''
    actor_critic = Policy(
        envs.observation_space[0].shape,
        envs.action_space[0],
        agent_num=args.agent_num, 
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)
    '''
    actor_critic = []
    for i in range(args.agent_num):
        ac = Policy(
            envs.observation_space[0].shape,
            envs.action_space[0],
            agent_num=args.agent_num, 
            agent_i = i,
            base_kwargs={'recurrent': args.recurrent_policy})
        ac.to(device)
        actor_critic.append(ac)

    
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(
            actor_critic,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        '''
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm)
        '''
        agent = []
        for i in range(args.agent_num):
            agent.append(algo.PPO(
                actor_critic[i],
                i,
                args.clip_param,
                args.ppo_epoch,
                args.num_mini_batch,
                args.value_loss_coef,
                args.entropy_coef,
                lr=args.lr,
                eps=args.eps,
                max_grad_norm=args.max_grad_norm,
                model_dir = args.model_dir))
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(
            actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir, "trajs_{}.pt".format(
                args.env_name.split('-')[0].lower()))
        
        expert_dataset = gail.ExpertDataset(
            file_name, num_trajectories=4, subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)
    '''   
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space[0].shape, envs.action_space[0],
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(torch.tensor(obs[:,0,:]))
    rollouts.to(device)
    '''

    rollouts = []
    for i in range(args.agent_num):
        rollout = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space[0].shape, envs.action_space[0],
                              actor_critic[i].recurrent_hidden_state_size,
                              args.agent_num, i)
        rollouts.append(rollout)

    obs = envs.reset()
    # pdb.set_trace()
    
    for i in range(args.agent_num):
        rollouts[i].share_obs[0].copy_(torch.tensor(obs.reshape(args.num_processes, -1)))
        rollouts[i].obs[0].copy_(torch.tensor(obs[:,i,:]))
        rollouts[i].to(device)
        
    episode_rewards = deque(maxlen=10)

    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print(num_updates)
    for j in range(num_updates):
        #pdb.set_trace()
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            for i in range(args.agent_num):
                utils.update_linear_schedule(
                    agent[i].optimizer, j, num_updates,
                    agent[i].optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            value_list, action_list, action_log_prob_list, recurrent_hidden_states_list = [], [], [], []
            with torch.no_grad():
                for i in range(args.agent_num):
                    #pdb.set_trace()
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic[i].act(
                        rollouts[i].share_obs[step],
                        rollouts[i].obs[step], rollouts[i].recurrent_hidden_states[step],
                        rollouts[i].masks[step])
                    # import pdb; pdb.set_trace()
                    value_list.append(value)
                    action_list.append(action)
                    action_log_prob_list.append(action_log_prob)
                    recurrent_hidden_states_list.append(recurrent_hidden_states)
            # Observe reward and next obs
            action = []
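            # Convert each agent's discrete action index into a one-hot vector per
            # process; the multi-agent env presumably expects, for every process, a
            # list with one one-hot action array per agent.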
            for i in range(args.num_processes):
                one_env_action = []
                for k in range(args.agent_num):
                    one_hot_action = np.zeros(envs.action_space[0].n)
                    one_hot_action[action_list[k][i]] = 1
                    one_env_action.append(one_hot_action)
                action.append(one_env_action)
            #start = time.time()
            #pdb.set_trace()            
            obs, reward, done, infos = envs.step(action)
            # print(obs[0][0])
            # pdb.set_trace()
            #end = time.time()
            #print("step time: ", end-start)
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            '''
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done[0]])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos[0]])
            '''
            masks = torch.ones(args.num_processes, 1)
            bad_masks = torch.ones(args.num_processes, 1)
            '''
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)
            '''
            #import pdb; pdb.set_trace()
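            # Each agent keeps its own RolloutStorage holding the shared (flattened)
            # observation of all agents plus its local observation and reward slice,
            # in the spirit of centralized-critic training (e.g. MAPPO).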
            for i in range(args.agent_num):
                rollouts[i].insert(torch.tensor(obs.reshape(args.num_processes, -1)), torch.tensor(obs[:,i,:]), 
                            recurrent_hidden_states, action_list[i],
                            action_log_prob_list[i], value_list[i], torch.tensor(reward[:, i].reshape(-1,1)), masks, bad_masks)
        #import pdb; pdb.set_trace()
        with torch.no_grad():
            next_value_list = []
            for i in range(args.agent_num):
                next_value = actor_critic[i].get_value(
                    rollouts[i].share_obs[-1],
                    rollouts[i].obs[-1], rollouts[i].recurrent_hidden_states[-1],
                    rollouts[i].masks[-1]).detach()
                next_value_list.append(next_value)

        if args.gail:
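            # Note: this GAIL branch still indexes a single `rollouts` object, so it
            # is presumably unused here; it would need adapting to the per-agent
            # rollouts list used above.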
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])
        for i in range(args.agent_num):
            rollouts[i].compute_returns(next_value_list[i], args.use_gae, args.gamma,
                                    args.gae_lambda, args.use_proper_time_limits)

        #import pdb; pdb.set_trace()
        for i in range(args.agent_num):
            value_loss, action_loss, dist_entropy = agent[i].update(rollouts[i])
            if (i == 0):
                print("value loss: " + str(value_loss))
        # print(value_loss)
            # pdb.set_trace()

        #rollouts.after_update()
        obs = envs.reset()
        # pdb.set_trace()
        for i in range(args.agent_num):
            rollouts[i].share_obs[0].copy_(torch.tensor(obs.reshape(args.num_processes, -1)))
            rollouts[i].obs[0].copy_(torch.tensor(obs[:,i,:]))
            rollouts[i].to(device)

        # save for every interval-th episode or for the last epoch     
        #pdb.set_trace()   
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            if not os.path.exists(save_path + args.model_dir):
                os.makedirs(save_path + args.model_dir)
            for i in range(args.agent_num):
                torch.save([
                    actor_critic[i],
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ], save_path + args.model_dir + '/agent_%i' % (i+1) + ".pt")
        '''
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
        '''
Exemplo n.º 19
0
    def run(self):
        args = self.args
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        print("CUDA is available: ", torch.cuda.is_available())
        if args.cuda:
            print("CUDA enabled")
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
        else:
            if args.cuda_deterministic:
                print("Warning CUDA is requested but is not available")
            else:
                print("CUDA disabled")

        log_dir = os.path.expanduser(args.log_dir)
        eval_log_dir = log_dir + "_eval"
        utils.cleanup_log_dir(log_dir)
        utils.cleanup_log_dir(eval_log_dir)
        print("get_num_thread", torch.get_num_threads())

        device = torch.device("cuda:0" if args.cuda else "cpu")

        envs = make_vec_envs(args.env_name, self.config_parameters, args.seed,
                             args.num_processes, args.gamma, args.log_dir,
                             device, False)

        actor_critic = create_IAM_model(envs, args, self.config_parameters)
        actor_critic.to(device)

        if args.algo == 'a2c':
            agent = algo.A2C_ACKTR(actor_critic,
                                   args.value_loss_coef,
                                   args.entropy_coef,
                                   lr=args.lr,
                                   eps=args.eps,
                                   alpha=args.alpha,
                                   max_grad_norm=args.max_grad_norm)
        # This algorithm should be used for the reproduction project.
        elif args.algo == 'ppo':
            agent = algo.PPO(actor_critic,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             max_grad_norm=args.max_grad_norm)
        elif args.algo == 'acktr':
            agent = algo.A2C_ACKTR(actor_critic,
                                   args.value_loss_coef,
                                   args.entropy_coef,
                                   acktr=True)

        if args.gail:
            assert len(envs.observation_space.shape) == 1
            discr = gail.Discriminator(
                envs.observation_space.shape[0] + envs.action_space.shape[0],
                100, device)
            file_name = os.path.join(
                args.gail_experts_dir,
                "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

            expert_dataset = gail.ExpertDataset(file_name,
                                                num_trajectories=4,
                                                subsample_frequency=20)
            drop_last = len(expert_dataset) > args.gail_batch_size
            gail_train_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset,
                batch_size=args.gail_batch_size,
                shuffle=True,
                drop_last=drop_last)

        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)

        obs = envs.reset()
        rollouts.obs[0].copy_(obs)
        rollouts.to(device)
        # Always return the average of the last 100 steps. This means the average is sampled.
        episode_rewards = deque(maxlen=100)

        start = time.time()
        num_updates = int(
            args.num_env_steps) // args.num_steps // args.num_processes
        for j in range(num_updates):

            if args.use_linear_lr_decay:
                # decrease learning rate linearly
                utils.update_linear_schedule(
                    agent.optimizer, j, num_updates,
                    agent.optimizer.lr if args.algo == "acktr" else args.lr)

            for step in range(args.num_steps):
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

                # Observe reward and next obs
                obs, reward, done, infos = envs.step(action)

                for info in infos:
                    if 'episode' in info.keys():
                        episode_rewards.append(info['episode']['r'])

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                bad_masks = torch.FloatTensor(
                    [[0.0] if 'bad_transition' in info.keys() else [1.0]
                     for info in infos])
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_prob, value, reward, masks,
                                bad_masks)

            with torch.no_grad():
                next_value = actor_critic.get_value(
                    rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                    rollouts.masks[-1]).detach()

            if args.gail:
                if j >= 10:
                    envs.venv.eval()

                gail_epoch = args.gail_epoch
                if j < 10:
                    gail_epoch = 100  # Warm up
                for _ in range(gail_epoch):
                    discr.update(gail_train_loader, rollouts,
                                 utils.get_vec_normalize(envs)._obfilt)

                for step in range(args.num_steps):
                    rollouts.rewards[step] = discr.predict_reward(
                        rollouts.obs[step], rollouts.actions[step], args.gamma,
                        rollouts.masks[step])

            rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                     args.gae_lambda,
                                     args.use_proper_time_limits)

            value_loss, action_loss, dist_entropy = agent.update(rollouts)

            rollouts.after_update()

            # save for every interval-th episode or for the last epoch
            if (j % args.save_interval == 0
                    or j == num_updates - 1) and args.save_dir != "":
                save_path = os.path.join(args.save_dir, args.algo)
                try:
                    os.makedirs(save_path)
                except OSError:
                    pass

                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
                ], os.path.join(save_path, self.model_file_name))

            if j % args.log_interval == 0 and len(episode_rewards) > 1:
                total_num_steps = (j + 1) * args.num_processes * args.num_steps
                end = time.time()
                elapsed_time = end - start
                data = [
                    j,  # Updates
                    total_num_steps,  # timesteps
                    int(total_num_steps / elapsed_time),  # FPS
                    len(episode_rewards),  # Only useful for print statement
                    np.mean(episode_rewards),  # mean of rewards
                    np.median(episode_rewards),  # median of rewards
                    np.min(episode_rewards),  # min rewards
                    np.max(episode_rewards),  # max rewards
                    dist_entropy,
                    value_loss,
                    action_loss,
                    elapsed_time
                ]
                output = ''.join([str(x) + ',' for x in data])
                self.data_saver.append(output)
                print(
                    f"Updates {data[0]}, num timesteps {data[1]}, FPS {data[2]}, elapsed time {int(data[11])} sec. Last {data[3]} training episodes: mean/median reward {data[4]:.2f}/{data[5]:.2f}, min/max reward {data[6]:.1f}/{data[7]:.1f}",
                    end="\r")

            if (args.eval_interval is not None and len(episode_rewards) > 1
                    and j % args.eval_interval == 0):
                obs_rms = utils.get_vec_normalize(envs).obs_rms
                evaluate(actor_critic, obs_rms, args.env_name, args.seed,
                         args.num_processes, eval_log_dir, device)
Exemplo n.º 20
0
def main():

    args = get_args()
    writer = SummaryWriter(os.path.join('logs', args.save_name))
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(
        basic_env.BasicFlatDiscreteEnv,
        args.seed,
        args.num_processes,
        args.gamma,
        args.log_dir,
        device,
        False,
        task='lift',
        gripper_type='RobotiqThreeFingerDexterousGripper',
        robot='Panda',
        controller='JOINT_TORQUE' if args.vel else 'JOINT_POSITION',
        horizon=1000,
        reward_shaping=True)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base=Surreal,
        # base=OpenAI,
        # base=MLP_ATTN,
        base_kwargs={
            'recurrent':
            args.recurrent_policy,
            # 'dims': basic_env.BasicFlatEnv().modality_dims
            'config':
            dict(act='relu' if args.relu else 'tanh', rec=args.rec, fc=args.fc)
        })
    print(actor_critic)
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=100)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    best_reward = 0
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)
            writer.add_scalar('lr', agent.optimizer.param_groups[0]['lr'])
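            # No global_step is passed here, so TensorBoard presumably records these
            # lr points at its default step, unlike the metrics below that are logged
            # against total_num_steps.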

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        end = time.time()
        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        if len(episode_rewards) > 1:
            writer.add_scalar('loss/value', value_loss, total_num_steps)
            writer.add_scalar('loss/policy', action_loss, total_num_steps)
            writer.add_scalar('experiment/num_updates', j, total_num_steps)
            writer.add_scalar('experiment/FPS',
                              int(total_num_steps / (end - start)),
                              total_num_steps)
            writer.add_scalar('experiment/EPISODE MEAN',
                              np.mean(episode_rewards), total_num_steps)
            writer.add_scalar('experiment/EPISODE MEDIAN',
                              np.median(episode_rewards), total_num_steps)
            writer.add_scalar('experiment/EPISODE MIN',
                              np.min(episode_rewards), total_num_steps)
            writer.add_scalar('experiment/EPISODE MAX',
                              np.max(episode_rewards), total_num_steps)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if len(episode_rewards) > 1 and args.save_dir != "":
            rew = np.mean(episode_rewards)
            if rew > best_reward:
                best_reward = rew
                print('saved with best reward', rew)

                save_path = os.path.join(args.save_dir, args.algo)
                try:
                    os.makedirs(save_path)
                except OSError:
                    pass

                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
                ], os.path.join(save_path, args.save_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, dist entropy {:.3f}, value loss {:.4f}, action loss {:.4f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

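        # Periodically evaluate the current policy, reusing the training
        # observation-normalization statistics.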
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            obs_rms = utils.get_vec_normalize(envs).obs_rms
            evaluate(actor_critic, obs_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

    # Close the TensorBoard writer once, after the training loop has finished.
    writer.close()
Example No. 21
0
def main():
    ##TensorboardX
    summary = SummaryWriter()

    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

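    # Make cuDNN deterministic when requested (reproducible results at some speed cost).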
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

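    # Build the vectorized training environments.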
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(
            actor_critic,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(
            actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)

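    # GAIL: build a discriminator over concatenated (observation, action) pairs
    # and a data loader over pre-recorded expert trajectories.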
    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir, "trajs_{}.pt".format(
                args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(
                file_name, num_trajectories=4, subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

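    # Buffer holding num_steps transitions per parallel process, reused for every update.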
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

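    # Rolling window of returns from the last 10 completed episodes, used for logging.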
    episode_rewards = deque(maxlen=10)

    start = time.time()
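    # Number of policy updates implied by the total environment-step budget.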
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    print(num_updates)