Exemplo n.º 1
0
def main():
    """Train an actor-critic policy with the algorithm named by ``args.algo``.

    Configuration is read from the module-level ``args`` and ``num_updates``.
    Every update collects ``args.num_steps`` transitions from
    ``args.num_processes`` parallel environments and then performs either one
    A2C/ACKTR gradient step or ``args.ppo_epoch`` passes of PPO mini-batch
    updates.  The model is saved every ``args.save_interval`` updates (when
    ``args.save_dir`` is non-empty), statistics are printed every
    ``args.log_interval`` updates, and, when ``args.vis`` is set, the visdom
    plot is refreshed every ``args.vis_interval`` updates.

    Raises:
        ValueError: if ``args.algo`` is not ``'a2c'``, ``'ppo'`` or
            ``'acktr'``.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # visdom is imported lazily so it is only required when plotting is on.
    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])

    # The policy input stacks ``args.num_stack`` observations along dim 0.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    # Rank-3 observations go to the CNN policy, anything else to the MLP.
    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)
    else:
        # Without this, an unknown algorithm only surfaced later as an
        # unrelated NameError in the logging code.
        raise ValueError("Unsupported algorithm: {!r}".format(args.algo))

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)
    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
        """Shift the stacked frames in ``current_state`` and append ``state``."""
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            # Drop the oldest observation by shifting the stack along dim 1.
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)

    rollouts.states[0].copy_(current_state)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    if args.algo == 'ppo':
        # Copy of the policy used to evaluate the "old" action
        # log-probabilities in the PPO ratio.
        old_model = copy.deepcopy(actor_critic)

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(
                Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # mask is 0 for processes whose episode just finished, else 1.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # Move finished episodes' totals into final_rewards and reset
            # their running sums.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # If done then clean the history of observations.
            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)
            rollouts.insert(step, current_state, action.data, value.data,
                            reward, masks)

        # Value estimate of the last stored state, passed to compute_returns.
        next_value = actor_critic(Variable(rollouts.states[-1],
                                           volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            # Evaluate the whole rollout in a single forward pass.
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.states[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            # Advantages are detached so the policy loss does not
            # back-propagate into the value estimates.
            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                # acc_stats is switched on only for this backward pass.
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            # Normalised advantages from the stored value predictions.
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            # Freeze the pre-update policy for the probability ratio.
            old_model.load_state_dict(actor_critic.state_dict())
            if hasattr(actor_critic, 'obs_filter'):
                old_model.obs_filter = actor_critic.obs_filter

            for _ in range(args.ppo_epoch):
                sampler = BatchSampler(SubsetRandomSampler(
                    range(args.num_processes * args.num_steps)),
                                       args.batch_size * args.num_processes,
                                       drop_last=False)
                for indices in sampler:
                    indices = torch.LongTensor(indices)
                    if args.cuda:
                        indices = indices.cuda()
                    states_batch = rollouts.states[:-1].view(
                        -1, *obs_shape)[indices]
                    actions_batch = rollouts.actions.view(
                        -1, action_shape)[indices]
                    return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                        Variable(states_batch), Variable(actions_batch))

                    _, old_action_log_probs, _ = old_model.evaluate_actions(
                        Variable(states_batch, volatile=True),
                        Variable(actions_batch, volatile=True))

                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs.data))
                    adv_targ = Variable(advantages.view(-1, 1)[indices])
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    optimizer.step()

        # The last state of this rollout is the first state of the next one.
        rollouts.states[0].copy_(rollouts.states[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                # Best effort: the directory usually already exists.
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            print(
                "Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, (j + 1) * args.num_processes * args.num_steps,
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

        # ``viz`` and ``win`` only exist when visualisation is enabled, so the
        # plot refresh must be guarded by ``args.vis`` as well.
        if args.vis and j % args.vis_interval == 0:
            win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
Exemplo n.º 2
0
def main():
    """Train an actor-critic policy with PPO on parallel environments.

    Configuration is read from the module-level ``args`` and ``num_updates``.
    This variant only implements PPO; checkpoint saving and visdom plotting
    are not performed.  Every update collects ``args.num_steps`` transitions
    from ``args.num_processes`` environments, runs ``args.ppo_epoch`` passes
    of mini-batch updates, and prints statistics (including frames per
    second) every ``args.log_interval`` updates.

    Raises:
        ValueError: if ``args.algo`` is not ``'ppo'``.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # Echo the hyper-parameters for this run.
    print(args.cuda)
    print(args.num_steps)
    print(args.num_processes)
    print(args.lr)
    print(args.eps)
    print(args.alpha)
    print(args.use_gae)
    print(args.gamma)
    print(args.tau)
    print(args.value_loss_coef)
    print(args.entropy_coef)

    # Fail fast, before any worker process is spawned: with any other
    # algorithm no optimizer was created and no update was run, and the
    # script only crashed later with a NameError in the logging code.
    if args.algo != 'ppo':
        raise ValueError(
            "Unsupported algorithm {!r}: only 'ppo' is implemented".format(
                args.algo))

    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])

    # The policy input stacks ``args.num_stack`` observations along dim 0.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    # Rank-3 observations go to the CNN policy, anything else to the MLP.
    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.Adam(actor_critic.parameters(),
                           args.lr,
                           eps=args.eps)

    # NOTE(review): rollouts.states is presumably laid out as
    # [steps + 1, processes, *obs_shape] - confirm against RolloutStorage.
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)

    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
        """Shift the stacked frames in ``current_state`` and append ``state``."""
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            # Drop the oldest observation by shifting the stack along dim 1.
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)

    # Seed the rollout storage with the initial observation.
    rollouts.states[0].copy_(current_state)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    # Copy of the policy used to evaluate the "old" action log-probabilities
    # in the PPO ratio.
    old_model = copy.deepcopy(actor_critic)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions from the state stored for this step.
            value, action = actor_critic.act(
                Variable(rollouts.states[step], volatile=True))

            cpu_actions = action.data.squeeze(1).cpu().numpy()
            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # mask is 0 for processes whose episode just finished, else 1.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # final_rewards is only used for printing: finished episodes'
            # totals are moved into it and their running sums are reset.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # If done then clean the history of observations.
            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)

            # Record this step's transition in the rollout storage.
            rollouts.insert(step, current_state, action.data, value.data,
                            reward, masks)

        # Value estimate of the last stored state, passed to compute_returns.
        next_value = actor_critic(Variable(rollouts.states[-1],
                                           volatile=True))[0].data

        # Policies that expose an ``obs_filter`` get it updated with the
        # states collected in this rollout.
        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(
                -1, *obs_shape))

        # NOTE(review): presumably fills rollouts.returns with discounted
        # (optionally GAE) returns bootstrapped from next_value - confirm
        # against RolloutStorage.compute_returns.
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        # Normalised advantages from the stored value predictions.
        advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-5)

        # Freeze the pre-update policy for the probability ratio.
        old_model.load_state_dict(actor_critic.state_dict())
        if hasattr(actor_critic, 'obs_filter'):
            old_model.obs_filter = actor_critic.obs_filter

        for _ in range(args.ppo_epoch):
            sampler = BatchSampler(SubsetRandomSampler(
                range(args.num_processes * args.num_steps)),
                                   args.batch_size * args.num_processes,
                                   drop_last=False)
            for indices in sampler:
                indices = torch.LongTensor(indices)
                if args.cuda:
                    indices = indices.cuda()
                states_batch = rollouts.states[:-1].view(
                    -1, *obs_shape)[indices]
                actions_batch = rollouts.actions.view(
                    -1, action_shape)[indices]
                return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                # Reshape to do in a single forward pass for all steps
                values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                    Variable(states_batch), Variable(actions_batch))

                _, old_action_log_probs, _ = old_model.evaluate_actions(
                    Variable(states_batch, volatile=True),
                    Variable(actions_batch, volatile=True))

                ratio = torch.exp(action_log_probs -
                                  Variable(old_action_log_probs.data))
                adv_targ = Variable(advantages.view(-1, 1)[indices])
                surr1 = ratio * adv_targ
                surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                    1.0 + args.clip_param) * adv_targ
                action_loss = -torch.min(
                    surr1,
                    surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                value_loss = (Variable(return_batch) -
                              values).pow(2).mean()

                optimizer.zero_grad()
                (value_loss + action_loss -
                 dist_entropy * args.entropy_coef).backward()
                optimizer.step()

        # The last state of this rollout is the first state of the next one.
        rollouts.states[0].copy_(rollouts.states[-1])

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
Exemplo n.º 3
0
def main():
    """Train one actor-critic policy on several environments at once.

    Configuration is read from the module-level ``args`` and ``num_updates``
    and from the ``arguments`` module.  ``args.num_processes`` workers are
    started for every entry of ``mt_env_id_dic_selected`` and all of them are
    driven through a single ``SubprocVecEnvMt``.  On top of the A2C/ACKTR or
    PPO update, the A2C/ACKTR path adds the model's AFS loss and, when
    ``ewc == 1``, its EWC loss.  The model and the ``reward_dict`` statistics
    are saved every ``args.save_interval`` updates; progress is printed and
    optionally plotted through visdom.
    """
    # Hoisted to function scope: these used to be re-imported on every
    # iteration of the update loop.
    import pickle
    from arguments import (ewc, ewc_interval, ewc_lambda, is_restore,
                           parameter_noise, parameter_noise_interval)

    print("#######")
    print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # Entries are accessed by position throughout because the container type
    # of ``mt_env_id_dic_selected`` is not visible from this function.
    num_tasks = len(mt_env_id_dic_selected)

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = [None] * num_tasks  # one reward window per environment
        win_dic = {}
        win_afs_per_m = None
        win_afs_loss = None
        win_basic_loss = None

    # The original program ran one game per model; here every selected
    # environment contributes ``args.num_processes`` workers, and all of them
    # are handed to SubprocVecEnvMt together.
    env_fns = []
    for i in range(num_tasks):
        log_dir = args.log_dir + mt_env_id_dic_selected[i] + '/'
        for rank in range(args.num_processes):
            env_fns.append(
                make_env(mt_env_id_dic_selected[i], args.seed, rank, log_dir))
    envs = SubprocVecEnvMt(env_fns)

    num_processes_total = args.num_processes * num_tasks
    obs_shape = envs.observation_space.shape
    # num_stack: number of frames to stack
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if is_restore and args.save_dir:
        load_path = os.path.join(args.save_dir, args.algo)
        # NOTE(security): torch.load unpickles the file; only restore
        # checkpoints from a trusted source.
        actor_critic = torch.load(os.path.join(load_path, args.env_name + ".pt"))
    else:
        # Rank-3 observations go to the CNN policy, anything else to the MLP.
        if len(envs.observation_space.shape) == 3:
            actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
        else:
            actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    # args.num_steps: number of forward steps per update.
    # rollouts stores the states, actions, rewards, values and masks.
    rollouts = RolloutStorage(args.num_steps, num_processes_total, obs_shape, envs.action_space)
    current_state = torch.zeros(num_processes_total, *obs_shape)

    def update_current_state(state):
        """Shift the stacked frames in ``current_state`` and append ``state``."""
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            # Drop the oldest observation by shifting the stack along dim 1.
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)

    rollouts.states[0].copy_(current_state)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes_total, 1])
    final_rewards = torch.zeros([num_processes_total, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    if args.algo == 'ppo':
        # Copy of the policy used to evaluate the "old" action
        # log-probabilities in the PPO ratio.
        old_model = copy.deepcopy(actor_critic)

    afs_per_m = []
    afs_loss_list = []
    basic_loss_list = []
    episode_reward_rec = 0.0

    # Gradient seeds for afs_loss.backward().  The tensor is moved to the GPU
    # only when CUDA is enabled; it used to call .cuda() unconditionally,
    # which broke CPU-only runs.
    one = torch.FloatTensor([1])
    if args.cuda:
        one = one.cuda()
    mone = one * -1

    # States visited since the last Fisher computation (EWC only).  Explicit
    # None replaces the earlier "catch any exception" detection of an unset
    # or reset buffer.
    states_store = None
    # Stays None unless the A2C/ACKTR branch computes an EWC loss.
    ewc_loss = None

    for j in range(num_updates):
        for step in range(args.num_steps):
            if ewc == 1:
                step_states = rollouts.states[step].clone()
                if states_store is None:
                    states_store = step_states
                else:
                    states_store = torch.cat([states_store, step_states], 0)

            # Sample actions
            value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next state
            state, reward, done = envs.step(cpu_actions)
            # Record the last 100 episodes' rewards.
            episode_reward_rec += reward
            episode_reward_rec = rec_last_100_epi_reward(episode_reward_rec, done)

            # One reward per process (num_processes_total of them), as a
            # column tensor.
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # mask is 0 for processes whose episode just finished, else 1.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            # Move finished episodes' totals into final_rewards (used for
            # logging) and reset their running sums.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # If done then clean the history of observations.
            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)
            rollouts.insert(step, current_state, action.data, value.data, reward, masks)

        # Value estimate of the last stored state, passed to compute_returns.
        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            # reset gradient
            optimizer.zero_grad()

            # forward
            values, action_log_probs, dist_entropy, conv_list = actor_critic.evaluate_actions(Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape)))
            # pre-process
            values = values.view(args.num_steps, num_processes_total, 1)
            action_log_probs = action_log_probs.view(args.num_steps, num_processes_total, 1)

            # compute afs loss
            afs_per_m_temp, afs_loss = actor_critic.get_afs_per_m(
                action_log_probs=action_log_probs,
                conv_list=conv_list,
            )
            if len(afs_per_m_temp) > 0:
                afs_per_m.append(afs_per_m_temp)

            if (afs_loss is not None) and (afs_loss.data.cpu().numpy()[0] != 0.0):
                # Seeding backward with -1 accumulates the gradient of
                # -afs_loss; the graph is kept for the main loss below.
                afs_loss.backward(mone, retain_graph=True)
                afs_loss_list.append(afs_loss.data.cpu().numpy()[0])

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            # Advantages are detached so the policy loss does not
            # back-propagate into the value estimates.
            action_loss = -(Variable(advantages.data) * action_log_probs).mean()
            final_loss_basic = value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef

            # The EWC penalty is skipped on the very first update.
            ewc_loss = None
            if j != 0:
                if ewc == 1:
                    ewc_loss = actor_critic.get_ewc_loss(lam=ewc_lambda)

            if ewc_loss is None:
                final_loss = final_loss_basic
            else:
                final_loss = final_loss_basic + ewc_loss
            basic_loss_list.append(final_loss_basic.data.cpu().numpy()[0])
            final_loss.backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()

        elif args.algo == 'ppo':
            # Normalised advantages from the stored value predictions.
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            # Freeze the pre-update policy for the probability ratio.
            old_model.load_state_dict(actor_critic.state_dict())
            if hasattr(actor_critic, 'obs_filter'):
                old_model.obs_filter = actor_critic.obs_filter

            for _ in range(args.ppo_epoch):
                sampler = BatchSampler(SubsetRandomSampler(range(num_processes_total * args.num_steps)), args.batch_size * num_processes_total, drop_last=False)
                for indices in sampler:
                    indices = torch.LongTensor(indices)
                    if args.cuda:
                        indices = indices.cuda()
                    states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices]
                    actions_batch = rollouts.actions.view(-1, action_shape)[indices]
                    return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, conv_list = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch))

                    _, old_action_log_probs, _, old_conv_list = old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True))

                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data))
                    adv_targ = Variable(advantages.view(-1, 1)[indices])
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    final_loss_basic = (value_loss + action_loss - dist_entropy * args.entropy_coef)

                    basic_loss_list.append(final_loss_basic.data.cpu().numpy()[0])
                    final_loss_basic.backward()
                    optimizer.step()

        # The last state of this rollout is the first state of the next one.
        rollouts.states[0].copy_(rollouts.states[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                # Best effort: the directory usually already exists.
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
            with open(os.path.join(save_path, args.env_name + "_last_100_reward"), "wb") as f:
                pickle.dump(reward_dict, f)

        if j % args.log_interval == 0:
            # NOTE(review): the frame count uses args.num_processes (workers
            # per environment), not num_processes_total - confirm intended.
            print("Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, (j + 1) * args.num_processes * args.num_steps,
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), -dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))

            # Only set when the A2C/ACKTR branch computed an EWC penalty.
            if ewc_loss is not None:
                print("ewc loss {:.5f}".
                      format(ewc_loss.data.cpu().numpy()[0]))

        if j > 5 and j % args.vis_interval == 0 and args.vis:
            # Per-environment reward curves, read from each log folder.
            for ii in range(num_tasks):
                log_dir = args.log_dir + mt_env_id_dic_selected[ii] + '/'
                win[ii] = visdom_plot(viz, win[ii], log_dir, mt_env_id_dic_selected[ii], args.algo)

            # A series is only registered the first time it is seen; it is
            # drawn from the next visualisation round onwards.
            for plot_name, series in reward_dict.items():
                if plot_name in win_dic:
                    if len(series) > 0:
                        win_dic[plot_name] = viz.line(
                            torch.from_numpy(np.asarray(series)),
                            win=win_dic[plot_name],
                            opts=dict(title=break_line_html(exp+'>>'+plot_name))
                        )
                else:
                    win_dic[plot_name] = None

            if len(afs_per_m) > 0:
                win_afs_per_m = viz.line(
                    torch.from_numpy(np.asarray(afs_per_m)),
                    win=win_afs_per_m,
                    opts=dict(title=title_html+'>>afs')
                )

            # basic_loss_list grows by one entry per update for A2C/ACKTR and
            # by one entry per PPO mini-batch for PPO.
            win_basic_loss = viz.line(
                torch.from_numpy(np.asarray(basic_loss_list)),
                win=win_basic_loss,
                opts=dict(title=title_html+'>>basic_loss')
            )

            if len(afs_loss_list) > 0:
                win_afs_loss = viz.line(
                    torch.from_numpy(np.asarray(afs_loss_list)),
                    win=win_afs_loss,
                    opts=dict(title=title_html+'>>afs_loss')
                )

        if parameter_noise == 1:
            if j % parameter_noise_interval == 0:
                actor_critic.parameter_noise()

        if ewc == 1:
            if j % ewc_interval == 0 or j == 0:
                # Recompute the Fisher information from the states gathered
                # since the last computation, then start a fresh buffer.
                actor_critic.compute_fisher(states_store)
                states_store = None
                actor_critic.star()
Exemplo n.º 4
0
def main():
    """Train an actor-critic policy with A2C, PPO or ACKTR.

    The run is configured entirely through the module-level ``args``
    namespace (plus ``num_updates`` and ``FILE_PREFIX``).  The function:

    1. builds ``args.num_processes`` environments inside a ``SubprocVecEnv``;
    2. picks ``CNNPolicy`` for 3-D observations and ``MLPPolicy`` otherwise;
    3. optionally (``args.finetune``) loads a checkpoint and freezes every
       parameter whose name is not listed in ``keep_layers``;
    4. for each of ``num_updates`` iterations collects ``args.num_steps``
       transitions per process, computes returns and applies one update of
       the selected algorithm;
    5. periodically writes a checkpoint, prints statistics and refreshes the
       visdom plot.

    Raises:
        ValueError: if ``args.algo`` is not ``'a2c'``, ``'ppo'`` or
            ``'acktr'``.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # Visdom is imported lazily so it is only needed when plotting is enabled.
    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])

    # The policy input stacks `num_stack` observations along the first axis:
    # that axis is multiplied by `num_stack`, while `*obs_shape[1:]` unpacks
    # the remaining dimensions into the new tuple unchanged.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    # 3-D observation spaces get the CNN policy, everything else the MLP
    # policy.
    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    # Finetuning: restore the weights from a checkpoint and freeze every
    # parameter that is not explicitly listed in `keep_layers`.
    if args.finetune:
        checkpoint_path = os.path.join(args.save_dir, args.algo,
                                       args.checkpoint)
        # Deserialize onto the CPU: checkpoints written by the save step
        # below also contain optimizer state, which may hold CUDA tensors
        # and would otherwise fail to load on a CPU-only machine.  The model
        # is moved to the GPU further down when `args.cuda` is set.
        state_dict = torch.load(checkpoint_path,
                                map_location=lambda storage, loc: storage)
        print("Finetuning from checkpoint: %s, at step: %d" %
              (checkpoint_path, state_dict['update']))
        actor_critic.load_state_dict(state_dict['model_state_dict'])
        keep_layers = {
            'v_fc3.weight', 'v_fc3.bias', 'a_fc2.weight', 'a_fc2.bias',
            'dist.fc_mean.weight', 'dist.fc_mean.bias', 'dist.logstd._bias'
        }
        for name, param in actor_critic.named_parameters():
            if name not in keep_layers:
                param.requires_grad = False
            print('Param name: %s, requires_grad: %d' %
                  (name, param.requires_grad))

    # Width of one stored action: 1 for Discrete spaces, otherwise the first
    # dimension of the action space shape.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    # Only parameters that still require gradients are handed to the
    # optimizer, so that frozen (finetune) parameters are never passed to
    # RMSprop or Adam.
    trainable_params = [
        p for p in actor_critic.parameters() if p.requires_grad
    ]

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(trainable_params,
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(trainable_params, args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)
    else:
        # Fail early instead of hitting a NameError on `optimizer` later.
        raise ValueError("Unsupported algorithm: %r" % (args.algo, ))

    #TO-DO figure out how to restore optimizer parameters when freezing some weights
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)
    # Stacked observation buffer, one row per process, initially all zeros.
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Push the newest observation into the stacked buffer in place.

        With stacking enabled the existing content is shifted towards the
        front of axis 1 by one observation (dropping the oldest one), then
        the new observation is written into the last slot.
        """
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    # `episode_rewards` accumulates the reward of the running episode;
    # `final_rewards` keeps the total of the last finished episode.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    if args.algo == 'ppo':
        # Separate copy of the policy, refreshed before every PPO update, that
        # provides the "old" action log-probabilities for the ratio.
        old_model = copy.deepcopy(actor_critic)

    start = time.time()
    for j in range(num_updates):
        # ---- Collect `num_steps` transitions from every process ----
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            # mask is 0.0 for processes whose episode just ended, else 1.0.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # Where an episode ended: publish its total into `final_rewards`
            # and restart the running sum; elsewhere both stay untouched.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # Zero the stacked history of finished processes; 4-D buffers
            # need two extra trailing axes on the mask to broadcast.
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks
            update_current_obs(obs)
            rollouts.insert(step, current_obs, action.data, value.data, reward,
                            masks)

        # Value of the last stored observation, used by compute_returns.
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        # ---- Policy / value update ----
        if args.algo in ['a2c', 'acktr']:
            # Evaluate the whole rollout in a single flattened forward pass.
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            # Re-wrapping `.data` cuts the graph, so the policy loss does not
            # backpropagate through the value estimates.
            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                # Statistics are accumulated only during this backward pass.
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            # Normalized advantages; the epsilon guards against a zero std.
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            # Snapshot the current policy before it is modified below.
            old_model.load_state_dict(actor_critic.state_dict())

            for _ in range(args.ppo_epoch):
                # Random mini-batches over all (step, process) pairs.
                sampler = BatchSampler(SubsetRandomSampler(
                    range(args.num_processes * args.num_steps)),
                                       args.batch_size * args.num_processes,
                                       drop_last=False)
                for indices in sampler:
                    indices = torch.LongTensor(indices)
                    if args.cuda:
                        indices = indices.cuda()
                    observations_batch = rollouts.observations[:-1].view(
                        -1, *obs_shape)[indices]
                    actions_batch = rollouts.actions.view(
                        -1, action_shape)[indices]
                    return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(actions_batch))

                    _, old_action_log_probs, _ = old_model.evaluate_actions(
                        Variable(observations_batch, volatile=True),
                        Variable(actions_batch, volatile=True))

                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs.data))
                    adv_targ = Variable(advantages.view(-1, 1)[indices])
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    optimizer.step()

        # The last observation of this rollout starts the next one.
        rollouts.observations[0].copy_(rollouts.observations[-1])

        # ---- Checkpointing ----
        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # Tolerate an existing directory, but let real failures (for
            # example missing permissions) surface here.
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            file_name = FILE_PREFIX + '.pt'
            data = {
                'update': j,
                'model_state_dict': save_model.state_dict(),
                'optim_state_dict': optimizer.state_dict()
            }
            torch.save(data, os.path.join(save_path, file_name))

        # ---- Console logging ----
        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

        # ---- Visdom plotting (best effort) ----
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass