Example #1
    traintest = Dataloader(
        "/home/zhangjian/workspace/dataset/NUS-WIDE/resize64-rf-noncrop/train",
        0, 500, 1, 22, 'nus')
    flag = False

if dataset == 'flk':
    traintest = Dataloader("None", 0, 500, 1, 1, 'flk')
    flag = False

if flag:
    print('undefined_dataset')
    quit()

###model
model = ActorCritic(bit_len, batch_size)
model.cuda()
print('model over')

###train

episode_length = 1
while True:

    if episode_length % steps == 0:
        model.low_lr(rate)

    if (episode_length % 1000 == 0) and (episode_length > 20000):
        if dataset == 'cifar':
            model.eval()
            map = test_util.test(Dtest, model, batch_size, bit_len)
            file = open(logpath, "a")
Example #2
File: train.py Project: ddayzzz/ACER
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    torch.manual_seed(args.seed + rank)
    # CUDA
    if args.use_cuda:
        torch.cuda.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)

    gpu_id = 0 if args.use_cuda else -1  # TODO: 0 selects the first GPU
    if gpu_id >= 0:
        model = model.cuda()
    model.train()

    if not args.on_policy:
        # Normalise memory capacity by number of training processes
        memory = EpisodicReplayMemory(
            args.memory_capacity // args.num_processes,
            args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    model.load_state_dict(shared_model.state_dict())
            else:
                model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                avg_hx = torch.zeros(1, args.hidden_size)
                avg_cx = torch.zeros(1, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(1, args.hidden_size).cuda()
                        cx = torch.zeros(1, args.hidden_size).cuda()
                else:
                    hx = torch.zeros(1, args.hidden_size)
                    cx = torch.zeros(1, args.hidden_size)

                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                if gpu_id >= 0:
                    state = state.cuda()
                done, episode_length = False, 0
            else:
                # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                policy, Q, V, (hx, cx) = model(state, (hx, cx))

                # the shared model lives on the CPU, so the state must be moved there
                if gpu_id >= 0:
                    to_avg_state = state.cpu()
                else:
                    to_avg_state = state
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    to_avg_state, (avg_hx, avg_cx))
                # if gpu_id >= 0:
                #     average_policies = average_policies.cuda()
                # Sample action
                action = torch.multinomial(policy, 1)[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action.item())
                next_state = state_to_tensor(next_state)
                if gpu_id >= 0:
                    next_state = next_state.cuda()

                reward = args.reward_clip and min(max(
                    reward, -1), 1) or reward  # Optionally clamp rewards
                done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline training
                    memory.append(state, action, reward,
                                  policy.detach())  # Save just tensors
                # Save outputs for online training
                [
                    arr.append(el) for arr, el in zip((
                        policies, Qs, Vs, actions, rewards,
                        average_policies), (policy, Q, V,
                                            torch.LongTensor([[action]]),
                                            torch.Tensor([[reward]]),
                                            average_policy))
                ]

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = torch.zeros(1, 1)

                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(state, None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(state, (hx, cx))
                Qret = Qret.detach().cpu()

            # Train the network on-policy
            if gpu_id >= 0:
                Qs = list(map(lambda x: x.cpu(), Qs))
                Vs = list(map(lambda x: x.cpu(), Vs))
                policies = list(map(lambda x: x.cpu(), policies))
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episode
                trajectories = memory.sample_batch(args.batch_size,
                                                   maxlen=args.t_max)

                # Reset hidden state
                avg_hx = torch.zeros(args.batch_size, args.hidden_size)
                avg_cx = torch.zeros(args.batch_size, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                        cx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                else:
                    hx = torch.zeros(args.batch_size, args.hidden_size)
                    cx = torch.zeros(args.batch_size, args.hidden_size)

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards, old_policies, average_policies = [], [], [], [], [], [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    # Unpack first half of transition
                    state = torch.cat(
                        tuple(trajectory.state
                              for trajectory in trajectories[i]), 0)
                    action = torch.LongTensor([
                        trajectory.action for trajectory in trajectories[i]
                    ]).unsqueeze(1)
                    reward = torch.Tensor([
                        trajectory.reward for trajectory in trajectories[i]
                    ]).unsqueeze(1)
                    old_policy = torch.cat(
                        tuple(trajectory.policy
                              for trajectory in trajectories[i]), 0)

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(state, (hx, cx))
                    average_policy, _, _, (avg_hx,
                                           avg_cx) = shared_average_model(
                                               state, (avg_hx, avg_cx))

                    # Save outputs for offline training
                    [
                        arr.append(el)
                        for arr, el in zip((policies, Qs, Vs, actions, rewards,
                                            average_policies, old_policies), (
                                                policy, Q, V, action, reward,
                                                average_policy, old_policy))
                    ]

                    # Unpack second half of transition
                    next_state = torch.cat(
                        tuple(trajectory.state
                              for trajectory in trajectories[i + 1]), 0)
                    done = torch.Tensor([
                        trajectory.action is None
                        for trajectory in trajectories[i + 1]
                    ]).unsqueeze(1)

                # Do forward pass for all transitions
                _, _, Qret, _ = model(next_state, (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach().cpu()

                # Train the network off-policy
                if gpu_id >= 0:
                    Qs = list(map(lambda x: x.cpu(), Qs))
                    Vs = list(map(lambda x: x.cpu(), Vs))
                    policies = list(map(lambda x: x.cpu(), policies))
                _train(args,
                       T,
                       model,
                       shared_model,
                       shared_average_model,
                       optimiser,
                       policies,
                       Qs,
                       Vs,
                       actions,
                       rewards,
                       Qret,
                       average_policies,
                       old_policies=old_policies)
        done = True

    env.close()
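
Example #2 draws the number of off-policy replay updates per iteration from _poisson(args.replay_ratio), a helper that is not shown in the excerpt. A minimal stand-in (an assumption; the original project may implement it differently) is a Knuth-style Poisson sampler:

import math
import random

def _poisson(lmbd):
    # Knuth's algorithm: multiply uniform draws until the running product
    # falls below exp(-lmbd); the number of multiplications, minus one,
    # is a Poisson(lmbd) variate.
    L, k, p = math.exp(-lmbd), 0, 1.0
    while p > L:
        k += 1
        p *= random.uniform(0, 1)
    return max(k - 1, 0)

Using the expected replay ratio as lmbd keeps the average ratio of off-policy to on-policy updates correct while still varying it from iteration to iteration.
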
Example #3
def test_multi(testing_scene,
               rank,
               shared_model,
               results,
               config,
               arguments=dict()):
    torch.manual_seed(arguments['seed'] + rank)

    env = MultiSceneEnv(testing_scene, config, arguments,
                        arguments['seed'] + rank)

    # gpu_id = arguments['gpu_ids'][rank % len(arguments['gpu_ids'])]
    gpu_id = -1
    print("Done initalizing process {}: {}! Use gpu: {}".format(
        rank, testing_scene, 'yes' if gpu_id >= 0 else 'no'))

    if shared_model is not None:
        # gpu_id = -1

        model = ActorCritic(config, arguments, gpu_id)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                model = model.cuda()
                model.load_state_dict(shared_model.state_dict())

                # print("[P{}] loaded model into cuda {}".format(rank, gpu_id))
        else:
            model.load_state_dict(shared_model.state_dict())
            # print("[P{}] loaded model".format(rank))

        model.eval()

    else:
        model = None
    state, score, target = env.reset()
    done = True

    for ep in range(1000):
        state, score, target = env.reset()
        agent_step = 0
        starting = env.current_state_id

        for step in range(arguments['num_iters']):
            if model is not None:
                with torch.no_grad():
                    value, logit = model(state, score, target)
                prob = F.softmax(logit, dim=-1)
                action = prob.max(1, keepdim=True)[1].cpu().numpy()
                # action = prob.multinomial(num_samples=1).detach().cpu().numpy()[0, 0]

            else:
                action = np.random.choice(range(arguments['action_size']))

            state, score, reward, done = env.step(action)
            ending = env.current_state_id

            if action < 2:
                agent_step += 1

            if done:
                break

        if not done:
            tm = results[target]
            tm.append(0)
            results[target] = tm
        else:
            if max(agent_step, env.shortest[ending, starting]) > 0:
                tm = results[target]
                tm.append(env.shortest[ending, starting] /
                          max(agent_step, env.shortest[ending, starting]))
                results[target] = tm
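
Examples #3 and #6 score a finished episode by the ratio of the shortest-path length between start and end states to the number of steps the agent actually took, with the denominator clipped from below by the shortest path; a failure scores zero. The same SPL-style term as a standalone helper (a sketch with hypothetical argument names):

def episode_efficiency(shortest_len, agent_steps, success):
    # 1.0 when a successful agent matches the shortest path, smaller when it
    # wanders, and 0.0 on failure or degenerate zero-length paths.
    if not success or max(agent_steps, shortest_len) == 0:
        return 0.0
    return shortest_len / max(agent_steps, shortest_len)
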
Example #4
File: main.py Project: Ameyapores/Mario
                    default=2,
                    help='number of non sampling processes (default: 2)')

mp = _mp.get_context('spawn')

print("Cuda: " + str(torch.cuda.is_available()))

if __name__ == '__main__':
    os.environ['OMP_NUM_THREADS'] = '1'

    args = parser.parse_args()
    env = setup_env(args.env_name)

    shared_model = ActorCritic(1, env.action_space.n)
    if args.use_cuda:
        shared_model.cuda()

    shared_model.share_memory()

    if os.path.isfile(args.save_path):
        print('Loading A3C parameters ...')
        shared_model.load_state_dict(torch.load(args.save_path))

    optimizer = SharedAdam(shared_model.parameters(), lr=args.lr)
    optimizer.share_memory()

    print("No of available cores : {}".format(mp.cpu_count()))

    processes = []

    counter = mp.Value('i', 0)
Example #5
def train(rank,
          args,
          shared_model,
          counter,
          lock,
          optimizer=None,
          select_sample=True):
    FloatTensor = torch.cuda.FloatTensor if args.use_cuda else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if args.use_cuda else torch.LongTensor

    env = setup_env(args.env_name)

    model = ActorCritic(1, env.action_space.n)

    if args.use_cuda:
        model.cuda()

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = prepro(env.reset())
    state = torch.from_numpy(state)

    done = True
    episode_length = 0
    for num_iter in count():

        if rank == 0:

            if num_iter % args.save_interval == 0 and num_iter > 0:
                #print ("Saving model at :" + args.save_path)
                torch.save(shared_model.state_dict(), args.save_path)

        if num_iter % (
                args.save_interval * 2.5
        ) == 0 and num_iter > 0 and rank == 1:  # Second saver in case the first process crashes
            #print ("Saving model for process 1 at :" + args.save_path)
            torch.save(shared_model.state_dict(), args.save_path)

        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 512)).type(FloatTensor)
            hx = Variable(torch.zeros(1, 512)).type(FloatTensor)
        else:
            cx = Variable(cx.data).type(FloatTensor)
            hx = Variable(hx.data).type(FloatTensor)

        values, log_probs, rewards, entropies = [], [], [], []
        actions, forwards, vec_st1s, inverses = [], [], [], []

        for step in range(args.num_steps):
            episode_length += 1
            state_inp = Variable(state.unsqueeze(0)).type(FloatTensor)
            value, logit, (hx, cx) = model((state_inp, (hx, cx)), False)
            s_t = state
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(-1, keepdim=True)
            entropies.append(entropy)

            if select_sample:
                action = prob.multinomial(num_samples=1).data
            else:
                action = prob.max(-1, keepdim=True)[1].data
            log_prob = log_prob.gather(-1, Variable(action))

            action_out = action.to(torch.device("cpu"))

            oh_action = torch.Tensor(1, env.action_space.n).type(LongTensor)
            oh_action.zero_()
            oh_action.scatter_(1, action, 1)
            a_t = oh_action.type(FloatTensor)
            #print ('action', a_t)

            state, reward, done, _ = env.step(action_out.numpy()[0][0])
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)
            #print ('extrinsic reward', reward)

            state = torch.from_numpy(prepro(state))
            s_t1 = state

            vec_st1, inverse, forward = model(
                (Variable(s_t.unsqueeze(0)).type(FloatTensor),
                 Variable(s_t1.unsqueeze(0)).type(FloatTensor), a_t), True)
            reward_intrinsic = args.eta * (
                (vec_st1 - forward).pow(2)).sum(1) / 2.
            reward_intrinsic = reward_intrinsic.to(torch.device("cpu"))
            #print('intrinsic reward', reward_intrinsic)

            reward += reward_intrinsic
            reward1 = reward_intrinsic
            #print('total_reward', reward)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = torch.from_numpy(prepro(env.reset()))

            values.append(value)
            log_probs.append(log_prob)
            reward1 = reward1.type(FloatTensor)
            rewards.append(reward1)
            forwards.append(forward)
            vec_st1s.append(vec_st1)
            inverses.append(inverse)
            actions.append(a_t)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            state_inp = Variable(state.unsqueeze(0)).type(FloatTensor)
            value, _, _ = model((state_inp, (hx, cx)), False)
            R = value.data

        values.append(Variable(R).type(FloatTensor))
        policy_loss = 0
        value_loss = 0
        forward_loss = 0
        inverse_loss = 0
        R = Variable(R).type(FloatTensor)
        gae = torch.zeros(1, 1).type(FloatTensor)
        #print (rewards)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae).type(FloatTensor) - args.entropy_coef * entropies[i]

            forward_err = forwards[i] - vec_st1s[i]
            forward_loss = forward_loss + 0.5 * (forward_err.pow(2)).sum(1)

            cross_entropy = -(actions[i] *
                              torch.log(inverses[i] + 1e-15)).sum(1)
            inverse_loss = inverse_loss + cross_entropy

        #print ('forward loss', forward_loss)
        #print ('inverse loss', inverse_loss)
        #print ('other loss', (policy_loss + args.value_loss_coef * value_loss))
        optimizer.zero_grad()

        ((1 - args.beta) * inverse_loss +
         args.beta * forward_loss).backward(retain_graph=True)
        (args.lmbda * (policy_loss + 0.5 * value_loss)).backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
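
Most of the workers above end an update with ensure_shared_grads(model, shared_model) followed by a step of the shared optimiser. The helper itself is not included in these excerpts; a commonly used minimal version (a sketch of the usual A3C pattern, not necessarily these projects' exact code) is:

def ensure_shared_grads(model, shared_model):
    # Point each shared parameter's gradient at the local worker's gradient.
    # If another worker has already populated the shared gradients, keep them.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad

When the local model sits on a GPU and the shared model on the CPU (as in Example #10's call with gpu=gpu_id >= 0), each gradient would additionally need to be moved with .cpu() before being assigned.
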
Example #6
def test(testing_scene,
         test_object,
         rank,
         shared_model,
         results,
         config,
         arguments=dict()):
    torch.manual_seed(arguments['seed'] + rank)

    env = AI2ThorDumpEnv(testing_scene, test_object, config, arguments,
                         arguments['seed'] + rank)
    print("Finding {} in {}, {}".format(test_object, testing_scene,
                                        env.target_locs))

    if shared_model is not None:
        gpu_id = arguments['gpu_ids'][rank % len(arguments['gpu_ids'])]
        # gpu_id = -1

        model = ActorCritic(config, arguments, gpu_id)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                model = model.cuda()
                model.load_state_dict(shared_model.state_dict())

                print("[P{}] loaded model into cuda {}".format(rank, gpu_id))
        else:
            model.load_state_dict(shared_model.state_dict())
            print("[P{}] loaded model".format(rank))

        model.eval()

    state, score, target = env.reset()
    done = True

    starting = env.current_state_id
    results[rank] = 0

    for ep in range(1000):
        agent_step = 0
        for step in range(arguments['num_iters']):
            if model is not None:
                with torch.no_grad():
                    value, logit = model(state, score, target)
                prob = F.softmax(logit, dim=-1)
                action = prob.max(1, keepdim=True)[1].cpu().numpy()
                # action = prob.multinomial(num_samples=1).detach().cpu().numpy()[0, 0]

            else:
                action = np.random.choice(range(arguments['action_size']))

            state, score, reward, done = env.step(action)
            ending = env.current_state_id

            if action < 2:
                agent_step += 1

            if done:
                results[rank] += env.shortest[ending, starting] / max(
                    agent_step, env.shortest[ending, starting])
                state, score, target = env.reset()
                break

    results[rank] = results[rank] / 1000
Example #7
File: Train.py Project: Ameyapores/Mario
def train(rank,
          args,
          shared_model,
          counter,
          lock,
          optimizer=None,
          select_sample=True):

    FloatTensor = torch.cuda.FloatTensor if args.use_cuda else torch.FloatTensor

    env = setup_env(args.env_name)

    model = ActorCritic(1, env.action_space.n)

    if args.use_cuda:
        model.cuda()

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = prepro(env.reset())
    state = torch.from_numpy(state)

    done = True
    episode_length = 0
    for num_iter in count():

        if rank == 0:

            if num_iter % args.save_interval == 0 and num_iter > 0:
                #print ("Saving model at :" + args.save_path)
                torch.save(shared_model.state_dict(), args.save_path)

        if num_iter % (
                args.save_interval * 2.5
        ) == 0 and num_iter > 0 and rank == 1:  # Second saver in case the first process crashes
            #print ("Saving model for process 1 at :" + args.save_path)
            torch.save(shared_model.state_dict(), args.save_path)

        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 512)).type(FloatTensor)
            hx = Variable(torch.zeros(1, 512)).type(FloatTensor)
        else:
            cx = Variable(cx.data).type(FloatTensor)
            hx = Variable(hx.data).type(FloatTensor)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            state_inp = Variable(state.unsqueeze(0)).type(FloatTensor)
            value, logit, (hx, cx) = model((state_inp, (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(-1, keepdim=True)
            entropies.append(entropy)

            if select_sample:
                action = prob.multinomial(num_samples=1).data
            else:
                action = prob.max(-1, keepdim=True)[1].data

            log_prob = log_prob.gather(-1, Variable(action))

            action_out = action.to(torch.device("cpu"))

            state, reward, done, _ = env.step(action_out.numpy()[0][0])
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                #env.change_level(0)
                state = torch.from_numpy(prepro(env.reset()))
                #print ("Process {} has completed.".format(rank))

            env.locked_levels = [False] + [True] * 31
            state = torch.from_numpy(prepro(state))
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(0.001 * reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            state_inp = Variable(state.unsqueeze(0)).type(FloatTensor)
            value, _, _ = model((state_inp, (hx, cx)))
            R = value.data

        values.append(Variable(R).type(FloatTensor))
        policy_loss = 0
        value_loss = 0
        R = Variable(R).type(FloatTensor)
        gae = torch.zeros(1, 1).type(FloatTensor)

        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae).type(FloatTensor) - args.entropy_coef * entropies[i]

        total_loss = policy_loss + args.value_loss_coef * value_loss
        optimizer.zero_grad()

        (total_loss).backward(retain_graph=True)
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
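
The backward pass in Examples #5 and #7 combines an n-step return for the value loss with Generalized Advantage Estimation (GAE) for the policy loss. Pulled out into a self-contained function (a sketch that assumes scalar per-step rewards and 1x1 value/log-prob/entropy tensors), the recursion looks like this:

import torch

def a3c_losses(rewards, values, log_probs, entropies, R,
               gamma=0.99, tau=1.0, entropy_coef=0.01):
    # rewards: list of floats; values/log_probs/entropies: lists of 1x1 tensors.
    # R: bootstrap value of the state after the last step (zeros if terminal).
    values = values + [R]
    policy_loss, value_loss = 0.0, 0.0
    gae = torch.zeros(1, 1)
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]                  # n-step discounted return
        advantage = R - values[i]
        value_loss = value_loss + 0.5 * advantage.pow(2)
        # GAE: discounted sum of TD errors, computed on detached values
        delta_t = rewards[i] + gamma * values[i + 1].detach() - values[i].detach()
        gae = gae * gamma * tau + delta_t
        policy_loss = policy_loss - log_probs[i] * gae - entropy_coef * entropies[i]
    return policy_loss, value_loss

The examples then back-propagate policy_loss + value_loss_coef * value_loss (Example #5 adds the curiosity losses on top), clip the gradient norm, copy the gradients to the shared model, and step the shared optimiser.
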
Example #8
def test(rank, args, shared_model, counter):

    FloatTensor = torch.cuda.FloatTensor if args.use_cuda else torch.FloatTensor

    env = setup_env(args.env_name)

    model = ActorCritic(1, env.action_space.n)
    if args.use_cuda:
        model.cuda()
    model.eval()

    state = prepro(env.reset())

    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    savefile = os.getcwd() + '/save/mario_curves.csv'

    title = ['Time', 'No. Steps', 'Total Reward', 'Episode Length']
    with open(savefile, 'a', newline='') as sfile:
        writer = csv.writer(sfile)
        writer.writerow(title)

    start_time = time.time()

    actions = deque(maxlen=4000)
    episode_length = 0
    while True:
        episode_length += 1
        ep_start_time = time.time()
        if done:
            model.load_state_dict(shared_model.state_dict())
            with torch.no_grad():
                cx = Variable(torch.zeros(1, 512)).type(FloatTensor)
                hx = Variable(torch.zeros(1, 512)).type(FloatTensor)

        else:
            with torch.no_grad():
                cx = Variable(cx.data).type(FloatTensor)
                hx = Variable(hx.data).type(FloatTensor)
        with torch.no_grad():
            state_inp = Variable(state.unsqueeze(0)).type(FloatTensor)
        value, logit, (hx, cx) = model((state_inp, (hx, cx)), False)
        prob = F.softmax(logit, dim=-1)
        action = prob.max(-1, keepdim=True)[1].data
        action_out = action.to(torch.device("cpu"))

        state, reward, done, _ = env.step(action_out.numpy()[0][0])
        env.render()
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        actions.append(action[0][0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print(
                "Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    counter.value, counter.value / (time.time() - start_time),
                    reward_sum, episode_length))

            data = [
                time.time() - ep_start_time, counter.value, reward_sum,
                episode_length
            ]

            with open(savefile, 'a', newline='') as sfile:
                writer = csv.writer(sfile)
                writer.writerows([data])

            reward_sum = 0
            episode_length = 0
            actions.clear()
            time.sleep(180)
            state = prepro(env.reset())
        state = torch.from_numpy(prepro(state))
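
Example #8 keeps a bounded deque of recent actions and forces an episode to end when the whole window contains a single repeated action, i.e. the greedy policy has got stuck. The idiom in isolation (a sketch):

from collections import deque

# Bounded history of the most recent actions taken by the test agent.
recent_actions = deque(maxlen=4000)

def agent_is_stuck(actions):
    # True only once the window is full and every entry equals the first one.
    return len(actions) == actions.maxlen and actions.count(actions[0]) == actions.maxlen
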
Example #9
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])

    actor_critic = ActorCritic(
        envs.observation_space.shape[0] * args.num_stack, envs.action_space)

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)
    #optimizer = KFACOptimizer(actor_critic, damping=1e-2, kl_clip=0.01, stat_decay=0.99)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, obs_shape[1], obs_shape[2])

    states = torch.zeros(args.num_steps + 1, args.num_processes, *obs_shape)
    current_state = torch.zeros(args.num_processes, *obs_shape)
    counts = 0

    def update_current_state(state):
        state = torch.from_numpy(np.stack(state)).float()
        current_state[:, :-1] = current_state[:, 1:]
        current_state[:, -1] = state

    state = envs.reset()
    update_current_state(state)

    rewards = torch.zeros(args.num_steps, args.num_processes, 1)
    returns = torch.zeros(args.num_steps + 1, args.num_processes, 1)

    actions = torch.LongTensor(args.num_steps, args.num_processes)
    masks = torch.zeros(args.num_steps, args.num_processes, 1)

    # These variables are used to compute average rewards for all processes.
    # Note that rewards are clipped so you need to use a monitor (see envs.py)
    # to get true rewards.
    total_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        states = states.cuda()
        current_state = current_state.cuda()
        rewards = rewards.cuda()
        returns = returns.cuda()
        actions = actions.cuda()
        masks = masks.cuda()

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            _, logits = actor_critic(Variable(states[step], volatile=True))
            probs = F.softmax(logits)
            log_probs = F.log_softmax(logits).data
            actions[step] = probs.multinomial().data

            cpu_actions = actions[step].cpu()
            cpu_actions = cpu_actions.numpy()

            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            total_rewards += reward

            np_masks = np.array([0.0 if done_ else 1.0 for done_ in done])

            # If done then clean the history of observations.
            pt_masks = torch.from_numpy(
                np_masks.reshape(np_masks.shape[0], 1, 1, 1)).float()
            if args.cuda:
                pt_masks = pt_masks.cuda()
            current_state *= pt_masks

            update_current_state(state)
            states[step + 1].copy_(current_state)
            rewards[step].copy_(reward)
            masks[step].copy_(torch.from_numpy(np_masks))

            final_rewards *= masks[step].cpu()
            final_rewards += (1 - masks[step].cpu()) * total_rewards

            total_rewards *= masks[step].cpu()

        # Reshape to do in a single forward pass for all steps
        values, logits = actor_critic(
            Variable(states.view(-1,
                                 *states.size()[-3:])))
        log_probs = F.log_softmax(logits)
        probs = F.softmax(logits)

        # Unreshape
        logits_size = (args.num_steps + 1, args.num_processes, logits.size(-1))

        log_probs = F.log_softmax(logits).view(logits_size)[:-1]
        probs = F.softmax(logits).view(logits_size)[:-1]

        values = values.view(args.num_steps + 1, args.num_processes, 1)
        logits = logits.view(logits_size)[:-1]

        action_log_probs = log_probs.gather(2, Variable(actions.unsqueeze(2)))

        dist_entropy = -(log_probs * probs).sum(-1).mean()

        returns[-1] = values[-1].data

        for step in reversed(range(args.num_steps)):
            returns[step] = returns[step + 1] * \
                args.gamma * masks[step] + rewards[step]

        value_loss = (values[:-1] - Variable(returns[:-1])).pow(2).mean()

        advantages = returns[:-1] - values[:-1].data
        action_loss = -(Variable(advantages) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()

        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
        optimizer.step()

        states[0].copy_(states[-1])

        if j % args.log_interval == 0:
            print(
                "Updates {}, num frames {}, mean clipped reward {:.5f}, max clipped reward {:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, j * args.num_processes * args.num_steps,
                        final_rewards.mean(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
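
Example #9 accumulates discounted returns backwards through the rollout, using a per-step mask to cut the bootstrap across episode boundaries. The same computation as a standalone helper (a sketch assuming tensors shaped [num_steps, num_processes, 1]):

import torch

def compute_returns(rewards, masks, last_values, gamma=0.99):
    # masks[t] is 0.0 where the episode ended at step t and 1.0 otherwise,
    # so a finished episode does not leak value across the reset.
    num_steps = rewards.size(0)
    returns = torch.zeros(num_steps + 1, *rewards.size()[1:])
    returns[-1] = last_values
    for step in reversed(range(num_steps)):
        returns[step] = returns[step + 1] * gamma * masks[step] + rewards[step]
    return returns[:-1]
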
Example #10
def train(training_scene,
          train_object,
          rank,
          shared_model,
          scheduler,
          counter,
          lock,
          config,
          arguments=dict(),
          optimizer=None):
    torch.manual_seed(arguments['seed'] + rank)
    # To prevent out of memory
    if (arguments['train_cnn'] and rank < 10):
        arguments.update({"gpu_ids": [-1]})

    gpu_id = arguments['gpu_ids'][rank % len(arguments['gpu_ids'])]

    if gpu_id >= 0:
        torch.cuda.manual_seed(arguments['seed'] + rank)

    if optimizer is None:
        optimizer = optim.RMSprop(shared_model.parameters(),
                                  lr=arguments['lr'],
                                  alpha=0.99,
                                  eps=0.1)

    env = AI2ThorDumpEnv(training_scene,
                         train_object,
                         config,
                         arguments,
                         seed=arguments['seed'] + rank)

    state, score, target = env.reset()
    starting = env.current_state_id
    done = True
    print("Done initalizing process {}. Now find {} in {}! Use gpu: {}".format(
        rank, env.target, env.scene, 'yes' if gpu_id >= 0 else 'no'))

    model = ActorCritic(config, arguments, gpu_id)
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            model = model.cuda()
            dtype = torch.cuda.FloatTensor
    else:
        dtype = torch.FloatTensor

    model.train()

    # monitoring
    total_reward_for_num_steps_list = []
    redundancies = []
    success = []
    avg_entropies = []
    learning_rates = []
    dist_to_goal = []

    start = time.time()
    episode_length = 0

    for epoch in range(arguments['num_epochs']):
        # Sync with the shared model
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                model.load_state_dict(shared_model.state_dict())
        else:
            model.load_state_dict(shared_model.state_dict())

        if arguments['lstm']:
            if done:
                cx = torch.zeros(1, 512).type(dtype)
                hx = torch.zeros(1, 512).type(dtype)
            else:
                cx = cx.detach()
                hx = hx.detach()

        if scheduler is not None:
            scheduler.step()
            learning_rates.append(optimizer.param_groups[0]['lr'])

        values = []
        log_probs = []
        rewards = []
        entropies = []
        starting = env.current_state_id

        dist_to_goal.append(
            min([env.shortest[starting][t] for t in env.target_ids]))

        for step in range(arguments['num_iters']):
            episode_length += 1
            if arguments['lstm']:
                value, logit, (hx, cx) = model((state, (hx, cx)), score,
                                               target)
            else:
                value, logit = model(state, score, target)

            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            action_int = action.cpu().numpy()[0][0].item()
            state, score, reward, done = env.step(action_int)

            if done:
                success.append(1)
            elif episode_length >= arguments['max_episode_length']:
                success.append(0)

            done = done or episode_length >= arguments['max_episode_length']

            with lock:
                counter.value += 1

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            ending = env.current_state_id
            if done:
                state, score, target = env.reset()

                print('[P-{}] Epoch: {}. Episode length: {}. Total reward: {:.3f}. Time elapsed: {:.3f}'\
                        .format(rank, epoch + 1, episode_length, sum(rewards), (time.time() - start) / 3600))

                episode_length = 0
                break

        if not done:
            success.append(0)

        # No interaction with environment below.
        # Monitoring
        total_reward_for_num_steps_list.append(sum(rewards))
        redundancies.append(step + 1 - env.shortest[ending, starting])
        avg_entropies.append(torch.tensor(entropies).numpy().mean())

        # Backprop and optimisation
        R = torch.zeros(1, 1)
        if not done:  # to change last reward to predicted value to ....
            if arguments['lstm']:
                value, _, (hx, cx) = model((state, (hx, cx)), score, target)
            else:
                value, _ = model(state, score, target)

            R = value.detach()

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        values.append(R)

        policy_loss = 0
        value_loss = 0

        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()

        for i in reversed(range(len(rewards))):

            R = arguments['gamma'] * R + rewards[i]

            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            if arguments['use_gae']:
                # Generalized Advantage Estimation
                delta_t = rewards[i] + arguments['gamma'] * values[
                    i + 1] - values[i]
                gae = gae * arguments['gamma'] * arguments['tau'] + delta_t

            policy_loss = policy_loss - log_probs[i] * gae.detach() - \
                          arguments['ec'] * entropies[i]

        optimizer.zero_grad()

        (policy_loss + arguments['vc'] * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       arguments['max_grad_norm'])

        ensure_shared_grads(model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()

        if (epoch + 1) % 1000 == 0 and np.mean(success[-500:]) >= 0.8 and \
            not os.path.isfile("training-history/{}/net_good.pth".format(arguments['about'])):
            torch.save(
                model.state_dict(),
                "training-history/{}/net_good.pth".format(arguments['about']))

        if (epoch + 1) % 2000 == 0:
            with open(
                    'training-history/{}/{}_{}_{}.pkl'.format(
                        arguments['about'], training_scene, train_object,
                        rank), 'wb') as f:
                pickle.dump(
                    {
                        "rewards": total_reward_for_num_steps_list,
                        "dist_to_goal": dist_to_goal,
                        "success_rate": success,
                        'redundancies': redundancies,
                        "entropies": avg_entropies,
                        'lrs': learning_rates
                    }, f, pickle.HIGHEST_PROTOCOL)

    torch.save(
        model.state_dict(),
        "training-history/{}/net_{}.pth".format(arguments['about'],
                                                train_object))
Example #11
def train(rank, params, shared_model, optimizer):
    torch.manual_seed(params.seed + rank) # shifting the seed with rank to asynchronize each training agent
    env = create_atari_env(params.env_name) # creating an optimized environment thanks to the create_atari_env function
    env.seed(params.seed + rank) # aligning the seed of the environment on the seed of the agent
    
    model = ActorCritic(env.observation_space.shape[0], env.action_space) # creating the model from the ActorCritic class
    if params.cuda:
        model.cuda()
    
    state = env.reset() # state is a numpy array of size 1*42*42, in black & white
    
    state = torch.from_numpy(state) # converting the numpy array into a torch tensor
    #print("State: ",state)
    done = True # when the game is done
    episode_length = 0 # initializing the length of an episode to 0
    #print("Unsquuezed: ", Variable(state.unsqueeze(0)))
    
    while True: # repeat
        episode_length += 1 # incrementing the episode length by one
        model.load_state_dict(shared_model.state_dict()) # synchronizing with the shared model - the agent gets the shared model to do an exploration on num_steps
        if done: # if it is the first iteration of the while loop or if the game was just done, then:
            cx = Variable(torch.zeros(1, 256)) # the cell states of the LSTM are reinitialized to zero
            hx = Variable(torch.zeros(1, 256)) # the hidden states of the LSTM are reinitialized to zero
            cx2 = Variable(torch.zeros(1, 256))
            hx2 = Variable(torch.zeros(1, 256))
            if params.cuda:
                cx = cx.cuda() # .cuda() is not in-place, so the moved tensor must be reassigned
                hx = hx.cuda()
        else: # else:
            cx = Variable(cx.data) # we keep the old cell states, making sure they are in a torch variable
            hx = Variable(hx.data) # we keep the old hidden states, making sure they are in a torch variable
            cx2 = Variable(cx2.data)
            hx2 = Variable(hx2.data)
            if params.cuda:
                cx = cx.cuda()
                hx = hx.cuda()
        values = [] # initializing the list of values (V(S))
        log_probs = [] # initializing the list of log probabilities
        rewards = [] # initializing the list of rewards
        entropies = [] # initializing the list of entropies
        for step in range(params.num_steps): # going through the num_steps exploration steps
            if params.cuda:
                value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)).cuda(), (hx, cx))) # getting from the model the output V(S) of the critic, the output Q(S,A) of the actor, and the new hidden & cell states
            else:
                value, action_values, (hx, cx),(hx2,cx2) = model((Variable(state.unsqueeze(0)), (hx, cx),(hx2, cx2)))
            
            
            #print(action_values)
            prob = F.softmax(action_values,1) # generating a distribution of probabilities of the Q-values according to the softmax: prob(a) = exp(prob(a))/sum_b(exp(prob(b)))
            log_prob = F.log_softmax(action_values,1) # generating a distribution of log probabilities of the Q-values according to the log softmax: log_prob(a) = log(prob(a))
            entropy = -(log_prob * prob).sum(1) # H(p) = - sum_x p(x).log(p(x))
            entropies.append(entropy) # storing the computed entropy
            action = prob.multinomial().data # selecting an action by taking a random draw from the prob distribution
            #print(action.numpy())
            log_prob = log_prob.gather(1, Variable(action)) # getting the log prob associated to this selected action
            values.append(value) # storing the value V(S) of the state
            log_probs.append(log_prob) # storing the log prob of the action
            state, reward, done, _ = env.step(action.numpy()) # playing the selected action, reaching the new state, and getting the new reward
            done = (done or episode_length >= params.max_episode_length) # if the episode lasts too long (the agent is stuck), then it is done
            reward = max(min(reward, 1), -1) # clamping the reward between -1 and +1            
            if done: # if the episode is done:
                episode_length = 0 # we reset the episode length counter
                state = env.reset() # we restart the environment
            state = torch.from_numpy(state) # tensorizing the new state
            rewards.append(reward) # storing the new observed reward
            if done: # if we are done
                break # we stop the exploration and we directly move on to the next step: the update of the shared model
        R = torch.zeros(1, 1) # intializing the cumulative reward
        if not done: # if we are not done:
            if params.cuda:
                value, _, _, _ = model((Variable(state.unsqueeze(0)).cuda(), (hx, cx)))
            else:                
                value, _, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx),(hx2,cx2))) # we initialize the cumulative reward with the value of the last shared state
            #print(value)
            R = value.data # we initialize the cumulative reward with the value of the last shared state
        values.append(Variable(R)) # storing the value V(S) of the last reached state S
        policy_loss = 0 # initializing the policy loss
        value_loss = 0 # initializing the value loss
        R = Variable(R) # making sure the cumulative reward R is a torch Variable
        gae = torch.zeros(1, 1) # initializing the Generalized Advantage Estimation to 0
        for i in reversed(range(len(rewards))): # starting from the last exploration step and going back in time
            R = params.gamma * R + rewards[i] # R = gamma*R + r_t = r_0 + gamma r_1 + gamma^2 * r_2 ... + gamma^(n-1)*r_(n-1) + gamma^nb_step * V(last_state)
            advantage = R - values[i] # R is an estimator of Q at time t = i so advantage_i = Q_i - V(state_i) = R - value[i]
            value_loss = value_loss + 0.5 * advantage.pow(2) # computing the value loss
            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data # computing the temporal difference
            gae = gae * params.gamma * params.tau + TD # gae = sum_i (gamma*tau)^i * TD(i) with gae_i = gae_(i+1)*gamma*tau + (r_i + gamma*V(state_i+1) - V(state_i))
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i] # computing the policy loss
        optimizer.zero_grad() # initializing the optimizer
        (policy_loss + 0.5 * value_loss).backward() # we give 2x more importance to the policy loss than the value loss because the policy loss is smaller
        torch.nn.utils.clip_grad_norm(model.parameters(), 40) # clipping the gradient norm to at most 40 to prevent exploding gradients from destabilising the update
        ensure_shared_grads(model, shared_model) # making sure the model of the agent and the shared model share the same gradient
        optimizer.step() # running the optimization step
class BehavioralEmbeddedAgent(Agent):
    def __init__(self, load_dataset=True):

        super(BehavioralEmbeddedAgent, self).__init__()

        self.meta, self.data = preprocess_demonstrations()

        if load_dataset:
            # demonstration source
            self.meta = divide_dataset(self.meta)

            # datasets
            self.train_dataset = DemonstrationMemory("train", self.meta,
                                                     self.data)
            self.val_dataset = DemonstrationMemory("val", self.meta, self.data)
            self.test_dataset = DemonstrationMemory("test", self.meta,
                                                    self.data)
            self.full_dataset = DemonstrationMemory("full", self.meta,
                                                    self.data)

            self.train_sampler = DemonstrationBatchSampler(self.train_dataset,
                                                           train=True)
            self.val_sampler = DemonstrationBatchSampler(self.train_dataset,
                                                         train=False)
            self.test_sampler = DemonstrationBatchSampler(self.test_dataset,
                                                          train=False)
            self.episodic_sampler = SequentialDemonstrationSampler(
                self.full_dataset)

            self.train_loader = torch.utils.data.DataLoader(
                self.train_dataset,
                batch_sampler=self.train_sampler,
                num_workers=args.cpu_workers,
                pin_memory=True,
                drop_last=False)
            self.test_loader = torch.utils.data.DataLoader(
                self.test_dataset,
                batch_sampler=self.test_sampler,
                num_workers=args.cpu_workers,
                pin_memory=True,
                drop_last=False)

        self.loss_v_beta = torch.nn.KLDivLoss()
        self.loss_q_beta = torch.nn.KLDivLoss()

        self.loss_v_pi = torch.nn.KLDivLoss()
        self.loss_q_pi = torch.nn.KLDivLoss()

        self.histogram = torch.from_numpy(self.meta['histogram']).float()

        w_f, w_v, w_h = calc_hist_weights(self.histogram)

        w_f = torch.clamp(w_f, 0, 10).cuda()
        w_v = torch.clamp(w_v, 0, 10).cuda()
        w_h = torch.clamp(w_h, 0, 10).cuda()

        self.loss_beta_f = torch.nn.CrossEntropyLoss(size_average=True,
                                                     weight=w_f)
        self.loss_beta_v = torch.nn.CrossEntropyLoss(size_average=True,
                                                     weight=w_v)
        self.loss_beta_h = torch.nn.CrossEntropyLoss(size_average=True,
                                                     weight=w_h)

        self.loss_pi_f = torch.nn.CrossEntropyLoss(size_average=False)
        self.loss_pi_v = torch.nn.CrossEntropyLoss(size_average=False)
        self.loss_pi_h = torch.nn.CrossEntropyLoss(size_average=False)

        self.behavioral_model = BehavioralDistEmbedding()
        self.behavioral_model.cuda()

        # actor critic setting

        self.actor_critic_model = ActorCritic()
        self.actor_critic_model.cuda()

        self.actor_critic_target = ActorCritic()
        self.actor_critic_target.cuda()

        # configure learning

        cnn_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "cnn" in p[0]
        ]
        emb_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "emb" in p[0]
        ]

        v_beta_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "fc_v" in p[0]
        ]
        a_beta_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "fc_adv" in p[0]
        ]

        beta_f_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "fc_beta_f" in p[0]
        ]
        beta_v_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "fc_beta_v" in p[0]
        ]
        beta_h_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "fc_beta_h" in p[0]
        ]

        v_pi_params = [
            p[1] for p in self.actor_critic_model.named_parameters()
            if "critic_v" in p[0]
        ]
        a_pi_params = [
            p[1] for p in self.actor_critic_model.named_parameters()
            if "critic_adv" in p[0]
        ]

        pi_f_params = [
            p[1] for p in self.actor_critic_model.named_parameters()
            if "fc_actor_f" in p[0]
        ]
        pi_v_params = [
            p[1] for p in self.actor_critic_model.named_parameters()
            if "fc_actor_v" in p[0]
        ]
        pi_h_params = [
            p[1] for p in self.actor_critic_model.named_parameters()
            if "fc_actor_h" in p[0]
        ]

        # IT IS IMPORTANT TO ASSIGN MODEL TO CUDA/PARALLEL BEFORE DEFINING OPTIMIZER

        self.optimizer_critic_v = BehavioralEmbeddedAgent.set_optimizer(
            v_pi_params, 0.0008)
        self.scheduler_critic_v = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_critic_v, self.decay)

        self.optimizer_critic_q = BehavioralEmbeddedAgent.set_optimizer(
            v_pi_params + a_pi_params, 0.0008)
        self.scheduler_critic_q = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_critic_q, self.decay)

        self.optimizer_v_beta = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + v_beta_params, 0.0008)
        self.scheduler_v_beta = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_v_beta, self.decay)

        self.optimizer_q_beta = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + v_beta_params + a_beta_params, 0.0008)
        self.scheduler_q_beta = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_q_beta, self.decay)

        self.optimizer_beta_f = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + beta_f_params, 0.0008)
        self.scheduler_beta_f = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_beta_f, self.decay)

        self.optimizer_beta_v = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + beta_v_params, 0.0008)
        self.scheduler_beta_v = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_beta_v, self.decay)

        self.optimizer_beta_h = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + beta_h_params, 0.0008)
        self.scheduler_beta_h = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_beta_h, self.decay)

        self.optimizer_pi_f = BehavioralEmbeddedAgent.set_optimizer(
            pi_f_params, 0.0008)
        self.scheduler_pi_f = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_f, self.decay)

        self.optimizer_pi_v = BehavioralEmbeddedAgent.set_optimizer(
            pi_v_params, 0.0008)
        self.scheduler_pi_v = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_v, self.decay)

        self.optimizer_pi_h = BehavioralEmbeddedAgent.set_optimizer(
            pi_h_params, 0.0008)
        self.scheduler_pi_h = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_h, self.decay)

        actions = torch.LongTensor(consts.hotvec_matrix).cuda()
        self.actions_matrix = actions.unsqueeze(0)

        self.q_bins = consts.q_bins[args.game][:-1] / self.meta['avg_score']
        # the long bins are already normalized
        self.v_bins = consts.v_bins[args.game][:-1] / self.meta['avg_score']

        self.q_bins_torch = Variable(torch.from_numpy(
            consts.q_bins[args.game] / self.meta['avg_score']),
                                     requires_grad=False).cuda()
        self.v_bins_torch = Variable(torch.from_numpy(
            consts.v_bins[args.game] / self.meta['avg_score']),
                                     requires_grad=False).cuda()

        self.batch_range = np.arange(self.batch)

        self.zero = Variable(torch.zeros(1))

    def flip_grad(self, parameters):
        for p in parameters:
            p.requires_grad = not p.requires_grad

    @staticmethod
    def individual_loss_fn_l2(argument):
        return abs(argument.data.cpu().numpy())**2

    @staticmethod
    def individual_loss_fn_l1(argument):
        return abs(argument.data.cpu().numpy())

    def save_checkpoint(self, path, aux=None):

        state = {
            'behavioral_model': self.behavioral_model.state_dict(),
            'actor_critic_model': self.actor_critic_model.state_dict(),
            'optimizer_critic_v': self.optimizer_critic_v.state_dict(),
            'optimizer_critic_q': self.optimizer_critic_q.state_dict(),
            'optimizer_v_beta': self.optimizer_v_beta.state_dict(),
            'optimizer_q_beta': self.optimizer_q_beta.state_dict(),
            'optimizer_beta_f': self.optimizer_beta_f.state_dict(),
            'optimizer_beta_v': self.optimizer_beta_v.state_dict(),
            'optimizer_beta_h': self.optimizer_beta_h.state_dict(),
            'optimizer_pi_f': self.optimizer_pi_f.state_dict(),
            'optimizer_pi_v': self.optimizer_pi_v.state_dict(),
            'optimizer_pi_h': self.optimizer_pi_h.state_dict(),
            'aux': aux
        }

        torch.save(state, path)

    def load_checkpoint(self, path):

        state = torch.load(path)
        self.behavioral_model.load_state_dict(state['behavioral_model'])
        self.actor_critic_model.load_state_dict(state['actor_critic_model'])
        self.optimizer_critic_v.load_state_dict(state['optimizer_critic_v'])
        self.optimizer_critic_q.load_state_dict(state['optimizer_critic_q'])
        self.optimizer_v_beta.load_state_dict(state['optimizer_v_beta'])
        self.optimizer_q_beta.load_state_dict(state['optimizer_q_beta'])
        self.optimizer_beta_f.load_state_dict(state['optimizer_beta_f'])
        self.optimizer_beta_v.load_state_dict(state['optimizer_beta_v'])
        self.optimizer_beta_h.load_state_dict(state['optimizer_beta_h'])
        self.optimizer_pi_f.load_state_dict(state['optimizer_pi_f'])
        self.optimizer_pi_v.load_state_dict(state['optimizer_pi_v'])
        self.optimizer_pi_h.load_state_dict(state['optimizer_pi_h'])

        return state['aux']

    def resume(self, model_path):

        aux = self.load_checkpoint(model_path)
        # self.update_target()
        return aux

    def update_target(self):
        self.actor_critic_target.load_state_dict(
            self.actor_critic_model.state_dict())

    def batched_interp(self, x, xp, fp):
        # implemented with numpy
        x = x.data.cpu().numpy()
        xp = xp.data.cpu().numpy()
        fp = fp.data.cpu().numpy()
        y = np.zeros(x.shape)

        for i, (xl, xpl, fpl) in enumerate(zip(x, xp, fp)):
            y[i] = np.interp(xl, xpl, fpl)

        return Variable(torch.from_numpy(y).float().cuda(),
                        requires_grad=False)

    def new_distribution(self, q, beta, r, bin):
        # unfinished: the backup target and the interpolation inputs
        # (x, xp, fp) are never constructed below
        bin = bin.repeat(self.batch, self.global_action_space, 1)
        r = r.unsqueeze(1).repeat(1, bin.shape[0])
        beta = beta.unsqueeze(1)

        # dimensions:
        # bins [batch, actions, bins]
        # beta [batch, 1, actions]
        # new_bin = torch.baddbmm(r, beta, , alpha=self.discount)
        # q_back.squeeze(1)
        # return self.batched_interp(x, xp, fp)
        raise NotImplementedError

    def learn(self, n_interval, n_tot):

        self.behavioral_model.train()
        self.actor_critic_model.train()
        self.actor_critic_target.eval()

        results = {
            'n': [],
            'loss_vs': [],
            'loss_vl': [],
            'loss_b': [],
            'loss_qs': [],
            'loss_ql': [],
            'loss_pi_s': [],
            'loss_pi_l': [],
            'loss_pi_s_tau': [],
            'loss_pi_l_tau': []
        }

        train_net = True  # toggled below via flip_grad; assumed starting phase

        for n, sample in tqdm(enumerate(self.train_loader)):

            s = Variable(sample['s'].cuda(), requires_grad=False)
            a = Variable(sample['a'].cuda(), requires_grad=False)

            a_index = Variable(sample['a_index'].cuda(non_blocking=True),
                               requires_grad=False)

            rl = np.digitize(sample['score'].numpy(),
                             self.long_bins,
                             right=True)
            rs = np.digitize(sample['f'].numpy(), self.short_bins, right=True)

            Rl = Variable(sample['score'].cuda(), requires_grad=False)
            Rs = Variable(sample['f'].cuda(), requires_grad=False)

            rl = Variable(torch.from_numpy(rl).cuda(), requires_grad=False)
            rs = Variable(torch.from_numpy(rs).cuda(), requires_grad=False)

            vs, vl, beta, qs, ql, phi, pi_s, pi_l, pi_s_tau, pi_l_tau = self.model(
                s, a)

            # policy learning

            if self.alpha_vs and train_net:
                loss_vs = self.alpha_vs * self.loss_fn_vs(vs, rs)
                self.optimizer_vs.zero_grad()
                loss_vs.backward(retain_graph=True)
                self.optimizer_vs.step()
            else:
                loss_vs = self.zero

            if self.alpha_vl and train_net:
                loss_vl = self.alpha_vl * self.loss_fn_vl(vl, rl)
                self.optimizer_vl.zero_grad()
                loss_vl.backward(retain_graph=True)
                self.optimizer_vl.step()
            else:
                loss_vl = self.zero

            if self.alpha_b and train_net:
                loss_b = self.alpha_b * self.loss_fn_beta(beta, a_index)
                self.optimizer_beta.zero_grad()
                loss_b.backward(retain_graph=True)
                self.optimizer_beta.step()
            else:
                loss_b = self.zero

            if self.alpha_qs and train_net:
                loss_qs = self.alpha_qs * self.loss_fn_qs(qs, rs)
                self.optimizer_qs.zero_grad()
                loss_qs.backward(retain_graph=True)
                self.optimizer_qs.step()
            else:
                loss_qs = self.zero

            if self.alpha_ql and train_net:
                loss_ql = self.alpha_ql * self.loss_fn_ql(ql, rl)
                self.optimizer_ql.zero_grad()
                loss_ql.backward(retain_graph=True)
                self.optimizer_ql.step()
            else:
                loss_ql = self.zero

            a_index_np = sample['a_index'].numpy()
            self.batch_range = np.arange(self.batch)

            beta_sfm = F.softmax(beta, 1)
            pi_s_sfm = F.softmax(pi_s, 1)
            pi_l_sfm = F.softmax(pi_l, 1)
            pi_s_tau_sfm = F.softmax(pi_s_tau, 1)
            pi_l_tau_sfm = F.softmax(pi_l_tau, 1)

            beta_fix = Variable(beta_sfm.data[self.batch_range, a_index_np],
                                requires_grad=False)
            pi_s_fix = Variable(pi_s_sfm.data[self.batch_range, a_index_np],
                                requires_grad=False)
            pi_l_fix = Variable(pi_l_sfm.data[self.batch_range, a_index_np],
                                requires_grad=False)
            pi_s_tau_fix = Variable(pi_s_tau_sfm.data[self.batch_range,
                                                      a_index_np],
                                    requires_grad=False)
            pi_l_tau_fix = Variable(pi_l_tau_sfm.data[self.batch_range,
                                                      a_index_np],
                                    requires_grad=False)

            if self.alpha_pi_s and not train_net:
                loss_pi_s = self.alpha_pi_s * self.loss_fn_pi_s(pi_s, a_index)
                loss_pi_s = (loss_pi_s * Rs *
                             self.off_factor(pi_s_fix, beta_fix)).mean()
                self.optimizer_pi_s.zero_grad()
                loss_pi_s.backward(retain_graph=True)
                self.optimizer_pi_s.step()
            else:
                loss_pi_s = self.zero

            if self.alpha_pi_l and not train_net:
                loss_pi_l = self.alpha_pi_l * self.loss_fn_pi_l(pi_l, a_index)
                loss_pi_l = (loss_pi_l * Rl *
                             self.off_factor(pi_l_fix, beta_fix)).mean()
                self.optimizer_pi_l.zero_grad()
                loss_pi_l.backward(retain_graph=True)
                self.optimizer_pi_l.step()
            else:
                loss_pi_l = self.zero

            if self.alpha_pi_s_tau and not train_net:
                loss_pi_s_tau = self.alpha_pi_s_tau * self.loss_fn_pi_s_tau(
                    pi_s_tau, a_index)
                w = self.get_weighted_loss(F.softmax(qs, 1),
                                           self.short_bins_torch)
                loss_pi_s_tau = (
                    loss_pi_s_tau * w *
                    self.off_factor(pi_s_tau_fix, beta_fix)).mean()
                self.optimizer_pi_s_tau.zero_grad()
                loss_pi_s_tau.backward(retain_graph=True)
                self.optimizer_pi_s_tau.step()
            else:
                loss_pi_s_tau = self.zero

            if self.alpha_pi_l_tau and not train_net:
                loss_pi_l_tau = self.alpha_pi_l_tau * self.loss_fn_pi_l_tau(
                    pi_l_tau, a_index)
                w = self.get_weighted_loss(F.softmax(ql, 1),
                                           self.long_bins_torch)
                loss_pi_l_tau = (
                    loss_pi_l_tau * w *
                    self.off_factor(pi_l_tau_fix, beta_fix)).mean()
                self.optimizer_pi_l_tau.zero_grad()
                loss_pi_l_tau.backward()
                self.optimizer_pi_l_tau.step()
            else:
                loss_pi_l_tau = self.zero

            # add results
            results['loss_vs'].append(loss_vs.data.cpu().numpy()[0])
            results['loss_vl'].append(loss_vl.data.cpu().numpy()[0])
            results['loss_b'].append(loss_b.data.cpu().numpy()[0])
            results['loss_qs'].append(loss_qs.data.cpu().numpy()[0])
            results['loss_ql'].append(loss_ql.data.cpu().numpy()[0])
            results['loss_pi_s'].append(loss_pi_s.data.cpu().numpy()[0])
            results['loss_pi_l'].append(loss_pi_l.data.cpu().numpy()[0])
            results['loss_pi_s_tau'].append(
                loss_pi_s_tau.data.cpu().numpy()[0])
            results['loss_pi_l_tau'].append(
                loss_pi_l_tau.data.cpu().numpy()[0])
            results['n'].append(n)

            # if not n % self.update_target_interval:
            #     self.update_target()

            # if an index is rolled more than once during the update_memory_interval
            # period, only the last occurrence affects the sampling probabilities
            if not (n + 1) % self.update_memory_interval \
                    and self.prioritized_replay:
                self.train_dataset.update_probabilities()

            # update a global n_step parameter

            if not (n + 1) % self.update_n_steps_interval:
                # self.train_dataset.update_n_step(n + 1)
                d = np.divmod(n + 1, self.update_n_steps_interval)[0]
                if d % 10 == 1:
                    self.flip_grad(self.parameters_group_b +
                                   self.parameters_group_a)
                    train_net = not train_net
                if d % 10 == 2:
                    self.flip_grad(self.parameters_group_b +
                                   self.parameters_group_a)
                    train_net = not train_net

                    self.scheduler_pi_s.step()
                    self.scheduler_pi_l.step()
                    self.scheduler_pi_s_tau.step()
                    self.scheduler_pi_l_tau.step()
                else:
                    self.scheduler_vs.step()
                    self.scheduler_beta.step()
                    self.scheduler_vl.step()
                    self.scheduler_qs.step()
                    self.scheduler_ql.step()

            if not (n + 1) % n_interval:
                yield results
                self.model.train()
                # self.target.eval()
                results = {key: [] for key in results}

    def off_factor(self, pi, beta):
        return torch.clamp(pi / beta, 0, 1)

    def test(self, n_interval, n_tot):

        self.model.eval()
        # self.target.eval()

        results = {
            'n': [],
            'loss_vs': [],
            'loss_b': [],
            'loss_vl': [],
            'loss_qs': [],
            'loss_ql': [],
            'act_diff': [],
            'a_agent': [],
            'a_player': [],
            'loss_pi_s': [],
            'loss_pi_l': [],
            'loss_pi_s_tau': [],
            'loss_pi_l_tau': []
        }

        for n, sample in tqdm(enumerate(self.test_loader)):

            s = Variable(sample['s'].cuda(), requires_grad=False)
            a = Variable(sample['a'].cuda().unsqueeze(1), requires_grad=False)

            a_index = Variable(sample['a_index'].cuda(non_blocking=True),
                               requires_grad=False)

            rl = np.digitize(sample['score'].numpy(),
                             self.long_bins,
                             right=True)
            rs = np.digitize(sample['f'].numpy(), self.short_bins, right=True)

            Rl = Variable(sample['score'].cuda(), requires_grad=False)
            Rs = Variable(sample['f'].cuda(), requires_grad=False)

            rl = Variable(torch.from_numpy(rl).cuda(), requires_grad=False)
            rs = Variable(torch.from_numpy(rs).cuda(), requires_grad=False)

            vs, vl, beta, qs, ql, phi, pi_s, pi_l, pi_s_tau, pi_l_tau = self.model(
                s, a)

            qs = qs.squeeze(1)
            ql = ql.squeeze(1)

            # policy learning

            loss_vs = self.alpha_vs * self.loss_fn_vs(vs, rs)
            loss_vl = self.alpha_vl * self.loss_fn_vl(vl, rl)
            loss_b = self.alpha_b * self.loss_fn_beta(beta, a_index)
            loss_qs = self.alpha_qs * self.loss_fn_qs(qs, rs)
            loss_ql = self.alpha_ql * self.loss_fn_ql(ql, rl)

            a_index_np = sample['a_index'].numpy()
            self.batch_range = np.arange(self.batch)

            beta_sfm = F.softmax(beta, 1)
            pi_s_sfm = F.softmax(pi_s, 1)
            pi_l_sfm = F.softmax(pi_l, 1)
            pi_s_tau_sfm = F.softmax(pi_s_tau, 1)
            pi_l_tau_sfm = F.softmax(pi_l_tau, 1)

            beta_fix = Variable(beta_sfm.data[self.batch_range, a_index_np],
                                requires_grad=False)
            pi_s_fix = Variable(pi_s_sfm.data[self.batch_range, a_index_np],
                                requires_grad=False)
            pi_l_fix = Variable(pi_l_sfm.data[self.batch_range, a_index_np],
                                requires_grad=False)
            pi_s_tau_fix = Variable(pi_s_tau_sfm.data[self.batch_range,
                                                      a_index_np],
                                    requires_grad=False)
            pi_l_tau_fix = Variable(pi_l_tau_sfm.data[self.batch_range,
                                                      a_index_np],
                                    requires_grad=False)

            loss_pi_s = self.alpha_pi_s * self.loss_fn_pi_s(pi_s, a_index)
            loss_pi_s = (loss_pi_s * Rs *
                         self.off_factor(pi_s_fix, beta_fix)).mean()

            loss_pi_l = self.alpha_pi_l * self.loss_fn_pi_l(pi_l, a_index)
            loss_pi_l = (loss_pi_l * Rl *
                         self.off_factor(pi_l_fix, beta_fix)).mean()

            loss_pi_s_tau = self.alpha_pi_s_tau * self.loss_fn_pi_s_tau(
                pi_s_tau, a_index)
            w = self.get_weighted_loss(F.softmax(qs, 1), self.short_bins_torch)
            loss_pi_s_tau = (loss_pi_s_tau * w *
                             self.off_factor(pi_s_tau_fix, beta_fix)).mean()

            loss_pi_l_tau = self.alpha_pi_l_tau * self.loss_fn_pi_l_tau(
                pi_l_tau, a_index)
            w = self.get_weighted_loss(F.softmax(ql, 1), self.long_bins_torch)
            loss_pi_l_tau = (loss_pi_l_tau * w *
                             self.off_factor(pi_l_tau_fix, beta_fix)).mean()

            # collect actions statistics
            a_index_np = a_index.data.cpu().numpy()

            _, beta_index = beta.data.cpu().max(1)
            beta_index = beta_index.numpy()
            act_diff = (a_index_np != beta_index).astype(int)

            # add results
            results['act_diff'].append(act_diff)
            results['a_agent'].append(beta_index)
            results['a_player'].append(a_index_np)
            results['loss_vs'].append(loss_vs.data.cpu().numpy()[0])
            results['loss_vl'].append(loss_vl.data.cpu().numpy()[0])
            results['loss_b'].append(loss_b.data.cpu().numpy()[0])
            results['loss_qs'].append(loss_qs.data.cpu().numpy()[0])
            results['loss_ql'].append(loss_ql.data.cpu().numpy()[0])
            results['loss_pi_s'].append(loss_pi_s.data.cpu().numpy()[0])
            results['loss_pi_l'].append(loss_pi_l.data.cpu().numpy()[0])
            results['loss_pi_s_tau'].append(
                loss_pi_s_tau.data.cpu().numpy()[0])
            results['loss_pi_l_tau'].append(
                loss_pi_l_tau.data.cpu().numpy()[0])
            results['n'].append(n)

            if not (n + 1) % n_interval:
                results['s'] = s.data.cpu()
                results['act_diff'] = np.concatenate(results['act_diff'])
                results['a_agent'] = np.concatenate(results['a_agent'])
                results['a_player'] = np.concatenate(results['a_player'])
                yield results
                self.model.eval()
                # self.target.eval()
                results = {key: [] for key in results}

    def play_stochastic(self, n_tot):
        raise NotImplementedError

    def play_episode(self, n_tot):

        self.model.eval()
        env = Env()

        n_human = 120
        humans_trajectories = iter(self.data)
        softmax = torch.nn.Softmax(dim=-1)

        # mask = torch.FloatTensor(consts.actions_mask[args.game])
        # mask = Variable(mask.cuda(), requires_grad=False)

        vsx = torch.FloatTensor(consts.short_bins[args.game])
        vlx = torch.FloatTensor(consts.long_bins[args.game])

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]
            choices = np.arange(self.global_action_space, dtype=int)

            j = 0

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)
                vs, vl, beta, qs, ql, phi, pi_s, pi_l, pi_s_tau, pi_l_tau = self.model(
                    s, self.actions_matrix)
                beta = beta.squeeze(0)
                pi_l = pi_l.squeeze(0)
                pi_s = pi_s.squeeze(0)
                pi_l_tau = pi_l_tau.squeeze(0)
                pi_s_tau = pi_s_tau.squeeze(0)

                temp = 1

                # consider only 3 most frequent actions
                beta_np = beta.data.cpu().numpy()
                indices = np.argsort(beta_np)

                maskb = Variable(torch.FloatTensor(
                    [0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                                 requires_grad=False).cuda()
                # maskb = Variable(torch.FloatTensor([0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                #                  requires_grad=False).cuda()

                # pi = maskb * (beta / beta.max())

                pi = beta
                self.greedy = True

                beta_prob = pi

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    eps = np.random.rand()
                    # a = np.random.choice(choices)
                    if self.greedy and eps > 0.1:
                        a = pi.data.cpu().numpy()
                        a = np.argmax(a)
                    else:
                        a = softmax(pi / temp).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                env.step(a)

                vs = softmax(vs)
                vl = softmax(vl)
                vs = torch.sum(vsx * vs.data.cpu())
                vl = torch.sum(vlx * vl.data.cpu())

                yield {
                    'o': env.s.cpu().numpy(),
                    'vs': np.array([vs]),
                    'vl': np.array([vl]),
                    's': phi.data.cpu().numpy(),
                    'score': env.score,
                    'beta': beta_prob.data.cpu().numpy(),
                    'phi': phi.squeeze(0).data.cpu().numpy(),
                    'qs': qs.squeeze(0).data.cpu().numpy(),
                    'ql': ql.squeeze(0).data.cpu().numpy(),
                }

                j += 1

        return

    def policy(self, vs, vl, beta, qs, ql):
        pass
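
# Illustration only (not part of the class above): batched_interp applies
# np.interp row by row, one interpolation per batch element. A standalone
# check of that operation, with made-up numbers:
import numpy as np

x = np.array([[0.25, 0.75],
              [0.10, 0.90]])        # query points, one row per batch element
xp = np.array([[0.0, 0.5, 1.0],
               [0.0, 0.5, 1.0]])    # support bins per batch element
fp = np.array([[0.0, 1.0, 0.0],
               [1.0, 2.0, 3.0]])    # values at the support bins

y = np.stack([np.interp(xi, xpi, fpi) for xi, xpi, fpi in zip(x, xp, fp)])
print(y)  # [[0.5 0.5]
          #  [1.2 2.8]]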
Example #13
# train the model
env = Environment1(train)
env.reset()

input_size = env.history_t + 1
output_size = 3
USE_CUDA = False
LR = 0.001

torch.manual_seed(0)

Q = ActorCritic(input_size, output_size)
Q_ast = copy.deepcopy(Q)

if USE_CUDA:
    Q = Q.cuda()
loss_function = nn.MSELoss()
optimizer = optim.Adam(list(Q.parameters()), lr=LR)

epoch_num = 50
step_max = len(env.data) - 1
memory_size = 200
batch_size = 50
gamma = 0.97

obs, reward, done = env.step(5)

memory = []
total_step = 0
total_rewards = []
total_losses = []
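
# The listing above stops after the setup; the function below is only a sketch
# of the replay-style update that these variables (Q, Q_ast, memory, gamma,
# batch_size, loss_function, optimizer) suggest. The (obs, action, reward,
# next_obs, done) layout of the memory entries and the output shape of Q are
# assumptions, not taken from the original code.
import random

import numpy as np
import torch


def replay_update(Q, Q_ast, memory, batch_size, gamma, optimizer, loss_function):
    # sample a minibatch of transitions from the replay memory
    batch = random.sample(memory, batch_size)
    obs = torch.FloatTensor(np.array([b[0] for b in batch]))
    actions = torch.LongTensor([b[1] for b in batch])
    rewards = torch.FloatTensor([b[2] for b in batch])
    next_obs = torch.FloatTensor(np.array([b[3] for b in batch]))
    dones = torch.FloatTensor([float(b[4]) for b in batch])

    # Q-values of the actions actually taken
    q = Q(obs).gather(1, actions.unsqueeze(1)).squeeze(1)
    # bootstrapped target from the frozen copy Q_ast
    with torch.no_grad():
        target = rewards + gamma * (1 - dones) * Q_ast(next_obs).max(1)[0]

    loss = loss_function(q, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()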
Example #14
def train(rank, args, shared_model, optimizer=None):

    mse_loss = torch.nn.MSELoss()
    nll_loss = torch.nn.NLLLoss()

    torch.manual_seed(args.seed + rank)

    env = env_wrapper.create_doom(args.record, outdir=args.outdir)
    num_outputs = env.action_space.n
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.cuda()
    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0

    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)
        values = []
        log_probs = []
        rewards = []
        entropies = []

        inverses = []
        forwards = []
        actions = []
        vec_st1s = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)).cuda(), (hx.cuda(), cx.cuda())),
                icm=False)
            s_t = state
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)
            # sample an action
            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))

            oh_action = torch.Tensor(1, num_outputs)
            oh_action.zero_()

            oh_action.scatter_(1, action.cpu(), 1)
            oh_action = Variable(oh_action).cuda()
            a_t = oh_action
            actions.append(oh_action)
            state, reward, done, _ = env.step(action.cpu().numpy()[0][0])
            if done:
                # print('total reward', _['TOTAL_REWARD'])
                print('reward ', reward)
                # print('kill count', _['KILLCOUNT'])

            state = torch.from_numpy(state)
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)
            s_t1 = state
            vec_st1, inverse, forward = model(
                (Variable(s_t.unsqueeze(0)).cuda(), Variable(
                    s_t1.unsqueeze(0)).cuda(), a_t),
                icm=True)

            reward_intrinsic = args.eta * (
                (vec_st1 - forward).pow(2)).sum(1) / 2.
            #reward_intrinsic = args.eta * ((vec_st1 - forward).pow(2)).sum(1).sqrt() / 2.
            reward_intrinsic = reward_intrinsic.cpu().data.numpy()[0]
            reward += reward_intrinsic

            if done:
                print('done at ', episode_length * args.num_steps)
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            vec_st1s.append(vec_st1)
            inverses.append(inverse)
            forwards.append(forward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model(
                (Variable(state.unsqueeze(0)).cuda(), (hx.cuda(), cx.cuda())),
                icm=False)
            R = value.cpu().data

        values.append(Variable(R).cuda())
        policy_loss = 0
        value_loss = 0
        inverse_loss = 0
        forward_loss = 0

        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i].cpu()
            #print value_loss
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimataion
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].cpu().data - values[i].cpu().data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i].cpu() * Variable(gae) - 0.01 * entropies[i].cpu()

            cross_entropy = -(actions[i] *
                              torch.log(inverses[i] + 1e-15)).sum(1)
            inverse_loss = inverse_loss + cross_entropy
            forward_err = forwards[i] - vec_st1s[i]
            forward_loss = forward_loss + 0.5 * (forward_err.pow(2)).sum(1)

        optimizer.zero_grad()

        ((1 - args.beta) * inverse_loss +
         args.beta * forward_loss).backward(retain_graph=True)
        (args.lmbda * (policy_loss + 0.5 * value_loss)).backward()

        #(((1-args.beta) * inverse_loss + args.beta * forward_loss) + args.lmbda * (policy_loss + 0.5 * value_loss)).backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)
        if (episode_length + 1) % 50 == 0:
            log = 'step %d: forward loss %.5f, inverse loss %.5f, cross_entropy %.5f \n' % (
                episode_length, forward_loss, inverse_loss, cross_entropy)
            print(log)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
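
# train() calls ensure_shared_grads(), which is not part of this listing.
# A minimal sketch (an assumption about this project's helper, not its actual
# code): copy the local, possibly GPU-resident gradients into the shared CPU
# model before optimizer.step() updates the shared parameters.
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if param.grad is not None:
            # the local model lives on the GPU here, so move the grad to CPU
            shared_param._grad = param.grad.cpu()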
Example #15
File: main.py Project: mufeili/CVAE_PG
def main():
    time_str = time.strftime("%Y%m%d-%H%M%S")
    print('time_str: ', time_str)

    exp_count = 0

    if args.experiment == 'a|s':
        direc_name_ = '_'.join([args.env, args.experiment])
    else:
        direc_name_ = '_'.join(
            [args.env, args.experiment, 'bp2VAE',
             str(args.bp2VAE)])

    direc_name_exist = True

    while direc_name_exist:
        exp_count += 1
        direc_name = '/'.join([direc_name_, str(exp_count)])
        direc_name_exist = os.path.exists(direc_name)

    try:
        os.makedirs(direc_name)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    if args.tensorboard_dir is None:
        logger = Logger('/'.join([direc_name, time_str]))
    else:
        logger = Logger(args.tensorboard_dir)

    env = gym.make(args.env)

    if args.wrapper:
        if args.video_dir is None:
            args.video_dir = '/'.join([direc_name, 'videos'])
        env = gym.wrappers.Monitor(env, args.video_dir, force=True)

    print('observation_space: ', env.observation_space)
    print('action_space: ', env.action_space)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    if args.experiment == 'a|s':
        dim_x = env.observation_space.shape[0]
    elif args.experiment == 'a|z(s)' or args.experiment == 'a|z(s, s_next)' or \
            args.experiment == 'a|z(a_prev, s, s_next)':
        dim_x = args.z_dim

    policy = ActorCritic(input_size=dim_x,
                         hidden1_size=3 * dim_x,
                         hidden2_size=6 * dim_x,
                         action_size=env.action_space.n)

    if args.use_cuda:
        Tensor = torch.cuda.FloatTensor
        torch.cuda.manual_seed_all(args.seed)
        policy.cuda()
    else:
        Tensor = torch.FloatTensor

    policy_optimizer = optim.Adam(policy.parameters(), lr=args.policy_lr)

    if args.experiment != 'a|s':
        from util import ReplayBuffer, vae_loss_function

        dim_s = env.observation_space.shape[0]

        if args.experiment == 'a|z(s)' or args.experiment == 'a|z(s, s_next)':
            from model import VAE
            vae = VAE(input_size=dim_s,
                      hidden1_size=3 * args.z_dim,
                      hidden2_size=args.z_dim)

        elif args.experiment == 'a|z(a_prev, s, s_next)':
            from model import CVAE
            vae = CVAE(input_size=dim_s,
                       class_size=1,
                       hidden1_size=3 * args.z_dim,
                       hidden2_size=args.z_dim)

        if args.use_cuda:
            vae.cuda()
        vae_optimizer = optim.Adam(vae.parameters(), lr=args.vae_lr)

        if args.experiment == 'a|z(s)':
            from util import Transition_S2S as Transition
        elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)':
            from util import Transition_S2SNext as Transition

        buffer = ReplayBuffer(args.buffer_capacity, Transition)

        update_vae = True

    if args.experiment == 'a|s':
        from util import Record_S
    elif args.experiment == 'a|z(s)':
        from util import Record_S2S
    elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)':
        from util import Record_S2SNext

    def train_actor_critic(n):
        saved_info = policy.saved_info

        R = 0
        cum_returns_ = []

        for r in policy.rewards[::-1]:
            R = r + args.gamma * R
            cum_returns_.insert(0, R)

        cum_returns = Tensor(cum_returns_)
        cum_returns = (cum_returns - cum_returns.mean()) \
                      / (cum_returns.std() + np.finfo(np.float32).eps)
        cum_returns = Variable(cum_returns, requires_grad=False).unsqueeze(1)

        batch_info = SavedInfo(*zip(*saved_info))
        batch_log_prob = torch.cat(batch_info.log_prob)
        batch_value = torch.cat(batch_info.value)

        batch_adv = cum_returns - batch_value
        policy_loss = -torch.sum(batch_log_prob * batch_adv)
        value_loss = F.smooth_l1_loss(batch_value,
                                      cum_returns,
                                      size_average=False)

        policy_optimizer.zero_grad()
        total_loss = policy_loss + value_loss
        total_loss.backward()
        policy_optimizer.step()

        if args.use_cuda:
            logger.scalar_summary('value_loss', value_loss.data.cpu()[0], n)
            logger.scalar_summary('policy_loss', policy_loss.data.cpu()[0], n)

            all_value_loss.append(value_loss.data.cpu()[0])
            all_policy_loss.append(policy_loss.data.cpu()[0])
        else:
            logger.scalar_summary('value_loss', value_loss.data[0], n)
            logger.scalar_summary('policy_loss', policy_loss.data[0], n)

            all_value_loss.append(value_loss.data[0])
            all_policy_loss.append(policy_loss.data[0])

        del policy.rewards[:]
        del policy.saved_info[:]

    if args.experiment != 'a|s':

        def train_vae(n):

            train_times = (n // args.vae_update_frequency -
                           1) * args.vae_update_times

            for i in range(args.vae_update_times):
                train_times += 1

                sample = buffer.sample(args.batch_size)
                batch = Transition(*zip(*sample))
                state_batch = torch.cat(batch.state)

                if args.experiment == 'a|z(s)':
                    recon_batch, mu, log_var = vae.forward(state_batch)

                    mse_loss, kl_loss = vae_loss_function(
                        recon_batch,
                        state_batch,
                        mu,
                        log_var,
                        logger,
                        train_times,
                        kl_discount=args.kl_weight,
                        mode=args.experiment)

                elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)':
                    next_state_batch = Variable(torch.cat(batch.next_state),
                                                requires_grad=False)
                    predicted_batch, mu, log_var = vae.forward(state_batch)
                    mse_loss, kl_loss = vae_loss_function(
                        predicted_batch,
                        next_state_batch,
                        mu,
                        log_var,
                        logger,
                        train_times,
                        kl_discount=args.kl_weight,
                        mode=args.experiment)

                vae_loss = mse_loss + kl_loss

                vae_optimizer.zero_grad()
                vae_loss.backward()
                vae_optimizer.step()

                logger.scalar_summary('vae_loss', vae_loss.data[0],
                                      train_times)
                all_vae_loss.append(vae_loss.data[0])
                all_mse_loss.append(mse_loss.data[0])
                all_kl_loss.append(kl_loss.data[0])

    # To store cum_reward, value_loss and policy_loss from each episode
    all_cum_reward = []
    all_last_hundred_average = []
    all_value_loss = []
    all_policy_loss = []

    if args.experiment != 'a|s':
        # Store each vae_loss calculated
        all_vae_loss = []
        all_mse_loss = []
        all_kl_loss = []

    for episode in count(1):
        done = False
        state_ = torch.Tensor([env.reset()])
        cum_reward = 0

        if args.experiment == 'a|z(a_prev, s, s_next)':
            action = random.randint(0, 2)
            state_, reward, done, info = env.step(action)
            cum_reward += reward
            state_ = torch.Tensor([np.append(state_, action)])

        while not done:
            if args.experiment == 'a|s':
                state = Variable(state_, requires_grad=False)
            elif args.experiment == 'a|z(s)' or args.experiment == 'a|z(s, s_next)' \
                    or args.experiment == 'a|z(a_prev, s, s_next)':
                state_ = Variable(state_, requires_grad=False)
                mu, log_var = vae.encode(state_)

                if args.bp2VAE and update_vae:
                    state = vae.reparametrize(mu, log_var)
                else:
                    state = vae.reparametrize(mu, log_var).detach()

            action_ = policy.select_action(state)

            if args.use_cuda:
                action = action_.cpu()[0, 0]
            else:
                action = action_[0, 0]

            next_state_, reward, done, info = env.step(action)
            next_state_ = torch.Tensor([next_state_])
            cum_reward += reward

            if args.render:
                env.render()

            policy.rewards.append(reward)

            if args.experiment == 'a|z(s)':
                buffer.push(state_)
            elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)':
                if not done:
                    buffer.push(state_, next_state_)

            if args.experiment == 'a|z(a_prev, s, s_next)':
                next_state_ = torch.cat(
                    [next_state_, torch.Tensor([action])], 1)

            state_ = next_state_

        train_actor_critic(episode)
        last_hundred_average = sum(all_cum_reward[-100:]) / 100

        logger.scalar_summary('cum_reward', cum_reward, episode)
        logger.scalar_summary('last_hundred_average', last_hundred_average,
                              episode)

        all_cum_reward.append(cum_reward)
        all_last_hundred_average.append(last_hundred_average)

        if args.experiment != 'a|s' and update_vae:
            if episode % args.vae_update_frequency == 0:
                assert len(buffer) >= args.batch_size
                train_vae(episode)

            if len(all_vae_loss) > 1000:
                if abs(
                        sum(all_vae_loss[-500:]) / 500 -
                        sum(all_vae_loss[-1000:-500]) /
                        500) < args.vae_update_threshold:
                    update_vae = False

        if episode % args.log_interval == 0:
            print(
                'Episode {}\tLast cum return: {:5f}\t100-episodes average cum return: {:.2f}'
                .format(episode, cum_reward, last_hundred_average))

        if episode > args.num_episodes:
            print("100-episodes average cum return is now {} and "
                  "the last episode's cum return is {}!".format(
                      last_hundred_average, cum_reward))
            env.close()
            torch.save(policy, '/'.join([direc_name, 'model']))

            if args.experiment == 'a|s':
                record = Record_S(
                    policy_loss=all_policy_loss,
                    value_loss=all_value_loss,
                    cum_reward=all_cum_reward,
                    last_hundred_average=all_last_hundred_average)
            elif args.experiment == 'a|z(s)':
                record = Record_S2S(
                    policy_loss=all_policy_loss,
                    value_loss=all_value_loss,
                    cum_reward=all_cum_reward,
                    last_hundred_average=all_last_hundred_average,
                    mse_recon_loss=all_mse_loss,
                    kl_loss=all_kl_loss,
                    vae_loss=all_vae_loss)
            elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)':
                record = Record_S2SNext(
                    policy_loss=all_policy_loss,
                    value_loss=all_value_loss,
                    cum_reward=all_cum_reward,
                    last_hundred_average=all_last_hundred_average,
                    mse_pred_loss=all_mse_loss,
                    kl_loss=all_kl_loss,
                    vae_loss=all_vae_loss)

            pickle.dump(record, open('/'.join([direc_name, 'record']), 'wb'))

            break
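
# main() refers to a few names that are not defined in this excerpt (args,
# Logger, SavedInfo, ...). The definitions below are assumptions added for
# orientation only; the real ones presumably live in the project's util/model
# modules and argument parser.
from collections import namedtuple

# assumed shape, matching SavedInfo(*zip(*saved_info)) and the
# .log_prob / .value accesses inside train_actor_critic()
SavedInfo = namedtuple('SavedInfo', ['log_prob', 'value'])

if __name__ == '__main__':
    main()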