Example #1
def thunk():
    # Build one task-sampling GroundingEnv for a worker process; `args`,
    # `rank` and `variant` come from the enclosing scope of this closure.
    _env = grounding_env.GroundingEnv(args,
                                      args.seed + rank,
                                      img_encoder=None,
                                      fixed=False,
                                      manual_set_task=True,
                                      n_stack=variant['n_stack'])
    _env.game_init()
    _env.tasks = _env.sample_tasks(variant['task_params']['n_tasks'],
                                   variants=variant['all_tasks'])
    return _env

# The remainder of this example is setup code from the script's entry point;
# its enclosing block is not shown in the listing, hence the indentation.
    if args.evaluate == 0:
        args.use_train_instructions = 1
        log_filename = "train.log"
    elif args.evaluate == 1:
        args.use_train_instructions = 1
        args.num_processes = 0
        log_filename = "test-MT.log"
    elif args.evaluate == 2:
        args.use_train_instructions = 0
        args.num_processes = 0
        log_filename = "test-ZSL.log"
    else:
        assert False, "Invalid evaluation type"

    env = grounding_env.GroundingEnv(args)
    args.input_size = len(env.word_to_idx)

    # Setup logging
    if not os.path.exists(args.dump_location):
        os.makedirs(args.dump_location)
    logging.basicConfig(filename=args.dump_location + log_filename,
                        level=logging.INFO)

    shared_model = A3C_LSTM_GA(args)

    # Load the model
    if (args.load != "0"):
        shared_model.load_state_dict(
            torch.load(args.load, map_location=lambda storage, loc: storage))
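
Example #1 stops right after the checkpoint is loaded into shared_model. In an A3C setup like this one, the usual next step is to move the shared parameters into shared memory and launch one training process per worker (plus a monitoring/test process) with torch.multiprocessing; the train and test functions referenced below are the ones shown in the later examples. This is only a minimal sketch of that launch code under those assumptions, not the original script's continuation:

import torch.multiprocessing as mp

shared_model.share_memory()  # expose parameters to all worker processes

processes = []

# One evaluation/monitoring process (rank chosen so it does not collide
# with the training ranks).
p = mp.Process(target=test, args=(args.num_processes, args, shared_model))
p.start()
processes.append(p)

# One training worker per process.
for rank in range(args.num_processes):
    p = mp.Process(target=train, args=(rank, args, shared_model))
    p.start()
    processes.append(p)

for p in processes:
    p.join()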
Example #3
def train(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = grounding_env.GroundingEnv(args)
    env.game_init()

    model = A3C_LSTM_GA(args)

    if (args.load != "0"):
        print(str(rank) + " Loading model ... " + args.load)
        model.load_state_dict(
            torch.load(args.load, map_location=lambda storage, loc: storage))

    model.train()

    optimizer = optim.SGD(shared_model.parameters(), lr=args.lr)

    p_losses = []
    v_losses = []

    (image, instruction), _, _, _ = env.reset()
    instruction_idx = []
    for word in instruction.split(" "):
        instruction_idx.append(env.word_to_idx[word])
    instruction_idx = np.array(instruction_idx)

    image = torch.from_numpy(image).float()/255.0
    instruction_idx = torch.from_numpy(instruction_idx).view(1, -1)

    done = True

    episode_length = 0
    num_iters = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            episode_length = 0
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))

        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            tx = Variable(torch.from_numpy(np.array([episode_length])).long())

            value, logit, (hx, cx) = model((Variable(image.unsqueeze(0)),
                                            Variable(instruction_idx),
                                            (tx, hx, cx)))
            prob = F.softmax(logit, dim=1)
            log_prob = F.log_softmax(logit, dim=1)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial(1).data
            log_prob = log_prob.gather(1, Variable(action))

            action = action.numpy()[0, 0]
            (image, _), reward, done, _ = env.step(action)

            done = done or episode_length >= args.max_episode_length

            if done:
                (image, instruction), _, _, _ = env.reset()
                instruction_idx = []
                for word in instruction.split(" "):
                    instruction_idx.append(env.word_to_idx[word])
                instruction_idx = np.array(instruction_idx)
                instruction_idx = torch.from_numpy(
                        instruction_idx).view(1, -1)

            image = torch.from_numpy(image).float()/255.0

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            tx = Variable(torch.from_numpy(np.array([episode_length])).long())
            value, _, _ = model((Variable(image.unsqueeze(0)),
                                 Variable(instruction_idx), (tx, hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)

        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()

        p_losses.append(policy_loss.data[0, 0])
        v_losses.append(value_loss.data[0, 0])

        if (len(p_losses) > 1000):
            num_iters += 1
            print(" ".join([
                  "Training thread: {}".format(rank),
                  "Num iters: {}K".format(num_iters),
                  "Avg policy loss: {}".format(np.mean(p_losses)),
                  "Avg value loss: {}".format(np.mean(v_losses))]))
            logging.info(" ".join([
                  "Training thread: {}".format(rank),
                  "Num iters: {}K".format(num_iters),
                  "Avg policy loss: {}".format(np.mean(p_losses)),
                  "Avg value loss: {}".format(np.mean(v_losses))]))
            p_losses = []
            v_losses = []

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = grounding_env.GroundingEnv(args)
    env.game_init()

    model = A3C_LSTM_GA(args)

    if (args.load != "0"):
        print("Loading model ... " + args.load)
        model.load_state_dict(
            torch.load(args.load, map_location=lambda storage, loc: storage))

    model.eval()

    (image, instruction), _, _, _ = env.reset()

    # Print instruction while evaluating and visualizing
    if args.evaluate != 0 and args.visualize == 1:
        print("Instruction: {} ".format(instruction))

    # Getting indices of the words in the instruction
    instruction_idx = []
    for word in instruction.split(" "):
        instruction_idx.append(env.word_to_idx[word])
    instruction_idx = np.array(instruction_idx)

    image = torch.from_numpy(image).float() / 255.0
    instruction_idx = torch.from_numpy(instruction_idx).view(1, -1)

    reward_sum = 0
    done = True

    start_time = time.time()

    episode_length = 0
    rewards_list = []
    accuracy_list = []
    episode_length_list = []
    num_episode = 0
    best_reward = 0.0
    test_freq = 50
    while True:
        episode_length += 1
        if done:
            if (args.evaluate == 0):
                model.load_state_dict(shared_model.state_dict())

            # volatile Variables (pre-0.4 PyTorch) run the forward pass without
            # building an autograd graph, since test() never backpropagates.
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        tx = Variable(torch.from_numpy(np.array([episode_length])).long(),
                      volatile=True)

        value, logit, (hx, cx) = model((Variable(image.unsqueeze(0),
                                                 volatile=True),
                                        Variable(instruction_idx,
                                                 volatile=True), (tx, hx, cx)))
        prob = F.softmax(logit, dim=1)
        action = prob.max(1)[1].data.numpy()

        (image, _), reward, done, _ = env.step(action[0])

        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        if done:
            num_episode += 1
            rewards_list.append(reward_sum)
            # Print reward while evaluating and visualizing
            if args.evaluate != 0 and args.visualize == 1:
                print("Total reward: {}".format(reward_sum))

            episode_length_list.append(episode_length)
            if reward == CORRECT_OBJECT_REWARD:
                accuracy = 1
            else:
                accuracy = 0
            accuracy_list.append(accuracy)
            if (len(rewards_list) >= test_freq):
                print(" ".join([
                    "Time {},".format(
                        time.strftime("%Hh %Mm %Ss",
                                      time.gmtime(time.time() - start_time))),
                    "Avg Reward {},".format(np.mean(rewards_list)),
                    "Avg Accuracy {},".format(np.mean(accuracy_list)),
                    "Avg Ep length {},".format(np.mean(episode_length_list)),
                    "Best Reward {}".format(best_reward)
                ]))
                logging.info(" ".join([
                    "Time {},".format(
                        time.strftime("%Hh %Mm %Ss",
                                      time.gmtime(time.time() - start_time))),
                    "Avg Reward {},".format(np.mean(rewards_list)),
                    "Avg Accuracy {},".format(np.mean(accuracy_list)),
                    "Avg Ep length {},".format(np.mean(episode_length_list)),
                    "Best Reward {}".format(best_reward)
                ]))
                if np.mean(rewards_list) >= best_reward and args.evaluate == 0:
                    torch.save(model.state_dict(),
                               args.dump_location + "model_best")
                    best_reward = np.mean(rewards_list)

                rewards_list = []
                accuracy_list = []
                episode_length_list = []
            reward_sum = 0
            episode_length = 0
            (image, instruction), _, _, _ = env.reset()
            # Print instruction while evaluating and visualizing
            if args.evaluate != 0 and args.visualize == 1:
                print("Instruction: {} ".format(instruction))

            # Getting indices of the words in the instruction
            instruction_idx = []
            for word in instruction.split(" "):
                instruction_idx.append(env.word_to_idx[word])
            instruction_idx = np.array(instruction_idx)
            instruction_idx = torch.from_numpy(instruction_idx).view(1, -1)
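
Both train functions in these examples call ensure_shared_grads(model, shared_model) just before optimizer.step(), but the helper itself is not part of the listings. A common implementation in this style of A3C code (the pattern popularized by the pytorch-a3c reference implementation; treat it as a sketch of what the missing helper likely does rather than the exact code used here) points the shared parameters' gradients at the local worker's gradients:

def ensure_shared_grads(model, shared_model):
    # Copy gradient references from the local model to the shared model so
    # that the optimizer (built on shared_model.parameters()) applies this
    # worker's backward pass. If the shared grads are already set, assume
    # they alias the local ones and skip the copy.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad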
Example #5
def train(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = grounding_env.GroundingEnv(args)
    env.game_init()

    model = A3C_LSTM_GA(args)

    if (args.load != "0"):
        print(str(rank) + " Loading model ... " + args.load)
        model.load_state_dict(
            torch.load(args.load, map_location=lambda storage, loc: storage))

    model.train()

    optimizer = optim.SGD(shared_model.parameters(), lr=args.lr)

    p_losses = []
    v_losses = []

    (images, instruction), _, _, _ = env.reset()

    # Encode the instruction as word indices (missing in the original listing,
    # but required: instruction_idx is fed to the model inside the loop below).
    instruction_idx = []
    for word in instruction.split(" "):
        instruction_idx.append(env.word_to_idx[word])
    instruction_idx = torch.from_numpy(np.array(instruction_idx)).view(1, -1)

    images = torch.from_numpy(np.stack(images)).float() / 255.0
    done = True
    # lamb weights the A3C term added to policy_loss below; it must live
    # outside the commented-out curiosity block or it would be undefined.
    lamb = .2  # TODO tune this. Language grounding is important
    '''
    #Curiosity bookkeeping
    prevState = images
    prevAction = None
    beta = .2
    #eta = 1 #TODO tune this hyperparameter
    '''

    episode_length = 0
    num_iters = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            episode_length = 0
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))

        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        #Optimizing over this
        policy_loss = Variable(torch.zeros(1, 1))
        value_loss = 0

        for step in range(args.num_steps):
            episode_length += 1
            tx = Variable(torch.from_numpy(np.array([episode_length])).long())

            value, logit, (hx, cx) = model(
                (Variable(images), Variable(instruction_idx), (tx, hx, cx)),
                teacher=True,
                inverse=False)
            prob = F.softmax(logit, dim=1)
            log_prob = F.log_softmax(logit, dim=1)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            # Sample one action from the policy distribution.
            action = prob.multinomial(1).data
            log_prob = log_prob.gather(1, Variable(action))
            oldAction = action

            action = action.numpy()[0, 0]
            (images, _), reward, done, _ = env.step(action)
            #Process entire last 5 images into cnn

            done = done or episode_length >= args.max_episode_length

            if done:
                (images, instruction), _, _, _ = env.reset()
                instruction_idx = []
                for word in instruction.split(" "):
                    instruction_idx.append(env.word_to_idx[word])
                instruction_idx = np.array(instruction_idx)
                instruction_idx = torch.from_numpy(instruction_idx).view(1, -1)

            #We stack now because we take in last 5 images.
            images = torch.from_numpy(np.stack(images)).float() / 255.0

            #curiosity loss and reward. This is done in pretraining now.
            '''
            if prevAction is not None:
                pred_action = model((Variable(prevState),
                                Variable(images)), 
                                teacher=False, inverse=True)
                a_prob = F.softmax(pred_action) 
                a_loss = 1/2 * torch.norm(a_prob - prob)
                #Because we have access to softmax, might as well use it TODO
                #actionTensor = torch.eye(3)[prevAction[0]]

                pred_state = model((Variable(prevState), prevAction[0]), teacher=False, inverse=False)
                #We are predicting final next state
                s_loss = 1/2 * torch.norm(pred_state - model.getImageRep(Variable(images[-1].unsqueeze(0))))
                policy_loss += (1-beta) * a_loss + beta * s_loss
                #curReward += eta * s_loss.item()

            #Updating curiosity
            prevAction = oldAction
            prevState = images
            '''

            values.append(value)  # critic in actor-critic
            log_probs.append(log_prob)
            rewards.append(reward)  # +2 if found, -.1 if not found, plus intrinsic

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            tx = Variable(torch.from_numpy(np.array([episode_length])).long())
            value, _, _ = model(
                (Variable(images), Variable(instruction_idx), (tx, hx, cx)),
                teacher=True,
                inverse=False)
            R = value.data

        values.append(Variable(R))
        R = Variable(R)

        new_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            new_loss = new_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        policy_loss += lamb * new_loss
        optimizer.zero_grad()

        p_losses.append(policy_loss.data[0, 0])
        v_losses.append(value_loss.data[0, 0])

        if (len(p_losses) > 1000):
            num_iters += 1
            print(" ".join([
                "Training thread: {}".format(rank),
                "Num iters: {}K".format(num_iters),
                "Avg policy loss: {}".format(np.mean(p_losses)),
                "Avg value loss: {}".format(np.mean(v_losses))
            ]))
            logging.info(" ".join([
                "Training thread: {}".format(rank),
                "Num iters: {}K".format(num_iters),
                "Avg policy loss: {}".format(np.mean(p_losses)),
                "Avg value loss: {}".format(np.mean(v_losses))
            ]))
            p_losses = []
            v_losses = []

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
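
The reversed loop over rewards in both train functions computes discounted returns and Generalized Advantage Estimation with discount gamma and GAE parameter tau, then builds the policy and value losses from them. The same return/advantage computation, restated as a standalone function over plain Python floats (purely illustrative; the training code above keeps everything as tensors so gradients can flow through log_probs and values):

def gae_returns(rewards, values, bootstrap_value, gamma, tau):
    """Return (returns, advantages) for one rollout, ordered by time step.

    `values` holds the critic's estimate for each of the len(rewards) steps;
    `bootstrap_value` is its estimate for the state after the last step
    (0.0 if the episode terminated there).
    """
    values = list(values) + [bootstrap_value]
    R = bootstrap_value
    gae = 0.0
    returns, advantages = [], []
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]                                # discounted return
        delta_t = rewards[i] + gamma * values[i + 1] - values[i]  # TD residual
        gae = gae * gamma * tau + delta_t                         # GAE accumulator
        returns.append(R)
        advantages.append(gae)
    return returns[::-1], advantages[::-1]

In the losses above, each step then contributes -log_prob * advantage - 0.01 * entropy to the policy loss and 0.5 * (return - value)^2 to the value loss.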