Example #1
def test(rank):
    act_model_kwargs = {'question_vocab': load_vocab(args.vocab_json)}
    act_model = actPlannerBaseModel(**act_model_kwargs)
    act_checkpoint = torch.load(args.nav_weight)  #load checkpoint weights
    act_model.load_state_dict(act_checkpoint['state'])  #restore model weights
    print('--- act_model loaded checkpoint ---')

    res_model_dir = os.path.abspath("../train/models/resnet101.pth")
    my_map_cnn = mapCNN(checkpoint_path=res_model_dir)
    map_checkpoint = torch.load('mapcnn.pt',
                                map_location='cpu')  #load checkpoint weights
    my_map_cnn.load_state_dict(map_checkpoint['state'])  #restore map model weights
    print('--- map_model loaded checkpoint ---')

    cnn_model_dir = os.path.abspath("../train/models/03_13_h3d_hybrid_cnn.pt")
    cnn_kwargs = {
        'num_classes': 191,
        'pretrained': True,
        'checkpoint_path': cnn_model_dir
    }
    cnn = MultitaskCNN(**cnn_kwargs)
    cnn.eval()
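    # eval() keeps BatchNorm/Dropout in inference mode; the CNN below is used
    # purely as a frozen feature extractor and is never updated in this script.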

    vocab_dir = os.path.abspath("vocab.json")
    vocab_file = open(vocab_dir, 'r', encoding='utf-8')
    vocab = json.load(vocab_file)
    vocab_file.close()

    planner_hidden = None
    max_action = 30
    position = [0, 0]
    action_in_raw = [0]  #start action_in
    actions = []
    question = args.question
    print(question)
    questionTokens = tokenize(question,
                              punctToRemove=['?'],
                              addStartToken=False)

    encoded_question_raw = encode(questionTokens, vocab['questionTokenToIdx'])
    while (len(encoded_question_raw) < 10):
        encoded_question_raw.append(0)  #encode question
    encoded_question_raw = np.array(encoded_question_raw)
    encoded_question_tensor = _dataset_to_tensor(encoded_question_raw)
    encoded_question = Variable(encoded_question_tensor)
    encoded_question = encoded_question.unsqueeze(0)
    #print(encoded_question)
    action_times = 0
    push_signal = 0
    push_point = 0

    crop_w_offset = 470
    crop_w = 840
    crop_h_offset = 280
    crop_h = 690

    rgb_before = cv.imread(args.rgb_image_before_dir)
    depth_before = cv.imread(args.depth_image_before_dir)
    rgb_before_crop = rgb_before[crop_h_offset:crop_h_offset + crop_h,
                                 crop_w_offset:crop_w_offset + crop_w]
    depth_before_crop = depth_before[crop_h_offset:crop_h_offset + crop_h,
                                     crop_w_offset:crop_w_offset + crop_w]
    depth_before_crop = depth_before_crop[0]
    cv.imwrite(args.rgb_crop_before_dir_, rgb_before_crop)

    rgb_dim = rgb_before.shape
    rgb_crop_dim = rgb_before_crop.shape
    # print(depth_dim)
    # print(depth_crop_dim)

    depth_before = depth_before[0]
    rgb_before_resize = cv.resize(rgb_before_crop, (256, 256),
                                  interpolation=cv.INTER_AREA)
    depth_before_resize = cv.resize(depth_before_crop, (256, 256),
                                    interpolation=cv.INTER_AREA)
    '''
    print(depth_np.max())

    print(depth_np[0][5])
    print('.....')
    print(depth_np[1][5])
    print('.....')
    print(depth_np[2][5])
    print('.....')
    '''

    rgb_tensor, depth_tensor = rgbd2tensor(
        rgb_before_resize, depth_before_resize)  #output_heatmap
    heatmap_output = rgbd2heatmap(rgb_tensor, depth_tensor, my_map_cnn)
    f = h5py.File(args.heatmap_output_dir, 'w')
    f['heatmap'] = heatmap_output
    f.close()

    cv.imwrite(args.rgb_image_before_dir_, rgb_before_resize)
    cv.imwrite(args.depth_image_before_dir_, depth_before_resize)

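    # Planner roll-out: at each step the model predicts a single action id.
    # From the branches below, id 9 appears to mean STOP, id 0 raises the push
    # flag, and any other id is mapped by order2action() to a small (dx, dy)
    # offset that is accumulated into `position`.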
    while (action_times < max_action):

        #print(planner_img_feats_var.size())
        action_in_tensor = _dataset_to_tensor(action_in_raw)
        action_in = Variable(action_in_tensor)
        action_in = action_in.unsqueeze(0)
        action_in = action_in.unsqueeze(0)

        position_in, planner_img_feats_var = data2input(
            position, rgb_before_resize, cnn)

        output_data, planner_hidden = act_model.planner_step(
            encoded_question, planner_img_feats_var, action_in, position_in,
            planner_hidden)
        planner_possi = F.log_softmax(output_data, dim=1)
        planner_data = planner_possi.data.numpy()
        planner_data = planner_data[0]
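        # argmax via np.where: pick the index of the most probable action
        # (the first match if several actions tie for the maximum).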
        action_out = np.where(planner_data == np.max(planner_data))
        action_out = action_out[0][0]

        actions.append(action_out)
        action_in_raw = [action_out]
        if action_out == 9:
            #print('stop')
            break
        elif action_out == 0:
            push_signal = 1
            push_point = action_times
        else:
            dx, dy = order2action(action_out)
            position[0] += dx
            position[1] += dy
        action_times += 1

    if len(actions) > 2 and push_signal == 0:
        #action_position = position+position
        print('\n -- suck in {} '.format(position))

        # convert to correct position
        # crop_position = (int(position[0] / 256 * rgb_dim[1] - crop_w_offset),
        #                 int(position[1] / 256 * rgb_dim[0] - crop_h_offset))
        crop_position = (int(position[0] / 256 * rgb_crop_dim[1]),
                         int(position[1] / 256 * rgb_crop_dim[0]))

        # draw a red cross at position on cropped rgb
        rgb_before_crop = cv.drawMarker(rgb_before_crop,
                                        crop_position, (0, 0, 255),
                                        markerType=cv.MARKER_CROSS,
                                        markerSize=50,
                                        thickness=5,
                                        line_type=cv.LINE_AA)
        cv.imwrite(args.rgb_crop_before_dir_, rgb_before_crop)

        # draw the same on rgb_before_resize for comparison
        # convert to correct position
        crop_position_ = (int(position[0]), int(position[1]))
        # draw a red cross at position on cropped rgb
        rgb_before_resize = cv.drawMarker(rgb_before_resize,
                                          crop_position_, (0, 0, 255),
                                          markerType=cv.MARKER_CROSS,
                                          markerSize=10,
                                          thickness=5,
                                          line_type=cv.LINE_AA)
        cv.imwrite(args.rgb_image_before_dir_, rgb_before_resize)

    elif len(actions) > 2 and push_signal == 1:  #pushing
        position_start = [0, 0]
        position_end = [0, 0]
        for i in range(len(actions)):
            if i <= push_point:  #the first step
                dx, dy = order2action(actions[i])
                position_start[0] += dx
                position_start[1] += dy
                position_end[0] += dx
                position_end[1] += dy
            else:  #the second step
                dx, dy = order2action(actions[i])
                position_end[0] += dx
                position_end[1] += dy
        #action_position = position_start+position_end
        print('\n -- Push from {} to {}'.format(position_start, position_end))

        # convert to correct position
        # crop_position_start = (int(position_start[0] / 256 * rgb_dim[1] - crop_w_offset),
        #                         int(position_start[1] / 256 * rgb_dim[0] - crop_h_offset))
        # crop_position_end = (int(position_end[0] / 256 * rgb_dim[1] - crop_w_offset),
        #                         int(position_end[1] / 256 * rgb_dim[0] - crop_h_offset))
        crop_position_start = (int(position_start[0] / 256 * rgb_crop_dim[1]),
                               int(position_start[1] / 256 * rgb_crop_dim[0]))
        crop_position_end = (int(position_end[0] / 256 * rgb_crop_dim[1]),
                             int(position_end[1] / 256 * rgb_crop_dim[0]))

        # draw a red, 10pt arrow from position_start to position_end on cropped rgb
        rgb_before_crop = cv.arrowedLine(rgb_before_crop,
                                         crop_position_start,
                                         crop_position_end, (0, 0, 255),
                                         thickness=3,
                                         line_type=cv.LINE_AA)
        cv.imwrite(args.rgb_crop_before_dir_, rgb_before_crop)

        # draw the same on rgb_before_resize for comparison
        # convert to correct position
        crop_position_start_ = (int(position_start[0]), int(position_start[1]))
        crop_position_end_ = (int(position_end[0]), int(position_end[1]))

        # draw a red cross at position on cropped rgb
        rgb_before_resize = cv.arrowedLine(rgb_before_resize,
                                           crop_position_start_,
                                           crop_position_end_, (0, 0, 255),
                                           thickness=3,
                                           line_type=cv.LINE_AA)
        cv.imwrite(args.rgb_image_before_dir_, rgb_before_resize)

    else:
        print('\n -- No action')
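Example #1 leans on a few small project-local helpers that are not shown here. Below is a minimal sketch of what `_dataset_to_tensor` and `order2action` might look like, inferred only from how they are called above; the id-to-direction table and the step size are assumptions, not the project's actual values.

import numpy as np
import torch


def _dataset_to_tensor(data):
    # Hypothetical: wrap a list / ndarray / h5py dataset as a LongTensor,
    # matching calls such as _dataset_to_tensor(action_in_raw).
    return torch.LongTensor(np.asarray(data, dtype=np.int64))


def order2action(order, step=5):
    # Hypothetical: map action ids 1..8 to (dx, dy) pixel offsets on the
    # 256x256 planner image; the real mapping may differ.
    offsets = {
        1: (0, -step), 2: (0, step), 3: (-step, 0), 4: (step, 0),
        5: (-step, -step), 6: (step, -step), 7: (-step, step), 8: (step, step),
    }
    return offsets[int(order)]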
Example #2
def test(rank):
    nav_model_kwargs = {'question_vocab': load_vocab(args.vocab_json)}
    nav_model = NavPlannerControllerModel(**nav_model_kwargs)
    nav_checkpoint = torch.load(args.nav_weight)  #load checkpoint weights
    nav_model.load_state_dict(nav_checkpoint['state'])  #restore model weights
    print('--- nav_model loaded checkpoint ---')

    cnn_kwargs = {'num_classes': 191, 'pretrained': True}
    cnn = MultitaskCNN(**cnn_kwargs)
    cnn.eval()
    cnn.cuda()  #create cnn model

    vqa_model_kwargs = {'vocab': load_vocab(args.vocab_json)}
    vqa_model = VqaLstmCnnAttentionModel(**vqa_model_kwargs)
    vqa_checkpoint = torch.load(args.vqa_weight)  #load checkpoint weights
    vqa_model.load_state_dict(vqa_checkpoint['state'])
    print('--- vqa_model loaded checkpoint ---')

    # need cnn?

    scene = "test-10-obj-100.txt"
    my_env = enviroment.Environment(is_testing=0, testing_file=scene)
    object_exist_list = my_env.ur5.object_type
    print("Objetcts that exist: ")
    print(object_exist_list)  #create simulation enviroment

    my_question = Qusetion(object_exist_list)  #create testing question
    testing_questions = my_question.createQueue()
    vocab = my_question.create_vocab()

    for question in testing_questions:
        planner_hidden = None
        max_action = 30
        position = [0, 0]
        action_in_raw = [0]  #start action_in
        actions = []

        print(question['question'])  #question
        questionTokens = my_question.tokenize(question['question'],
                                              punctToRemove=['?'],
                                              addStartToken=False)
        encoded_question_raw = my_question.encode(questionTokens,
                                                  vocab['questionTokenToIdx'])
        encoded_question_raw.append(0)  #encode question
        encoded_question_raw = np.array(encoded_question_raw)
        encoded_question_tensor = _dataset_to_tensor(encoded_question_raw)
        encoded_question = Variable(encoded_question_tensor)
        encoded_question = encoded_question.unsqueeze(0)
        print(encoded_question)
        action_times = 0
        push_signal = 0
        push_point = 0

        while (action_times < max_action):

            #print(planner_img_feats_var.size())
            action_in_tensor = _dataset_to_tensor(action_in_raw)
            action_in = Variable(action_in_tensor)
            action_in = action_in.unsqueeze(0)
            action_in = action_in.unsqueeze(0)

            _, rgb_image_raw = my_env.camera.get_camera_data()  #before
            position_in, planner_img_feats_var = data2input(
                position, rgb_image_raw, cnn)

            output_data, planner_hidden = nav_model.planner_step(
                encoded_question, planner_img_feats_var, action_in,
                position_in, planner_hidden)
            planner_possi = F.log_softmax(output_data, dim=1)
            planner_data = planner_possi.data.numpy()
            planner_data = planner_data[0]
            action_out = np.where(planner_data == np.max(planner_data))
            action_out = action_out[0][0]

            actions.append(action_out)
            action_in_raw = [action_out]
            if action_out == 9:
                print('stop')
                break
            elif action_out == 0:
                push_signal = 1
                push_point = action_times
            else:
                dx, dy = order2action(action_out)
                position[0] += dx
                position[1] += dy
            action_times += 1

        if len(actions) > 2 and push_signal == 0:
            action_position = position + position
            my_env.UR5_action(action_position, 2)  #sucking
        elif len(actions) > 2 and push_signal == 1:  #pushing
            position_start = [0, 0]
            position_end = [0, 0]
            for i in range(len(actions)):
                if i <= push_point:  #the first step
                    dx, dy = order2action(actions[i])
                    position_start[0] += dx
                    position_start[1] += dy
                    position_end[0] += dx
                    position_end[1] += dy
                else:  #the second step
                    dx, dy = order2action(actions[i])
                    position_end[0] += dx
                    position_end[1] += dy
            action_position = position_start + position_end
            my_env.UR5_action(action_position, 1)  #pushing

        # get image after actions
        _, rgb_image_after = my_env.camera.get_camera_data(
        )  # image after actions
        shrink = cv.resize(rgb_image_after, (224, 224),
                           interpolation=cv.INTER_AREA)
        shrink = np.array(shrink)
        shrink = shrink.transpose((2, 0, 1))
        shrink = shrink.reshape(1, 3, 224, 224)
        shrink = (shrink / 255.0).astype(np.float32)
        images = torch.FloatTensor(shrink)
        images = Variable(images)
        images = images.unsqueeze(0)
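        # After the reshape and unsqueeze the tensor is (1, 1, 3, 224, 224);
        # the VQA model presumably expects (batch, num_frames, C, H, W).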

        # process images

        # answer question in vqa now
        # encoded_question already done

        scores, _ = vqa_model(images, encoded_question)
        scores = scores.data.numpy()
        scores = scores[0]
        answer_predict = np.where(scores == np.max(scores))
        answer_predict = answer_predict[0][0]
        if answer_predict == 0:
            print('--- Predict: Does not exist')
        elif answer_predict == 1:
            print('--- Predict: Exists')
        else:
            raise Exception('Prediction neither 0 nor 1')
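Both examples call `data2input(position, rgb_image, cnn)` to turn the current position and camera frame into planner inputs. Its implementation is not shown; the sketch below is only a guess at the preprocessing (resize, CNN feature extraction, position tensorization) based on the call sites, and the sizes and shapes are assumptions.

import cv2 as cv
import numpy as np
import torch
from torch.autograd import Variable


def data2input(position, rgb_image, cnn):
    # Hypothetical preprocessing, inferred from how the result is consumed.
    img = cv.resize(rgb_image, (224, 224), interpolation=cv.INTER_AREA)
    img = img.transpose((2, 0, 1)).reshape(1, 3, 224, 224) / 255.0
    img_var = Variable(torch.FloatTensor(img.astype(np.float32)))
    img_feats = cnn(img_var)                          # (1, feat_dim)
    planner_img_feats_var = img_feats.view(1, 1, -1)  # add a time dimension
    position_in = Variable(torch.FloatTensor([position])).unsqueeze(0)
    return position_in, planner_img_feats_var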
Example #3
File: data.py  Project: wooridle/EmbodiedQA
class EqaDataset(Dataset):
    def __init__(self,
                 questions_h5,
                 vocab,
                 num_frames=1,
                 data_json=False,
                 split='train',
                 gpu_id=0,
                 input_type='ques',
                 max_threads_per_gpu=10,
                 to_cache=False,
                 target_obj_conn_map_dir=False,
                 map_resolution=1000):
        self.questions_h5 = questions_h5
        self.vocab = load_vocab(vocab)
        self.num_frames = num_frames

        np.random.seed()

        self.data_json = data_json
        self.split = split
        self.gpu_id = gpu_id

        self.input_type = input_type

        self.max_threads_per_gpu = max_threads_per_gpu

        self.target_obj_conn_map_dir = target_obj_conn_map_dir
        self.map_resolution = map_resolution

        self.to_cache = to_cache
        self.img_data_cache = {}

        if self.data_json != False:
            data = json.load(open(self.data_json, 'r'))
            self.envs = data['envs']

            self.env_idx = data[self.split + '_env_idx']
            self.env_list = [self.envs[x] for x in self.env_idx]
            self.env_set = list(set(self.env_list))
            self.env_set.sort()

            print('Total envs: %d' % len(list(set(self.envs))))
            print('Envs in %s: %d' % (self.split,
                                      len(list(set(self.env_idx)))))

            if input_type != 'ques':
                '''
                If training, randomly sample and load a subset of environments,
                train on those, and then cycle through to load the rest.

                On the validation and test set, load in order, and cycle through.

                For both, add optional caching so that if all environments
                have been cycled through once, then no need to re-load and
                instead, just the cache can be used.
                '''

                self.api_threads = []
                self._load_envs(start_idx=0, in_order=True)

                cnn_kwargs = {'num_classes': 191, 'pretrained': True}
                self.cnn = MultitaskCNN(**cnn_kwargs)
                self.cnn.eval()
                self.cnn.cuda()

            self.pos_queue = data[self.split + '_pos_queue']
            self.boxes = data[self.split + '_boxes']

        print('Reading question data into memory')
        self.idx = _dataset_to_tensor(questions_h5['idx'])
        self.questions = _dataset_to_tensor(questions_h5['questions'])
        self.answers = _dataset_to_tensor(questions_h5['answers'])
        self.actions = _dataset_to_tensor(questions_h5['action_labels'])
        self.action_lengths = _dataset_to_tensor(
            questions_h5['action_lengths'])

        if input_type == 'pacman':

            self.planner_actions = self.actions.clone().fill_(0)
            self.controller_actions = self.actions.clone().fill_(-1)

            self.planner_action_lengths = self.action_lengths.clone().fill_(0)
            self.controller_action_lengths = self.action_lengths.clone().fill_(
                0)

            self.planner_hidden_idx = self.actions.clone().fill_(0)

            self.planner_pos_queue_idx, self.controller_pos_queue_idx = [], []

            # parsing flat actions to planner-controller hierarchy
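            # (In the planner-controller ("pacman") hierarchy the planner picks
            # a direction and the controller repeats it for several steps;
            # pq_idx / cq_idx presumably index the planner / controller frames
            # inside pos_queue, and ph_idx ties controller steps back to the
            # planner hidden state they were issued under.)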
            for i in tqdm(range(len(self.actions))):

                pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions(
                    self.actions[i][:self.action_lengths[i] + 1])

                self.planner_actions[i][:len(pa)] = torch.Tensor(pa)
                self.controller_actions[i][:len(ca)] = torch.Tensor(ca)

                self.planner_action_lengths[i] = len(pa)-1
                self.controller_action_lengths[i] = len(ca)

                self.planner_pos_queue_idx.append(pq_idx)
                self.controller_pos_queue_idx.append(cq_idx)

                self.planner_hidden_idx[i][:len(ca)] = torch.Tensor(ph_idx)

    def _pick_envs_to_load(self,
                           split='train',
                           max_envs=10,
                           start_idx=0,
                           in_order=False):
        if split in ['val', 'test'] or in_order == True:
            pruned_env_set = self.env_set[start_idx:start_idx + max_envs]
        else:
            if max_envs < len(self.env_set):
                env_inds = np.random.choice(
                    len(self.env_set), max_envs, replace=False)
            else:
                env_inds = np.random.choice(
                    len(self.env_set), max_envs, replace=True)
            pruned_env_set = [self.env_set[x] for x in env_inds]
        return pruned_env_set

    def _load_envs(self, start_idx=-1, in_order=False):
        if start_idx == -1:
            start_idx = self.env_set.index(self.pruned_env_set[-1]) + 1

        # Pick envs
        self.pruned_env_set = self._pick_envs_to_load(
            split=self.split,
            max_envs=self.max_threads_per_gpu,
            start_idx=start_idx,
            in_order=in_order)

        if len(self.pruned_env_set) == 0:
            return

        # Load api threads
        start = time.time()
        if len(self.api_threads) == 0:
            for i in range(len(self.pruned_env_set)):
                self.api_threads.append(
                    objrender.RenderAPIThread(
                        w=224, h=224, device=self.gpu_id))

        self.cfg = load_config('../House3D/tests/config.json')

        print('[%.02f] Loaded %d api threads' % (time.time() - start,
                                                 len(self.api_threads)))
        start = time.time()

        # Load houses
        from multiprocessing import Pool
        _args = ([h, self.cfg, self.map_resolution]
                 for h in self.pruned_env_set)
        with Pool(len(self.pruned_env_set)) as pool:
            self.all_houses = pool.starmap(local_create_house, _args)

        print('[%.02f] Loaded %d houses' % (time.time() - start,
                                            len(self.all_houses)))
        start = time.time()

        # Load envs
        self.env_loaded = {}
        for i in range(len(self.all_houses)):
            print('[%02d/%d][split:%s][gpu:%d][house:%s]' %
                  (i + 1, len(self.all_houses), self.split, self.gpu_id,
                   self.all_houses[i].house['id']))
            self.env_loaded[self.all_houses[i].house['id']] = House3DUtils(
                Environment(self.api_threads[i], self.all_houses[i], self.cfg),
                target_obj_conn_map_dir=self.target_obj_conn_map_dir,
                build_graph=False)

        # [TODO] Unused till now
        self.env_ptr = -1

        print('[%.02f] Loaded %d house3d envs' % (time.time() - start,
                                                  len(self.env_loaded)))

        # Mark available data indices
        self.available_idx = [
            i for i, v in enumerate(self.env_list) if v in self.env_loaded
        ]
        print('Available inds: %d' % len(self.available_idx))

        # Flag to check if loaded envs have been cycled through or not
        # [TODO] Unused till now
        self.all_envs_loaded = False

    def _clear_api_threads(self):
        for i in range(len(self.api_threads)):
            del self.api_threads[0]
        self.api_threads = []

    def _check_if_all_envs_loaded(self):
        print('[CHECK][Cache:%d][Total:%d]' % (len(self.img_data_cache),
                                               len(self.env_list)))
        if len(self.img_data_cache) == len(self.env_list):
            self.available_idx = [i for i, v in enumerate(self.env_list)]
            return True
        else:
            return False

    def set_camera(self, e, pos, robot_height=1.0):
        assert len(pos) == 4

        e.env.cam.pos.x = pos[0]
        e.env.cam.pos.y = robot_height
        e.env.cam.pos.z = pos[2]
        e.env.cam.yaw = pos[3]

        e.env.cam.updateDirection()

    def render(self, e):
        return e.env.render()

    def get_frames(self, e, pos_queue, preprocess=True):
        if isinstance(pos_queue, list) == False:
            pos_queue = [pos_queue]

        res = []
        for i in range(len(pos_queue)):
            self.set_camera(e, pos_queue[i])
            img = np.array(self.render(e), copy=False, dtype=np.float32)

            if preprocess == True:
                img = img.transpose(2, 0, 1)
                img = img / 255.0

            res.append(img)

        return np.array(res)

    def get_hierarchical_features_till_spawn(self, actions, backtrack_steps=0):

        action_length = len(actions)-1
        pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions(actions)

        target_pos_idx = action_length - backtrack_steps

        controller_step = True
        if target_pos_idx in pq_idx:
            controller_step = False

        pq_idx_pruned = [v for v in pq_idx if v <= target_pos_idx]
        pa_pruned = pa[:len(pq_idx_pruned)+1]

        images = self.get_frames(
            self.episode_house,
            self.episode_pos_queue,
            preprocess=True)
        raw_img_feats = self.cnn(
            Variable(torch.FloatTensor(images)
                     .cuda())).data.cpu().numpy().copy()

        controller_img_feat, controller_action_in = False, False
        if controller_step == True:
            controller_img_feat = torch.from_numpy(raw_img_feats[target_pos_idx].copy())
            controller_action_in = pa_pruned[-1] - 2

        planner_img_feats = torch.from_numpy(raw_img_feats[pq_idx_pruned].copy())
        planner_actions_in = torch.from_numpy(np.array(pa_pruned[:-1]) - 1)

        return planner_actions_in, planner_img_feats, controller_step, controller_action_in, controller_img_feat, self.episode_pos_queue[target_pos_idx]


    def __getitem__(self, index):
        # [VQA] question-only
        if self.input_type == 'ques':
            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            return (idx, question, answer)

        # [VQA] question+image
        elif self.input_type == 'ques,image':
            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            actions_in = actions[action_length - self.num_frames:action_length]
            actions_out = actions[action_length - self.num_frames + 1:
                                  action_length + 1]

            if self.to_cache == True and index in self.img_data_cache:
                images = self.img_data_cache[index]
            else:
                pos_queue = self.pos_queue[index][
                    -self.num_frames:]  # last 5 frames
                images = self.get_frames(
                    self.env_loaded[self.env_list[index]],
                    pos_queue,
                    preprocess=True)
                if self.to_cache == True:
                    self.img_data_cache[index] = images.copy()

            return (idx, question, answer, images, actions_in, actions_out,
                    action_length)

        # [NAV] question+cnn
        elif self.input_type in ['cnn', 'cnn+q']:

            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            if self.to_cache == True and index in self.img_data_cache:
                img_feats = self.img_data_cache[index]
            else:
                pos_queue = self.pos_queue[index]
                images = self.get_frames(
                    self.env_loaded[self.env_list[index]],
                    pos_queue,
                    preprocess=True)
                img_feats = self.cnn(
                    Variable(torch.FloatTensor(images)
                             .cuda())).data.cpu().numpy().copy()
                if self.to_cache == True:
                    self.img_data_cache[index] = img_feats

            # for val or test (evaluation), or
            # when target_obj_conn_map_dir is defined (reinforce),
            # load entire shortest path navigation trajectory
            # and load connectivity map for intermediate rewards
            if self.split in ['val', 'test'
                              ] or self.target_obj_conn_map_dir != False:
                target_obj_id, target_room = False, False
                bbox_obj = [
                    x for x in self.boxes[index]
                    if x['type'] == 'object' and x['target'] == True
                ][0]['box']
                for obj_id in self.env_loaded[self.env_list[index]].objects:
                    box2 = self.env_loaded[self.env_list[index]].objects[
                        obj_id]['bbox']
                    if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \
                        all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True:
                        target_obj_id = obj_id
                        break
                bbox_room = [
                    x for x in self.boxes[index]
                    if x['type'] == 'room' and x['target'] == False
                ][0]
                for room in self.env_loaded[self.env_list[
                        index]].env.house.all_rooms:
                    if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \
                        all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]):
                        target_room = room
                        break
                assert target_obj_id != False
                assert target_room != False
                self.env_loaded[self.env_list[index]].set_target_object(
                    self.env_loaded[self.env_list[index]].objects[
                        target_obj_id], target_room)

                # [NOTE] only works for batch size = 1
                self.episode_pos_queue = self.pos_queue[index]
                self.episode_house = self.env_loaded[self.env_list[index]]
                self.target_room = target_room
                self.target_obj = self.env_loaded[self.env_list[
                    index]].objects[target_obj_id]

                actions_in = actions[:action_length]
                actions_out = actions[1:action_length + 1] - 2

                return (idx, question, answer, img_feats, actions_in,
                        actions_out, action_length)

            # if action_length is n
            # images.shape[0] is also n
            # actions[0] is <START>
            # actions[n] is <END>

            # grab 5 random frames
            # [NOTE]: this'll break for longer-than-5 navigation sequences
            start_idx = np.random.choice(img_feats.shape[0] + 1 -
                                         self.num_frames)
            img_feats = img_feats[start_idx:start_idx + self.num_frames]

            actions_in = actions[start_idx:start_idx + self.num_frames]
            actions_out = actions[start_idx + self.num_frames] - 2

            return (idx, question, answer, img_feats, actions_in, actions_out,
                    action_length)

        # [NAV] question+lstm
        elif self.input_type in ['lstm', 'lstm+q']:

            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            if self.split == 'train':
                if self.to_cache == True and index in self.img_data_cache:
                    img_feats = self.img_data_cache[index]
                else:
                    pos_queue = self.pos_queue[index]
                    images = self.get_frames(
                        self.env_loaded[self.env_list[index]],
                        pos_queue,
                        preprocess=True)
                    raw_img_feats = self.cnn(
                        Variable(torch.FloatTensor(images)
                                 .cuda())).data.cpu().numpy().copy()
                    img_feats = np.zeros(
                        (self.actions.shape[1], raw_img_feats.shape[1]),
                        dtype=np.float32)
                    img_feats[:raw_img_feats.shape[
                        0], :] = raw_img_feats.copy()
                    if self.to_cache == True:
                        self.img_data_cache[index] = img_feats

            actions_in = actions.clone() - 1
            actions_out = actions[1:].clone() - 2

            actions_in[action_length:].fill_(0)
            mask = actions_out.clone().gt(-1)
            if len(actions_out) > action_length:
                actions_out[action_length:].fill_(0)

            # for val or test (evaluation), or
            # when target_obj_conn_map_dir is defined (reinforce),
            # load entire shortest path navigation trajectory
            # and load connectivity map for intermediate rewards
            if self.split in ['val', 'test'
                              ] or self.target_obj_conn_map_dir != False:
                target_obj_id, target_room = False, False
                bbox_obj = [
                    x for x in self.boxes[index]
                    if x['type'] == 'object' and x['target'] == True
                ][0]['box']
                for obj_id in self.env_loaded[self.env_list[index]].objects:
                    box2 = self.env_loaded[self.env_list[index]].objects[
                        obj_id]['bbox']
                    if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \
                        all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True:
                        target_obj_id = obj_id
                        break
                bbox_room = [
                    x for x in self.boxes[index]
                    if x['type'] == 'room' and x['target'] == False
                ][0]
                for room in self.env_loaded[self.env_list[
                        index]].env.house.all_rooms:
                    if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \
                        all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]):
                        target_room = room
                        break
                assert target_obj_id != False
                assert target_room != False
                self.env_loaded[self.env_list[index]].set_target_object(
                    self.env_loaded[self.env_list[index]].objects[
                        target_obj_id], target_room)

                # [NOTE] only works for batch size = 1
                self.episode_pos_queue = self.pos_queue[index]
                self.episode_house = self.env_loaded[self.env_list[index]]
                self.target_room = target_room
                self.target_obj = self.env_loaded[self.env_list[
                    index]].objects[target_obj_id]

                return (idx, question, answer, False, actions_in, actions_out,
                        action_length, mask)

            return (idx, question, answer, img_feats, actions_in, actions_out,
                    action_length, mask)

        # [NAV] planner-controller
        elif self.input_type in ['pacman']:

            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            planner_actions = self.planner_actions[index]
            controller_actions = self.controller_actions[index]

            planner_action_length = self.planner_action_lengths[index]
            controller_action_length = self.controller_action_lengths[index]

            planner_hidden_idx = self.planner_hidden_idx[index]

            if self.split == 'train':
                if self.to_cache == True and index in self.img_data_cache:
                    img_feats = self.img_data_cache[index]
                else:
                    pos_queue = self.pos_queue[index]
                    images = self.get_frames(
                        self.env_loaded[self.env_list[index]],
                        pos_queue,
                        preprocess=True)
                    raw_img_feats = self.cnn(
                        Variable(torch.FloatTensor(images)
                                 .cuda())).data.cpu().numpy().copy()
                    img_feats = np.zeros(
                        (self.actions.shape[1], raw_img_feats.shape[1]),
                        dtype=np.float32)
                    img_feats[:raw_img_feats.shape[
                        0], :] = raw_img_feats.copy()
                    if self.to_cache == True:
                        self.img_data_cache[index] = img_feats

            if self.split in ['val', 'test'
                              ] or self.target_obj_conn_map_dir != False:
                target_obj_id, target_room = False, False
                bbox_obj = [
                    x for x in self.boxes[index]
                    if x['type'] == 'object' and x['target'] == True
                ][0]['box']
                for obj_id in self.env_loaded[self.env_list[index]].objects:
                    box2 = self.env_loaded[self.env_list[index]].objects[
                        obj_id]['bbox']
                    if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \
                        all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True:
                        target_obj_id = obj_id
                        break
                bbox_room = [
                    x for x in self.boxes[index]
                    if x['type'] == 'room' and x['target'] == False
                ][0]
                for room in self.env_loaded[self.env_list[
                        index]].env.house.all_rooms:
                    if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \
                        all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]):
                        target_room = room
                        break
                assert target_obj_id != False
                assert target_room != False
                self.env_loaded[self.env_list[index]].set_target_object(
                    self.env_loaded[self.env_list[index]].objects[
                        target_obj_id], target_room)

                # [NOTE] only works for batch size = 1
                self.episode_pos_queue = self.pos_queue[index]
                self.episode_house = self.env_loaded[self.env_list[index]]
                self.target_room = target_room
                self.target_obj = self.env_loaded[self.env_list[
                    index]].objects[target_obj_id]

                return (idx, question, answer, actions, action_length)

            planner_pos_queue_idx = self.planner_pos_queue_idx[index]
            controller_pos_queue_idx = self.controller_pos_queue_idx[index]

            planner_img_feats = np.zeros(
                (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32)
            planner_img_feats[:planner_action_length] = img_feats[
                planner_pos_queue_idx]

            planner_actions_in = planner_actions.clone() - 1
            planner_actions_out = planner_actions[1:].clone() - 2

            planner_actions_in[planner_action_length:].fill_(0)
            planner_mask = planner_actions_out.clone().gt(-1)
            if len(planner_actions_out) > planner_action_length:
                planner_actions_out[planner_action_length:].fill_(0)

            controller_img_feats = np.zeros(
                (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32)
            controller_img_feats[:controller_action_length] = img_feats[
                controller_pos_queue_idx]

            controller_actions_in = actions[1:].clone() - 2
            if len(controller_actions_in) > controller_action_length:
                controller_actions_in[controller_action_length:].fill_(0)

            controller_out = controller_actions
            controller_mask = controller_out.clone().gt(-1)
            if len(controller_out) > controller_action_length:
                controller_out[controller_action_length:].fill_(0)

            return (idx, question, answer, planner_img_feats,
                    planner_actions_in, planner_actions_out,
                    planner_action_length, planner_mask, controller_img_feats,
                    controller_actions_in, planner_hidden_idx, controller_out,
                    controller_action_length, controller_mask)

    def __len__(self):
        if self.input_type == 'ques':
            return len(self.questions)
        else:
            return len(self.available_idx)
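A minimal sketch of instantiating this dataset in the question-only setting (the file paths are hypothetical; with input_type='ques' and no data_json, none of the House3D environment loading above is triggered):

import h5py
from torch.utils.data import DataLoader

# Hypothetical paths -- substitute the real preprocessed EQA files.
questions_h5 = h5py.File('data/train_questions.h5', 'r')
dataset = EqaDataset(questions_h5,
                     vocab='data/vocab.json',
                     num_frames=1,
                     split='train',
                     input_type='ques')
loader = DataLoader(dataset, batch_size=20, shuffle=True)
for idx, question, answer in loader:
    pass  # feed (question, answer) batches to a VQA model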
Example #4
class EqaDataset(Dataset):
    def __init__(self,
                 questions_h5,
                 vocab,
                 num_frames=1,
                 data_json=False,
                 split='train',
                 gpu_id=0,
                 input_type='ques',
                 max_threads_per_gpu=10,
                 to_cache=False,
                 target_obj_conn_map_dir=False,
                 map_resolution=1000,
                 overfit=False,
                 max_controller_actions=5,
                 max_actions=None):

        self.questions_h5 = questions_h5
        self.vocab = load_vocab(vocab)
        self.num_frames = num_frames
        self.max_controller_actions = max_controller_actions

        np.random.seed()

        self.data_json = data_json
        self.split = split
        self.gpu_id = gpu_id

        self.input_type = input_type

        self.max_threads_per_gpu = max_threads_per_gpu

        self.target_obj_conn_map_dir = target_obj_conn_map_dir
        self.map_resolution = map_resolution
        self.overfit = overfit

        self.to_cache = to_cache
        self.img_data_cache = {}

        print('Reading question data into memory from', questions_h5)
        self.idx = _dataset_to_tensor(questions_h5['idx'])
        self.questions = _dataset_to_tensor(questions_h5['questions'])
        self.answers = _dataset_to_tensor(questions_h5['answers'])
        self.actions = _dataset_to_tensor(questions_h5['action_labels'])
        self.action_lengths = _dataset_to_tensor(
            questions_h5['action_lengths'])
        print('... finished running dataset_to_tensor operations from',
              questions_h5)

        if max_actions:  #max actions will allow us to create arrays of a certain length.  Helpful if you only want to train with 10 actions.
            print('... entering max_actions conditions block from',
                  questions_h5)
            assert isinstance(max_actions, int)
            num_data_items = self.actions.shape[0]
            new_actions = np.zeros((num_data_items, max_actions + 2),
                                   dtype=np.int64)
            new_lengths = np.ones(
                (num_data_items, ), dtype=np.int64) * max_actions
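            # Keep only the last `max_actions` steps of each trajectory and
            # re-insert what appears to be the <START> token (value 1) at
            # position 0.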
            for i in range(num_data_items):
                action_length = int(self.action_lengths[i])
                new_actions[i, 0] = 1
                new_actions[i, 1:max_actions + 1] = self.actions[
                    i, action_length - max_actions:action_length].numpy()
            self.actions = torch.LongTensor(new_actions)
            self.action_lengths = torch.LongTensor(new_lengths)

            print('... finished running max_actions conditions block from',
                  questions_h5)

        if self.data_json != False:
            print('... entering data_json condition block from',
                  questions_h5)
            data = json.load(open(self.data_json, 'r'))
            self.envs = data['envs']

            self.env_idx = data[self.split + '_env_idx']
            self.env_list = [self.envs[x] for x in self.env_idx]
            self.env_set = list(set(self.env_list))
            self.env_set.sort()

            if self.overfit:
                self.env_idx = self.env_idx[:1]
                self.env_set = self.env_list = [
                    self.envs[x] for x in self.env_idx
                ]
                print('Trying to overfit to [house %s]' % self.env_set[0])
                logging.info('Trying to overfit to [house {}]'.format(
                    self.env_set[0]))

            print(questions_h5, 'Total envs: %d' % len(list(set(self.envs))))
            print(
                questions_h5,
                'Envs in %s: %d' % (self.split, len(list(set(self.env_idx)))))

            if input_type != 'ques':
                '''
                If training, randomly sample and load a subset of environments,
                train on those, and then cycle through to load the rest.

                On the validation and test set, load in order, and cycle through.

                For both, add optional caching so that if all environments
                have been cycled through once, then no need to re-load and
                instead, just the cache can be used.
                '''

                self.api_threads = []
                self._load_envs(start_idx=0, in_order=True)

                cnn_kwargs = {'num_classes': 191, 'pretrained': True}
                self.cnn = MultitaskCNN(**cnn_kwargs)
                self.cnn.eval()
                self.cnn.cuda()

            self.pos_queue = data[self.split + '_pos_queue']
            self.boxes = data[self.split + '_boxes']

            if max_actions:
                for i in range(len(self.pos_queue)):
                    self.pos_queue[i] = self.pos_queue[i][-1 * max_actions:]

            print('... finished running data_json condition block from',
                  questions_h5)

        if input_type == 'pacman':
            print('... entering input_type pacman condition block from',
                  questions_h5)

            self.planner_actions = self.actions.clone().fill_(0)
            self.controller_actions = self.actions.clone().fill_(-1)

            self.planner_action_lengths = self.action_lengths.clone().fill_(0)
            self.controller_action_lengths = self.action_lengths.clone().fill_(
                0)

            self.planner_hidden_idx = self.actions.clone().fill_(0)

            self.planner_pos_queue_idx, self.controller_pos_queue_idx = [], []

            # parsing flat actions to planner-controller hierarchy
            for i in tqdm(range(len(self.actions))):

                pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions(
                    actions=self.actions[i][:self.action_lengths[i] + 1],
                    controller_action_lim=max_controller_actions)

                self.planner_actions[i][:len(pa)] = torch.Tensor(pa)
                self.controller_actions[i][:len(ca)] = torch.Tensor(ca)

                self.planner_action_lengths[i] = len(pa) - 1
                self.controller_action_lengths[i] = len(ca)

                self.planner_pos_queue_idx.append(pq_idx)
                self.controller_pos_queue_idx.append(cq_idx)

                self.planner_hidden_idx[i][:len(ca)] = torch.Tensor(ph_idx)

            print(
                '... finished running input_type pacman condition block from',
                questions_h5)

        print('... finished instantiating EqaDataset from', questions_h5)

    def _pick_envs_to_load(self,
                           split='train',
                           max_envs=10,
                           start_idx=0,
                           in_order=False):
        if split in ['val', 'test'] or in_order == True:
            pruned_env_set = self.env_set[start_idx:start_idx + max_envs]
        else:
            if max_envs < len(self.env_set):
                env_inds = np.random.choice(len(self.env_set),
                                            max_envs,
                                            replace=False)
            else:
                env_inds = np.random.choice(len(self.env_set),
                                            max_envs,
                                            replace=True)
            pruned_env_set = [self.env_set[x] for x in env_inds]
        return pruned_env_set

    def _load_envs(self, start_idx=-1, in_order=False):
        #self._clear_memory()
        if start_idx == -1:
            start_idx = self.env_set.index(self.pruned_env_set[-1]) + 1

        # Pick envs
        self.pruned_env_set = self._pick_envs_to_load(
            split=self.split,
            max_envs=self.max_threads_per_gpu,
            start_idx=start_idx,
            in_order=in_order)

        if len(self.pruned_env_set) == 0:
            return

        # Load api threads
        start = time.time()
        if len(self.api_threads) == 0:
            for i in range(self.max_threads_per_gpu):
                self.api_threads.append(
                    objrender.RenderAPIThread(w=224, h=224,
                                              device=self.gpu_id))

        self.cfg = load_config('../../House3D/tests/config.json')

        print('[%.02f] Loaded %d api threads' %
              (time.time() - start, len(self.api_threads)))
        start = time.time()

        # Load houses
        from multiprocessing import Pool
        _args = ([h, self.cfg, self.map_resolution]
                 for h in self.pruned_env_set)
        with Pool(len(self.pruned_env_set)) as pool:
            self.all_houses = pool.starmap(local_create_house, _args)

        print('[%.02f] Loaded %d houses' %
              (time.time() - start, len(self.all_houses)))
        start = time.time()

        # Load envs
        self.env_loaded = {}
        for i in range(len(self.all_houses)):
            print('[%02d/%d][split:%s][gpu:%d][house:%s]' %
                  (i + 1, len(self.all_houses), self.split, self.gpu_id,
                   self.all_houses[i].house['id']))
            environment = Environment(self.api_threads[i], self.all_houses[i],
                                      self.cfg)
            self.env_loaded[self.all_houses[i].house['id']] = House3DUtils(
                environment,
                target_obj_conn_map_dir=self.target_obj_conn_map_dir,
                build_graph=False)

        # [TODO] Unused till now
        self.env_ptr = -1

        print('[%.02f] Loaded %d house3d envs' %
              (time.time() - start, len(self.env_loaded)))

        # Mark available data indices
        self.available_idx = [
            i for i, v in enumerate(self.env_list) if v in self.env_loaded
        ]

        # [TODO] only keeping legit sequences
        # needed for things to play well with old data
        temp_available_idx = self.available_idx.copy()
        for i in range(len(temp_available_idx)):
            if self.action_lengths[temp_available_idx[i]] < 5:
                self.available_idx.remove(temp_available_idx[i])

        print('Available inds: %d' % len(self.available_idx))

        # Flag to check if loaded envs have been cycled through or not
        # [TODO] Unused till now
        self.all_envs_loaded = False

    def _clear_api_threads(self):
        for i in range(len(self.api_threads)):
            del self.api_threads[0]
        self.api_threads = []

    def _clear_memory(self):
        if hasattr(self, 'episode_house'):
            del self.episode_house
        if hasattr(self, 'env_loaded'):
            del self.env_loaded
        if hasattr(self, 'api_threads'):
            del self.api_threads
        self.api_threads = []

    def _check_if_all_envs_loaded(self):
        print('[CHECK][Cache:%d][Total:%d]' %
              (len(self.img_data_cache), len(self.env_list)))
        if len(self.img_data_cache) == len(self.env_list):
            self.available_idx = [i for i, v in enumerate(self.env_list)]
            return True
        else:
            return False

    def set_camera(self, e, pos, robot_height=1.0):
        assert len(pos) == 4

        e.env.cam.pos.x = pos[0]
        e.env.cam.pos.y = robot_height
        e.env.cam.pos.z = pos[2]
        e.env.cam.yaw = pos[3]

        e.env.cam.updateDirection()

    def render(self, e):
        return e.env.render()

    def get_frames(self, e, pos_queue, preprocess=True):
        if isinstance(pos_queue, list) == False:
            pos_queue = [pos_queue]

        res = []
        for i in range(len(pos_queue)):
            self.set_camera(e, pos_queue[i])
            img = np.array(self.render(e), copy=False, dtype=np.float32)

            if preprocess == True:
                img = img.transpose(2, 0, 1)
                img = img / 255.0

            res.append(img)

        return np.array(res)

    def get_hierarchical_features_till_spawn(self,
                                             actions,
                                             backtrack_steps=0,
                                             max_controller_actions=5):

        action_length = len(actions) - 1
        pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions(
            actions=actions, controller_action_lim=max_controller_actions)

        # count how many actions of the same type have been encountered before starting navigation
        backtrack_controller_steps = actions[1:action_length -
                                             backtrack_steps + 1:][::-1]
        counter = 0
        # Removed the try/except here to try to tease out pdb-related errors in Abhishek's code that are
        # also firing in other parts of training for me.
        # try:
        if len(backtrack_controller_steps) > 0:
            # Edited condition: changed counter <= len(backtrack_controller_steps) to a strict < to avoid an
            # out-of-bounds error in the following loop; unsure what cascading problems that might cause since
            # I don't know the downstream logic for how counter is used, but the loop as written was asking for
            # an out-of-bounds error and getting it. I also reversed the order of the conditions so that the
            # index check comes -after- the verification that counter is within bounds, since otherwise it
            # doesn't fire until after the out-of-bounds error has happened (though, again, maybe this will cause
            # downstream issues if counter is supposed to be allowed to reach len(backtrack_controller_steps) + 1,
            # which is now higher than it can reach).
            while ((counter <= self.max_controller_actions)
                   and (counter < len(backtrack_controller_steps))
                   and (backtrack_controller_steps[counter]
                        == backtrack_controller_steps[0])):
                counter += 1
        # except:
        # import pdb;
        # pdb.set_trace()  # If you hit a breakpoint here, you probably found an error in the logic above for computing the correct counter step. Still working on this and checking.

        target_pos_idx = action_length - backtrack_steps

        controller_step = True
        if target_pos_idx in pq_idx:
            controller_step = False

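        # target_pos_idx is the frame the agent is re-spawned at: if it
        # coincides with a planner position the spawn starts a fresh planner
        # step, otherwise it apparently lands mid-way through a controller phase.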
        pq_idx_pruned = [v for v in pq_idx if v <= target_pos_idx]
        pa_pruned = pa[:len(pq_idx_pruned) + 1]

        images = self.get_frames(self.episode_house,
                                 self.episode_pos_queue,
                                 preprocess=True)
        raw_img_feats = self.cnn(Variable(
            torch.FloatTensor(images).cuda())).data.cpu().numpy().copy()

        controller_img_feat = torch.from_numpy(
            raw_img_feats[target_pos_idx].copy())
        controller_action_in = pa_pruned[-1] - 2

        planner_img_feats = torch.from_numpy(
            raw_img_feats[pq_idx_pruned].copy())
        planner_actions_in = torch.from_numpy(np.array(pa_pruned[:-1]) - 1)

        return planner_actions_in, planner_img_feats, controller_step, controller_action_in, \
            controller_img_feat, self.episode_pos_queue[target_pos_idx], counter

    def __getitem__(self, index):
        # [VQA] question-only
        if self.input_type == 'ques':
            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            return (idx, question, answer)

        # [VQA] question+image
        elif self.input_type == 'ques,image':
            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            actions_in = actions[action_length - self.num_frames:action_length]
            actions_out = actions[action_length - self.num_frames +
                                  1:action_length + 1]

            if self.to_cache == True and index in self.img_data_cache:
                images = self.img_data_cache[index]
            else:
                pos_queue = self.pos_queue[index][
                    -self.num_frames:]  # last 5 frames
                images = self.get_frames(self.env_loaded[self.env_list[index]],
                                         pos_queue,
                                         preprocess=True)
                if self.to_cache == True:
                    self.img_data_cache[index] = images.copy()

            return (idx, question, answer, images, actions_in, actions_out,
                    action_length)

        # [NAV] question+cnn
        elif self.input_type in ['cnn', 'cnn+q']:

            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            if self.to_cache == True and index in self.img_data_cache:
                img_feats = self.img_data_cache[index]
            else:
                pos_queue = self.pos_queue[index]
                images = self.get_frames(self.env_loaded[self.env_list[index]],
                                         pos_queue,
                                         preprocess=True)
                img_feats = self.cnn(Variable(
                    torch.FloatTensor(
                        images).cuda())).data.cpu().numpy().copy()
                if self.to_cache == True:
                    self.img_data_cache[index] = img_feats

            # for val or test (evaluation), or
            # when target_obj_conn_map_dir is defined (reinforce),
            # load entire shortest path navigation trajectory
            # and load connectivity map for intermediate rewards
            if self.split in ['val', 'test'
                              ] or self.target_obj_conn_map_dir != False:
                target_obj_id, target_room = False, False
                bbox_obj = [
                    x for x in self.boxes[index]
                    if x['type'] == 'object' and x['target'] == True
                ][0]['box']
                for obj_id in self.env_loaded[self.env_list[index]].objects:
                    box2 = self.env_loaded[
                        self.env_list[index]].objects[obj_id]['bbox']
                    if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \
                        all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True:
                        target_obj_id = obj_id
                        break
                bbox_room = [
                    x for x in self.boxes[index]
                    if x['type'] == 'room' and x['target'] == False
                ][0]
                for room in self.env_loaded[
                        self.env_list[index]].env.house.all_rooms:
                    if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \
                        all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]):
                        target_room = room
                        break
                assert target_obj_id != False
                assert target_room != False
                self.env_loaded[self.env_list[index]].set_target_object(
                    self.env_loaded[
                        self.env_list[index]].objects[target_obj_id],
                    target_room)

                # [NOTE] only works for batch size = 1
                self.episode_pos_queue = self.pos_queue[index]
                self.episode_house = self.env_loaded[self.env_list[index]]
                self.target_room = target_room
                self.target_obj = self.env_loaded[
                    self.env_list[index]].objects[target_obj_id]

                actions_in = actions[:action_length]
                actions_out = actions[1:action_length + 1] - 2

                return (idx, question, answer, img_feats, actions_in,
                        actions_out, action_length)

            # if action_length is n
            # images.shape[0] is also n
            # actions[0] is <START>
            # actions[n] is <END>

            # grab 5 random frames
            # [NOTE]: this'll break for longer-than-5 navigation sequences
            start_idx = np.random.choice(img_feats.shape[0] + 1 -
                                         self.num_frames)
            img_feats = img_feats[start_idx:start_idx + self.num_frames]

            actions_in = actions[start_idx:start_idx + self.num_frames]
            actions_out = actions[start_idx + self.num_frames] - 2
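            # a hedged worked example of the sampling above (illustrative only):
            # with num_frames = 5 and img_feats.shape[0] = 8, start_idx is drawn
            # uniformly from {0, 1, 2, 3} (np.random.choice(8 + 1 - 5)), the slice
            # keeps frames start_idx..start_idx + 4, and actions_out is the single
            # action token at position start_idx + 5, shifted by -2.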

            return (idx, question, answer, img_feats, actions_in, actions_out,
                    action_length)

        # [NAV] question+lstm
        elif self.input_type in ['lstm', 'lstm+q']:

            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            if self.split == 'train':
                if self.to_cache == True and index in self.img_data_cache:
                    img_feats = self.img_data_cache[index]
                else:
                    pos_queue = self.pos_queue[index]
                    images = self.get_frames(
                        self.env_loaded[self.env_list[index]],
                        pos_queue,
                        preprocess=True)
                    raw_img_feats = self.cnn(
                        Variable(torch.FloatTensor(
                            images).cuda())).data.cpu().numpy().copy()
                    img_feats = np.zeros(
                        (self.actions.shape[1], raw_img_feats.shape[1]),
                        dtype=np.float32)
                    img_feats[:raw_img_feats.shape[0], :] = raw_img_feats.copy(
                    )
                    if self.to_cache == True:
                        self.img_data_cache[index] = img_feats

            actions_in = actions.clone() - 1
            actions_out = actions[1:].clone() - 2

            actions_in[action_length:].fill_(0)
            mask = actions_out.clone().gt(-1)
            if len(actions_out) > action_length:
                actions_out[action_length:].fill_(0)

            # for val or test (evaluation), or
            # when target_obj_conn_map_dir is defined (reinforce),
            # load entire shortest path navigation trajectory
            # and load connectivity map for intermediate rewards
            if self.split in ['val', 'test'
                              ] or self.target_obj_conn_map_dir != False:
                target_obj_id, target_room = False, False
                bbox_obj = [
                    x for x in self.boxes[index]
                    if x['type'] == 'object' and x['target'] == True
                ][0]['box']
                for obj_id in self.env_loaded[self.env_list[index]].objects:
                    box2 = self.env_loaded[
                        self.env_list[index]].objects[obj_id]['bbox']
                    if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \
                        all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True:
                        target_obj_id = obj_id
                        break
                bbox_room = [
                    x for x in self.boxes[index]
                    if x['type'] == 'room' and x['target'] == False
                ][0]
                for room in self.env_loaded[
                        self.env_list[index]].env.house.all_rooms:
                    if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \
                        all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]):
                        target_room = room
                        break
                assert target_obj_id != False
                assert target_room != False
                self.env_loaded[self.env_list[index]].set_target_object(
                    self.env_loaded[
                        self.env_list[index]].objects[target_obj_id],
                    target_room)

                # [NOTE] only works for batch size = 1
                self.episode_pos_queue = self.pos_queue[index]
                self.episode_house = self.env_loaded[self.env_list[index]]
                self.target_room = target_room
                self.target_obj = self.env_loaded[
                    self.env_list[index]].objects[target_obj_id]

                return (idx, question, answer, False, actions_in, actions_out,
                        action_length, mask)

            return (idx, question, answer, img_feats, actions_in, actions_out,
                    action_length, mask)

        # [NAV] planner-controller
        elif self.input_type in ['pacman']:

            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            planner_actions = self.planner_actions[index]
            controller_actions = self.controller_actions[index]

            planner_action_length = self.planner_action_lengths[index]
            controller_action_length = self.controller_action_lengths[index]

            planner_hidden_idx = self.planner_hidden_idx[index]

            if self.split == 'train':
                if self.to_cache == True and index in self.img_data_cache:
                    img_feats = self.img_data_cache[index]
                else:
                    pos_queue = self.pos_queue[index]
                    images = self.get_frames(
                        self.env_loaded[self.env_list[index]],
                        pos_queue,
                        preprocess=True)
                    raw_img_feats = self.cnn(
                        Variable(torch.FloatTensor(
                            images).cuda())).data.cpu().numpy().copy()
                    img_feats = np.zeros(
                        (self.actions.shape[1], raw_img_feats.shape[1]),
                        dtype=np.float32)
                    img_feats[:raw_img_feats.shape[0], :] = raw_img_feats.copy(
                    )
                    if self.to_cache == True:
                        self.img_data_cache[index] = img_feats

            if self.split in ['val', 'test'
                              ] or self.target_obj_conn_map_dir != False:
                target_obj_id, target_room = False, False
                bbox_obj = [
                    x for x in self.boxes[index]
                    if x['type'] == 'object' and x['target'] == True
                ][0]['box']
                for obj_id in self.env_loaded[self.env_list[index]].objects:
                    box2 = self.env_loaded[
                        self.env_list[index]].objects[obj_id]['bbox']
                    if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \
                        all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True:
                        target_obj_id = obj_id
                        break
                bbox_room = [
                    x for x in self.boxes[index]
                    if x['type'] == 'room' and x['target'] == False
                ][0]
                for room in self.env_loaded[
                        self.env_list[index]].env.house.all_rooms:
                    if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \
                        all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]):
                        target_room = room
                        break
                if target_obj_id == False or target_room == False:
                    return None
                self.env_loaded[self.env_list[index]].set_target_object(
                    self.env_loaded[
                        self.env_list[index]].objects[target_obj_id],
                    target_room)

                # [NOTE] only works for batch size = 1
                self.episode_pos_queue = self.pos_queue[index]
                self.episode_house = self.env_loaded[self.env_list[index]]
                self.target_room = target_room
                self.target_obj = self.env_loaded[
                    self.env_list[index]].objects[target_obj_id]

                return (idx, question, answer, actions, action_length)

            planner_pos_queue_idx = self.planner_pos_queue_idx[index]
            controller_pos_queue_idx = self.controller_pos_queue_idx[index]

            planner_img_feats = np.zeros(
                (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32)
            planner_img_feats[:planner_action_length] = img_feats[
                planner_pos_queue_idx]

            planner_actions_in = planner_actions.clone() - 1
            planner_actions_out = planner_actions[1:].clone() - 2

            planner_actions_in[planner_action_length:].fill_(0)
            planner_mask = planner_actions_out.clone().gt(-1)
            if len(planner_actions_out) > planner_action_length:
                planner_actions_out[planner_action_length:].fill_(0)

            controller_img_feats = np.zeros(
                (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32)
            controller_img_feats[:controller_action_length] = img_feats[
                controller_pos_queue_idx]

            controller_actions_in = actions[1:].clone() - 2
            if len(controller_actions_in) > controller_action_length:
                controller_actions_in[controller_action_length:].fill_(0)

            controller_out = controller_actions
            controller_mask = controller_out.clone().gt(-1)
            if len(controller_out) > controller_action_length:
                controller_out[controller_action_length:].fill_(0)

            # zero out forced controller return
            for i in range(controller_action_length):
                if i >= self.max_controller_actions - 1 and controller_out[i] == 0 and \
                        (self.max_controller_actions == 1 or
                         controller_out[i - self.max_controller_actions + 1:i].sum()
                         == self.max_controller_actions - 1):
                    controller_mask[i] = 0
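            # a hedged illustration of the masking above (assuming max_controller_actions = 5):
            # if controller_out = [1, 1, 1, 1, 0, ...], the 0 at i = 4 is forced
            # (the previous four steps were all 1s), so controller_mask[4] is zeroed
            # and that position does not contribute to the loss.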

            return (idx, question, answer, planner_img_feats,
                    planner_actions_in, planner_actions_out,
                    planner_action_length, planner_mask, controller_img_feats,
                    controller_actions_in, planner_hidden_idx, controller_out,
                    controller_action_length, controller_mask)

    def __len__(self):
        if self.input_type == 'ques':
            return len(self.questions)
        else:
            return len(self.available_idx)
Example #5
class EqaDataset(Dataset):
    def __init__(self,
                 questions_h5,
                 vocab,
                 num_frames=1,
                 data_json=False,
                 split='train',
                 gpu_id=0,
                 input_type='ques',
                 max_threads_per_gpu=10,
                 to_cache=False,
                 target_obj_conn_map_dir=False,
                 map_resolution=1000,
                 overfit=False,
                 max_controller_actions=5,
                 max_actions=None):

        self.questions_h5 = questions_h5
        self.vocab = load_vocab(vocab)
        self.num_frames = num_frames
        self.max_controller_actions = max_controller_actions

        np.random.seed()

        self.data_json = data_json
        self.split = split
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.gpu_id = gpu_id

        self.input_type = input_type

        self.max_threads_per_gpu = max_threads_per_gpu

        self.target_obj_conn_map_dir = target_obj_conn_map_dir
        self.map_resolution = map_resolution
        self.overfit = overfit

        self.to_cache = to_cache
        self.img_data_cache = {}

        print('Reading question data into memory')
        # self.idx -> Object ID
        self.idx = _dataset_to_tensor(questions_h5['idx'])
        self.questions = _dataset_to_tensor(questions_h5['questions'])
        self.answers = _dataset_to_tensor(questions_h5['answers'])
        self.actions = _dataset_to_tensor(questions_h5['action_labels'])
        self.action_lengths = _dataset_to_tensor(
            questions_h5['action_lengths'])
        self.cfg = load_config('../../House3D/tests/config.json')

        # Saty: max_actions is None!
        if max_actions:  # max actions will allow us to create arrays of a certain length.  Helpful if you only want to train with 10 actions.
            assert isinstance(max_actions, int)
            num_data_items = self.actions.shape[0]
            new_actions = np.zeros(
                (num_data_items, max_actions + 2),
                dtype=np.int64)  #Saty: WHY +2? -> for <start> and <end>
            new_lengths = np.ones(
                (num_data_items, ), dtype=np.int64) * max_actions

            for i in range(num_data_items):
                action_length = int(self.action_lengths[i])
                new_actions[i, 0] = 1
                new_actions[i, 1:max_actions + 1] = self.actions[
                    i, action_length - max_actions:action_length].numpy()
            self.actions = torch.LongTensor(new_actions)
            self.action_lengths = torch.LongTensor(new_lengths)
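        # a hedged worked example of the trimming above: with max_actions = 10 and an
        # episode whose action_length is 23, row i of the new array has width 12
        # (max_actions + 2) and becomes
        #   [1 (<start>), actions[i, 13], ..., actions[i, 22], 0]
        # while new_lengths fixes every action_length to max_actions (= 10).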

        if self.data_json != False:
            data = json.load(open(self.data_json, 'r'))
            self.envs = data['envs']  #Satyen: Gold mine!

            self.env_idx = data[self.split + '_env_idx']
            self.env_list = [self.envs[x] for x in self.env_idx]
            self.env_set = list(set(self.env_list))
            self.env_set.sort()

            if self.overfit == True:
                self.env_idx = self.env_idx[:1]
                self.env_set = self.env_list = [
                    self.envs[x] for x in self.env_idx
                ]
                print('Trying to overfit to [house %s]' % self.env_set[0])
                logging.info('Trying to overfit to [house {}]'.format(
                    self.env_set[0]))

            print('Total envs: %d' % len(list(set(self.envs))))
            print('Envs in %s: %d' %
                  (self.split, len(list(set(self.env_idx)))))

            if input_type != 'ques':
                '''
                If training, randomly sample and load a subset of environments,
                train on those, and then cycle through to load the rest.

                On the validation and test set, load in order, and cycle through.

                For both, add optional caching so that if all environments
                have been cycled through once, then no need to re-load and
                instead, just the cache can be used.
                '''
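                # a hedged sketch of the intended usage described above (not part of this class):
                # the training loop is expected to call dataset._load_envs() again once the
                # currently loaded environments have been consumed, e.g.
                #   if done_with_current_envs:             # hypothetical flag in the caller
                #       dataset._load_envs(in_order=True)  # cycles to the next batch of houses
                # and _check_if_all_envs_loaded() can be used to detect when the cache makes
                # further reloads unnecessary.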

                self.api_threads = []
                self._load_envs(start_idx=0, in_order=True)

                cnn_kwargs = {'num_classes': 191, 'pretrained': True}
                self.cnn = MultitaskCNN(**cnn_kwargs)
                self.cnn.eval()
                self.cnn.to(self.device)

            self.pos_queue = data[self.split + '_pos_queue']
            self.boxes = data[self.split + '_boxes']

            if max_actions:
                for i in range(len(self.pos_queue)):
                    self.pos_queue[i] = self.pos_queue[i][-1 * max_actions:]

        if input_type == 'pacman':

            self.planner_actions = self.actions.clone().fill_(0)
            self.controller_actions = self.actions.clone().fill_(-1)

            self.planner_action_lengths = self.action_lengths.clone().fill_(0)
            self.controller_action_lengths = self.action_lengths.clone().fill_(
                0)

            self.planner_hidden_idx = self.actions.clone().fill_(0)

            self.planner_pos_queue_idx, self.controller_pos_queue_idx = [], []

            # parsing flat actions to planner-controller hierarchy
            print(" Parsing flat actions to planner-controller hierarchy")
            for i in tqdm(range(len(self.actions))):

                pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions(
                    actions=self.actions[i]
                    [:self.action_lengths[i] +
                     1],  # Saty: Take all actions essentially ; This doesn't have <start> and <end>
                    controller_action_lim=max_controller_actions
                )  # saty: This is 5

                self.planner_actions[i][:len(pa)] = torch.Tensor(pa)
                self.controller_actions[i][:len(ca)] = torch.Tensor(ca)

                self.planner_action_lengths[i] = len(pa) - 1
                self.controller_action_lengths[i] = len(ca)

                self.planner_pos_queue_idx.append(pq_idx)
                self.controller_pos_queue_idx.append(
                    cq_idx)  # Saty: This is just [1, 2, 3, 4, ..., len(actions)]

                self.planner_hidden_idx[i][:len(ca)] = torch.Tensor(ph_idx)
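            # a hedged summary of the parsed structures above (based on the comments in this
            # file, not verified against flat_to_hierarchical_actions itself):
            #   planner_actions[i]          : high-level actions emitted by the planner
            #   controller_actions[i]       : per-step 0/1 flags for whether the controller keeps
            #                                 executing the current planner action
            #   planner_pos_queue_idx[i]    : positions at which the planner makes a prediction
            #   controller_pos_queue_idx[i] : positions at which the controller runs (roughly
            #                                 [1, 2, ..., len(actions)] per the note above)
            #   planner_hidden_idx[i]       : which planner hidden state each controller step reads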

    def _pick_envs_to_load(self,
                           split='train',
                           max_envs=10,
                           start_idx=0,
                           in_order=False):
        if split in ['val', 'test'] or in_order == True:
            pruned_env_set = self.env_set[start_idx:start_idx + max_envs]
        else:
            if max_envs < len(self.env_set):
                env_inds = np.random.choice(len(self.env_set),
                                            max_envs,
                                            replace=False)
            else:
                env_inds = np.random.choice(len(self.env_set),
                                            max_envs,
                                            replace=True)
            pruned_env_set = [self.env_set[x] for x in env_inds]
        return pruned_env_set

    def _load_envs(self, start_idx=-1, in_order=False):
        self._clear_api_threads()
        self._clear_memory()
        if start_idx == -1:
            start_idx = self.env_set.index(self.pruned_env_set[-1]) + 1

        # Pick envs
        self.pruned_env_set = self._pick_envs_to_load(
            split=self.split,
            max_envs=self.max_threads_per_gpu,
            start_idx=start_idx,
            in_order=in_order)

        if len(self.pruned_env_set) == 0:
            return

        # Load api threads
        #self._clear_api_threads()
        start = time.time()
        if len(self.api_threads) == 0:
            for i in range(self.max_threads_per_gpu):
                api_temp = None
                api_temp = objrender.RenderAPIThread(w=224,
                                                     h=224,
                                                     device=self.gpu_id)
                self.api_threads.append(api_temp)

        #try:
        # self.cfg = load_config('/home/satyen/GitHub_repos/our_EQA/House3D/tests/config.json')
        #    self.cfg = load_config('../House3D/tests/config.json')
        #except:
        #    self.cfg = load_config('../../House3D/tests/config.json')

        print('[%.02f] Loaded %d api threads' %
              (time.time() - start, len(self.api_threads)))
        start = time.time()

        # Load houses
        from multiprocessing import Pool
        _args = ([h, self.cfg, self.map_resolution]
                 for h in self.pruned_env_set)
        with Pool(len(self.pruned_env_set)) as pool:
            self.all_houses = pool.starmap(local_create_house, _args)

        print('[%.02f] Loaded %d houses' %
              (time.time() - start, len(self.all_houses)))
        start = time.time()

        # Load envs
        self.env_loaded = {}
        for i in range(len(self.all_houses)):
            print('[%02d/%d][split:%s][gpu:%d][house:%s]' %
                  (i + 1, len(self.all_houses), self.split, self.gpu_id,
                   self.all_houses[i].house['id']))
            environment = Environment(self.api_threads[i], self.all_houses[i],
                                      self.cfg)
            self.env_loaded[self.all_houses[i].house['id']] = House3DUtils(
                environment,
                target_obj_conn_map_dir=self.target_obj_conn_map_dir,
                build_graph=False)

        # [TODO] Unused till now
        self.env_ptr = -1

        print('[%.02f] Loaded %d house3d envs' %
              (time.time() - start, len(self.env_loaded)))
        # CM: has to be 770
        # Mark available data indices
        self.available_idx = [
            i for i, v in enumerate(self.env_list) if v in self.env_loaded
        ]

        # [TODO] only keeping legit sequences
        # needed for things to play well with old data
        temp_available_idx = self.available_idx.copy()
        for i in range(len(temp_available_idx)):
            if self.action_lengths[temp_available_idx[i]] < 5:
                self.available_idx.remove(temp_available_idx[i])

        print('Available inds: %d' % len(self.available_idx))

        # Flag to check if loaded envs have been cycled through or not
        # [TODO] Unused till now
        self.all_envs_loaded = False

    def _clear_api_threads(self):
        for i in range(len(self.api_threads)):
            del self.api_threads[0]
        self.api_threads = []

    def _clear_memory(self):
        if hasattr(self, 'episode_house'):
            del self.episode_house
        if hasattr(self, 'env_loaded'):
            del self.env_loaded
        #if hasattr(self, 'api_threads'):
        #    del self.api_threads
        #self.api_threads = []

    def _check_if_all_envs_loaded(self):
        print('[CHECK][Cache:%d][Total:%d]' %
              (len(self.img_data_cache), len(self.env_list)))
        if len(self.img_data_cache) == len(self.env_list):
            self.available_idx = [i for i, v in enumerate(self.env_list)]
            return True
        else:
            return False

    def set_camera(self, e, pos, robot_height=1.0):
        assert len(pos) == 4

        e.env.cam.pos.x = pos[0]
        e.env.cam.pos.y = robot_height
        e.env.cam.pos.z = pos[2]
        e.env.cam.yaw = pos[3]

        e.env.cam.updateDirection()

    def render(self, e):
        return e.env.render()

    def get_frames(self, e, pos_queue, preprocess=True):
        if isinstance(pos_queue, list) == False:
            pos_queue = [pos_queue]

        res = []
        for i in range(len(pos_queue)):
            self.set_camera(e, pos_queue[i])
            img = np.array(self.render(e), copy=False, dtype=np.float32)

            if preprocess == True:
                img = img.transpose(2, 0, 1)
                img = img / 255.0

            res.append(img)

        return np.array(res)

    # Confused about this function!
    def get_hierarchical_features_till_spawn(self,
                                             actions,
                                             backtrack_steps=0,
                                             max_controller_actions=5):

        action_length = len(actions) - 1
        pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions(
            actions=actions, controller_action_lim=max_controller_actions)

        # count how many actions of the same type have been encountered before starting navigation
        # Not used in train_eval -> train()
        backtrack_controller_steps = actions[1:action_length -
                                             backtrack_steps + 1:][::-1]
        counter = 0
        try:
            if len(backtrack_controller_steps) > 0:
                while (counter <= self.max_controller_actions) and (
                        counter < len(backtrack_controller_steps)) and (
                            backtrack_controller_steps[counter]
                            == backtrack_controller_steps[0]):
                    counter += 1
        except:
            import pdb
            pdb.set_trace()  # hitting this breakpoint means the counter logic above failed to find the correct counter step; still being verified

        #####################################################################################
        target_pos_idx = action_length - backtrack_steps

        controller_step = True
        if target_pos_idx in pq_idx:
            controller_step = False

        pq_idx_pruned = [v for v in pq_idx if v <= target_pos_idx]
        pa_pruned = pa[:len(pq_idx_pruned) + 1]
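        # a hedged worked example (illustrative only, assuming max_controller_actions = 5):
        # if the flat actions after <start> are [fwd, fwd, fwd, left, fwd] (action_length = 5)
        # and backtrack_steps = 2, then:
        #   backtrack_controller_steps = reversed(actions[1:4]) = [fwd, fwd, fwd]  -> counter = 3
        #   target_pos_idx = 5 - 2 = 3
        #   controller_step is False iff position 3 is a planner step (3 in pq_idx)
        #   pq_idx_pruned keeps planner positions <= 3, and pa_pruned keeps one extra planner
        #   action beyond them (the last action the planner had issued at the spawn point).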

        images = self.get_frames(self.episode_house,
                                 self.episode_pos_queue,
                                 preprocess=True)
        raw_img_feats = self.cnn(
            Variable(torch.FloatTensor(images).to(
                self.device))).data.cpu().numpy().copy()

        controller_img_feat = torch.from_numpy(
            raw_img_feats[target_pos_idx].copy())
        # Last action taken by the planner!
        controller_action_in = pa_pruned[-1] - 2

        planner_img_feats = torch.from_numpy(
            raw_img_feats[pq_idx_pruned].copy())
        planner_actions_in = torch.from_numpy(np.array(pa_pruned[:-1]) - 1)

        return planner_actions_in, planner_img_feats, controller_step, controller_action_in, \
            controller_img_feat, self.episode_pos_queue[target_pos_idx], counter

    def __getitem__(self, index):
        # [VQA] question-only
        if self.input_type == 'ques':
            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            return (idx, question, answer)

        # [VQA] question+image
        elif self.input_type == 'ques,image':
            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            actions_in = actions[action_length - self.num_frames:action_length]
            actions_out = actions[action_length - self.num_frames +
                                  1:action_length + 1]

            if self.to_cache == True and index in self.img_data_cache:
                images = self.img_data_cache[index]
            else:
                pos_queue = self.pos_queue[index][
                    -self.num_frames:]  # last 5 frames
                images = self.get_frames(self.env_loaded[self.env_list[index]],
                                         pos_queue,
                                         preprocess=True)
                if self.to_cache == True:
                    self.img_data_cache[index] = images.copy()

            return (idx, question, answer, images, actions_in, actions_out,
                    action_length)

        # [NAV] question+cnn
        elif self.input_type in ['cnn', 'cnn+q']:

            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            if self.to_cache == True and index in self.img_data_cache:
                img_feats = self.img_data_cache[index]
            else:
                pos_queue = self.pos_queue[index]
                images = self.get_frames(self.env_loaded[self.env_list[index]],
                                         pos_queue,
                                         preprocess=True)
                img_feats = self.cnn(
                    Variable(torch.FloatTensor(images).to(
                        self.device))).data.cpu().numpy().copy()
                if self.to_cache == True:
                    self.img_data_cache[index] = img_feats

            # for val or test (evaluation), or
            # when target_obj_conn_map_dir is defined (reinforce),
            # load entire shortest path navigation trajectory
            # and load connectivity map for intermediate rewards
            if self.split in ['val', 'test'
                              ] or self.target_obj_conn_map_dir != False:
                target_obj_id, target_room = False, False
                bbox_obj = [
                    x for x in self.boxes[index]
                    if x['type'] == 'object' and x['target'] == True
                ][0]['box']
                for obj_id in self.env_loaded[self.env_list[index]].objects:
                    box2 = self.env_loaded[
                        self.env_list[index]].objects[obj_id]['bbox']
                    if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \
                        all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True:
                        target_obj_id = obj_id
                        break
                bbox_room = [
                    x for x in self.boxes[index]
                    if x['type'] == 'room' and x['target'] == False
                ][0]
                for room in self.env_loaded[
                        self.env_list[index]].env.house.all_rooms:
                    if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \
                        all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]):
                        target_room = room
                        break
                assert target_obj_id != False
                assert target_room != False
                self.env_loaded[self.env_list[index]].set_target_object(
                    self.env_loaded[
                        self.env_list[index]].objects[target_obj_id],
                    target_room)

                # [NOTE] only works for batch size = 1
                self.episode_pos_queue = self.pos_queue[index]
                self.episode_house = self.env_loaded[self.env_list[index]]
                self.target_room = target_room
                self.target_obj = self.env_loaded[
                    self.env_list[index]].objects[target_obj_id]

                actions_in = actions[:action_length]
                actions_out = actions[1:action_length + 1] - 2

                return (idx, question, answer, img_feats, actions_in,
                        actions_out, action_length)

            # if action_length is n
            # images.shape[0] is also n
            # actions[0] is <START>
            # actions[n] is <END>

            # grab 5 random frames
            # [NOTE]: this'll break for longer-than-5 navigation sequences
            start_idx = np.random.choice(img_feats.shape[0] + 1 -
                                         self.num_frames)
            img_feats = img_feats[start_idx:start_idx + self.num_frames]

            actions_in = actions[start_idx:start_idx + self.num_frames]
            actions_out = actions[start_idx + self.num_frames] - 2

            return (idx, question, answer, img_feats, actions_in, actions_out,
                    action_length)

        # [NAV] question+lstm
        elif self.input_type in ['lstm', 'lstm+q']:

            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            if self.split == 'train':
                if self.to_cache == True and index in self.img_data_cache:
                    img_feats = self.img_data_cache[index]
                else:
                    pos_queue = self.pos_queue[index]
                    images = self.get_frames(
                        self.env_loaded[self.env_list[index]],
                        pos_queue,
                        preprocess=True)
                    raw_img_feats = self.cnn(
                        Variable(torch.FloatTensor(images).to(
                            self.device))).data.cpu().numpy().copy()
                    img_feats = np.zeros(
                        (self.actions.shape[1], raw_img_feats.shape[1]),
                        dtype=np.float32)
                    img_feats[:raw_img_feats.shape[0], :] = raw_img_feats.copy(
                    )
                    if self.to_cache == True:
                        self.img_data_cache[index] = img_feats

            actions_in = actions.clone() - 1
            actions_out = actions[1:].clone() - 2

            actions_in[action_length:].fill_(0)
            mask = actions_out.clone().gt(-1)
            if len(actions_out) > action_length:
                actions_out[action_length:].fill_(0)

            # for val or test (evaluation), or
            # when target_obj_conn_map_dir is defined (reinforce),
            # load entire shortest path navigation trajectory
            # and load connectivity map for intermediate rewards
            if self.split in ['val', 'test'
                              ] or self.target_obj_conn_map_dir != False:
                target_obj_id, target_room = False, False
                bbox_obj = [
                    x for x in self.boxes[index]
                    if x['type'] == 'object' and x['target'] == True
                ][0]['box']

                for obj_id in self.env_loaded[self.env_list[index]].objects:
                    box2 = self.env_loaded[
                        self.env_list[index]].objects[obj_id]['bbox']
                    if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \
                        all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True:
                        target_obj_id = obj_id
                        break

                bbox_room = [
                    x for x in self.boxes[index]
                    if x['type'] == 'room' and x['target'] == False
                ][0]

                for room in self.env_loaded[
                        self.env_list[index]].env.house.all_rooms:
                    if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \
                        all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]):
                        target_room = room
                        break
                assert target_obj_id != False
                assert target_room != False
                self.env_loaded[self.env_list[index]].set_target_object(
                    self.env_loaded[
                        self.env_list[index]].objects[target_obj_id],
                    target_room)

                # [NOTE] only works for batch size = 1
                self.episode_pos_queue = self.pos_queue[index]
                self.episode_house = self.env_loaded[self.env_list[index]]
                self.target_room = target_room
                self.target_obj = self.env_loaded[
                    self.env_list[index]].objects[target_obj_id]

                return (idx, question, answer, False, actions_in, actions_out,
                        action_length, mask)

            return (idx, question, answer, img_feats, actions_in, actions_out,
                    action_length, mask)

        # [NAV] planner-controller
        elif self.input_type in ['pacman']:

            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            planner_actions = self.planner_actions[index]
            controller_actions = self.controller_actions[index]

            planner_action_length = self.planner_action_lengths[index]
            controller_action_length = self.controller_action_lengths[index]

            planner_hidden_idx = self.planner_hidden_idx[index]

            if self.split == 'train':
                if self.to_cache == True and index in self.img_data_cache:
                    img_feats = self.img_data_cache[index]
                else:
                    pos_queue = self.pos_queue[index]
                    images = self.get_frames(
                        self.env_loaded[self.env_list[index]],
                        pos_queue,
                        preprocess=True)

                    raw_img_feats = self.cnn(
                        Variable(torch.FloatTensor(images).to(
                            self.device))).data.cpu().numpy().copy()

                    # Saty: Actions for which there are no image features?
                    # Raw img_feats.shape[1] = 3200
                    img_feats = np.zeros(
                        (self.actions.shape[1], raw_img_feats.shape[1]),
                        dtype=np.float32)

                    img_feats[:raw_img_feats.shape[0], :] = raw_img_feats.copy(
                    )

                    if self.to_cache == True:
                        self.img_data_cache[index] = img_feats

            # LOL! Goes into this, since we're passing target_obj_conn_map_dir
            if self.split in ['val', 'test'
                              ] or self.target_obj_conn_map_dir != False:
                target_obj_id, target_room = False, False
                min_ = 1
                bbox_obj = [
                    x for x in self.boxes[index]
                    if x['type'] == 'object' and x['target'] == True
                ][0]
                trg_obj_name = bbox_obj['name']
                bbox_obj = bbox_obj['box']
                bbox_obj_min = np.array([bbox_obj['min'][x] for x in range(3)])
                bbox_obj_max = np.array([bbox_obj['max'][x] for x in range(3)])
                min_print = 0
                max_print = 0
                # print("target obj bbox:", bbox_obj_min, bbox_obj_max)
                for obj_id in self.env_loaded[self.env_list[index]].objects:
                    box2 = self.env_loaded[
                        self.env_list[index]].objects[obj_id]['bbox']
                    ############################ SATYEN ######################################
                    # print("BBOX_OBJ:", [bbox_obj['min'][x] for x in range(3)])
                    # print("BOX2 min:", [box2['min'][x] for x in range(3)])
                    # print("BBOX_OBJ:max", [bbox_obj['max'][x] for x in range(3)])
                    # print("Box2:max", [box2['max'][x] for x in range(3)])
                    box2_min = np.array([box2['min'][x] for x in range(3)])
                    box2_max = np.array([box2['max'][x] for x in range(3)])
                    diff_min = np.mean(abs(bbox_obj_min - box2_min))
                    diff_max = np.mean(abs(bbox_obj_max - box2_max))
                    if abs(diff_min + diff_max) / 2 < min_:
                        min_ = (diff_min + diff_max) / 2
                        target_obj_id = obj_id
                        min_print = box2_min
                        max_print = box2_max
                        #obj_iter_id = self.env_loaded[self.env_list[index]].objects[obj_id]['id']
                #sys.exit()

                #    target_obj_id = obj_id
                #    break

                ############################################################################

                #print("targetObj:{}".format(trg_obj_name))
                #print("Target obj_iter",target_obj_id)
                #print("env obj bbox", min_print, max_print)
                ### Satyen: TARGET ROOM ###
                bbox_room = [
                    x for x in self.boxes[index]
                    if x['type'] == 'room' and x['target'] == False
                ][0]
                min_ = 1
                bbox_room_min = np.array(
                    [bbox_room['box']['min'][x] for x in range(3)])
                bbox_room_max = np.array(
                    [bbox_room['box']['max'][x] for x in range(3)])

                for room in self.env_loaded[
                        self.env_list[index]].env.house.all_rooms:
                    ################ SATYEN ###############################################
                    # print("Room min", [room['bbox']['min'][x] for x in range(3)])
                    # print("BBox min", [bbox_room['box']['min'][x] for x in range(3)])
                    # print("Room max", [room['bbox']['max'][x] for x in range(3)])
                    # print("BBox max", [bbox_room['box']['max'][x] for x in range(3)])
                    # print([ all(math.isclose(bbox_obj['max'][x], box2['max'][x], abs_tol=1e-6) for x in range(3) )])
                    room_min = np.array(
                        [room['bbox']['min'][x] for x in range(3)])
                    room_max = np.array(
                        [room['bbox']['max'][x] for x in range(3)])
                    #bbox_room_min = np.array([bbox_room['box']['min'][x] for x in range(3)])
                    #bbox_room_max = np.array([bbox_room['box']['max'][x] for x in range(3)])
                    diff_min = np.mean(abs(room_min - bbox_room_min))
                    diff_max = np.mean(abs(room_max - bbox_room_max))

                    if abs(diff_min + diff_max) / 2 < min_:
                        min_ = (diff_min + diff_max) / 2
                        target_room = room
                    #elif min_ == 1:
                    #    target_room = room
                    #if all([math.isclose(room['bbox']['min'][x], bbox_room['box']['min'][x], abs_tol = 0.6) for x in range(3)]) == True and \
                    #    all([math.isclose(room['bbox']['max'][x], bbox_room['box']['max'][x], abs_tol = 0.6) for x in range(3)]) == True:
                    #    target_room = room
                    #    break

                    #########################################################################
                assert target_obj_id != False
                assert target_room != False
                self.env_loaded[self.env_list[index]].set_target_object(
                    self.env_loaded[
                        self.env_list[index]].objects[target_obj_id],
                    target_room)

                # [NOTE] only works for batch size = 1
                self.episode_pos_queue = self.pos_queue[index]
                self.episode_house = self.env_loaded[self.env_list[index]]
                self.target_room = target_room
                self.target_obj = self.env_loaded[
                    self.env_list[index]].objects[target_obj_id]
                #print("Target OBJ!!:from env ", self.target_obj)

                return (idx, question, answer, actions, action_length)

            planner_pos_queue_idx = self.planner_pos_queue_idx[index]
            controller_pos_queue_idx = self.controller_pos_queue_idx[index]

            # Saty: Get img_feats only for the places where the PLNR (planner) makes a prediction. Stored in planner_pos_queue_idx
            planner_img_feats = np.zeros(
                (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32)
            planner_img_feats[:planner_action_length] = img_feats[
                planner_pos_queue_idx]

            planner_actions_in = planner_actions.clone(
            ) - 1  # planner actions \in [0, 4]  # WHY [0, 4]? Where forward = 1?
            planner_actions_in[planner_action_length:].fill_(
                0)  # mask the elements after planner action length = 0

            planner_actions_out = planner_actions[1:].clone(
            ) - 2  # planner actions_out \in [-1, 3]
            # -1 -> START, 0 -> forward, etc.;  ^ALSO shifted by -2 -> makes sense

            planner_mask = planner_actions_out.clone().gt(
                -1)  # gt -> Greater than! :|

            if len(planner_actions_out) > planner_action_length:
                planner_actions_out[planner_action_length:].fill_(0)
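            # a hedged worked example of the two shifts above (token values illustrative):
            # if planner_actions = [1, 2, 2, 3] with 1 = <start>, 2 = forward and
            # planner_action_length = 3, then
            #   planner_actions_in  = planner_actions - 1     -> [0, 1, 1, 2], zero-filled past
            #                                                    the length -> [0, 1, 1, 0]
            #   planner_actions_out = planner_actions[1:] - 2 -> [0, 0, 1]
            # so, as the note above says, forward maps to 0 in the output space, and
            # planner_mask (out > -1) keeps exactly the valid prediction steps.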

            controller_img_feats = np.zeros(
                (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32)

            controller_img_feats[:controller_action_length] = img_feats[
                controller_pos_queue_idx]

            controller_actions_in = actions[1:].clone(
            ) - 2  # passed into the controller itself!
            if len(controller_actions_in) > controller_action_length:
                controller_actions_in[controller_action_length:].fill_(0)

            controller_out = controller_actions
            controller_mask = controller_out.clone().gt(-1)
            if len(controller_out) > controller_action_length:
                controller_out[controller_action_length:].fill_(0)

            # zero out forced controller return
            for i in range(controller_action_length):
                if i >= self.max_controller_actions - 1 and controller_out[i] == 0 and \
                        (self.max_controller_actions == 1 or
                         controller_out[i - self.max_controller_actions + 1:i].sum()
                         == self.max_controller_actions - 1):
                    controller_mask[i] = 0

            return (idx, question, answer, planner_img_feats,
                    planner_actions_in, planner_actions_out,
                    planner_action_length, planner_mask, controller_img_feats,
                    controller_actions_in, planner_hidden_idx, controller_out,
                    controller_action_length, controller_mask)

    def __len__(self):
        if self.input_type == 'ques':
            return len(self.questions)
        else:
            return len(self.available_idx)
Example #6
class EqaDataset(Dataset):
    def __init__(self,
                 questions_h5,
                 vocab,
                 num_frames=1,
                 split='train',
                 gpu_id=0,
                 input_type='ques',
                 max_threads_per_gpu=10,
                 map_resolution=1000):

        self.questions_h5 = questions_h5
        self.vocab = load_vocab(vocab)
        np.random.seed()

        self.split = split
        self.gpu_id = gpu_id
        self.num_frames = num_frames

        self.input_type = input_type

        self.max_threads_per_gpu = max_threads_per_gpu
        self.map_resolution = map_resolution

        print('Reading question data into memory')
        self.questions = _dataset_to_tensor(questions_h5['questions'])
        self.answers = _dataset_to_tensor(questions_h5['answers'])
        self.actions = _dataset_to_tensor(questions_h5['actions'])
        self.actions = self.actions.unsqueeze(2)
        self.robot_positions = _dataset_to_tensor(
            questions_h5['robot_positions'], dtype=np.float32)
        self.action_images = questions_h5['images']
        self.action_lengths = _dataset_to_tensor(
            questions_h5['action_lengths'])
        self.action_masks = _dataset_to_tensor(questions_h5['mask'])

        #if input_type != 'ques':
        '''
        If training, randomly sample and load a subset of environments,
        train on those, and then cycle through to load the rest.

        On the validation and test set, load in order, and cycle through.

        For both, add optional caching so that if all environments
        have been cycled through once, then no need to re-load and
        instead, just the cache can be used.
        '''
        cnn_kwargs = {'num_classes': 191, 'pretrained': True}
        self.cnn = MultitaskCNN(**cnn_kwargs)
        self.cnn.eval()
        self.cnn.cuda()

    def __getitem__(self, index):
        # [VQA] question-only
        if self.input_type in ['pacman']:

            idx = index
            question = self.questions[index]
            #answer = self.answers[index]
            answer = self.answers[index]
            actions = self.actions[index]
            actions_masks = self.action_masks[index]
            robot_positions = self.robot_positions[index]
            action_lengths = self.action_lengths[index]

            if self.split in ['val', 'test']:  #return the data directly
                return (idx, question, answer, actions, robot_positions,
                        action_lengths)

            if self.split == 'train':  #get image from dataset
                planner_images = self.action_images[index]
                planner_img_feats = self.cnn(
                    Variable(torch.FloatTensor(
                        planner_images).cuda())).data.cpu().numpy().copy()
                actions_in = actions.clone()
                actions_out = actions[1:].clone()
                actions_masks = actions_masks[:39].clone().gt(0)
                robot_positions = robot_positions.clone()
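                # [NOTE] the hard-coded 39 above presumably matches the padded action-sequence
                # length used when the h5 file was generated; this is an assumption, not
                # something verified elsewhere in this example.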

            return (idx, question, answer, planner_img_feats, actions_in,
                    actions_out, robot_positions, actions_masks,
                    action_lengths)

        elif self.input_type == 'ques,image':
            idx = index
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            actions_in = actions[action_length - self.num_frames:action_length]
            actions_out = actions[action_length - self.num_frames +
                                  1:action_length + 1]

            images = self.action_images[index][
                action_length - self.num_frames:action_length].astype(
                    np.float32)

            return (idx, question, answer, images, actions_in, actions_out,
                    action_length)

    def __len__(self):
        if self.input_type == 'ques':
            return len(self.questions)
        else:
            return len(self.questions)
Example #7
def test(rank, test_model_dir):
    model_kwargs = {'question_vocab': load_vocab(args.vocab_json)}
    model = NavPlannerControllerModel(**model_kwargs)
    checkpoint = torch.load(test_model_dir)  #load checkpoint
    model.load_state_dict(checkpoint['state'])  #create model

    cnn_kwargs = {'num_classes': 191, 'pretrained': True}
    cnn = MultitaskCNN(**cnn_kwargs)
    cnn.eval()
    cnn.cuda()  #create cnn model

    scene = "test-10-obj-00.txt"
    my_env = enviroment.Environment(is_testing=1, testing_file=scene)
    object_exist_list = my_env.ur5.object_type
    print("the objetct which is exist:")
    print(object_exist_list)  #create simulation enviroment

    my_question = Qusetion(object_exist_list)  #create testing question
    testing_questions = my_question.createQueue()
    vocab = my_question.create_vocab()

    for question in testing_questions:
        planner_hidden = None
        max_action = 30
        position = [0, 0]
        action_in_raw = [0]  #start action_in
        actions = []

        print(question['question'])  #question
        questionTokens = my_question.tokenize(question['question'],
                                              punctToRemove=['?'],
                                              addStartToken=False)
        encoded_question_raw = my_question.encode(questionTokens,
                                                  vocab['questionTokenToIdx'])
        encoded_question_raw.append(0)  #encode question
        encoded_question_raw = np.array(encoded_question_raw)
        encoded_question_tensor = _dataset_to_tensor(encoded_question_raw)
        encoded_question = Variable(encoded_question_tensor)
        encoded_question = encoded_question.unsqueeze(0)
        action_times = 0

        while (action_times < max_action):

            #print(planner_img_feats_var.size())
            action_in_tensor = _dataset_to_tensor(action_in_raw)
            action_in = Variable(action_in_tensor)
            action_in = action_in.unsqueeze(0)
            action_in = action_in.unsqueeze(0)

            _, rgb_image_raw = my_env.camera.get_camera_data()
            position_in, planner_img_feats_var = data2input(
                position, rgb_image_raw, cnn)

            output_data, planner_hidden = model.planner_step(
                encoded_question, planner_img_feats_var, action_in,
                position_in, planner_hidden)
            planner_possi = F.log_softmax(output_data, dim=1)
            planner_data = planner_possi.data.numpy()
            planner_data = planner_data[0]
            action_out = np.where(planner_data == np.max(planner_data))
            action_out = action_out[0][0]

            actions.append(action_out)
            action_in_raw = [action_out]
            if action_out == 9:
                print('stop')
                break
            else:
                dx, dy = order2action(action_out)
                position[0] += dx
                position[1] += dy
            action_times += 1
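
        # a hedged reading of the heuristic below (based only on this code):
        # a short trajectory (3..19 planner steps) is treated as a reach-and-suck
        # move ending at `position`, while a long one (>= 20 steps) is interpreted
        # as a push: the first half of the steps defines the start point and the
        # full trajectory defines the end point, and UR5_action is called with
        # mode 2 (sucking) or mode 1 (pushing) respectively.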

        if len(actions) > 2 and len(actions) < 20:
            action_position = position + position
            my_env.UR5_action(action_position, 2)  #sucking
        elif len(actions) >= 20:  #pushing
            position_start = [0, 0]
            position_end = [0, 0]
            for i in range(len(actions)):
                if i < len(actions) / 2:  #the first step
                    dx, dy = order2action(actions[i])
                    position_start[0] += dx
                    position_start[1] += dy
                    position_end[0] += dx
                    position_end[1] += dy
                else:  #the second step
                    dx, dy = order2action(actions[i])
                    position_end[0] += dx
                    position_end[1] += dy
            action_position = position_start + position_end
            my_env.UR5_action(action_position, 1)  #pushing
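# A minimal sketch of the trajectory-to-primitive heuristic used above:
# 3-19 planned actions trigger suction at the accumulated position, 20 or more
# become a push whose start point follows only the first half of the trajectory.
# trajectory_to_primitive is a hypothetical helper name; order2action and the
# UR5_action primitive ids (2 = suck, 1 = push) follow the example above.
def trajectory_to_primitive(actions, order2action):
    """Map a planned action sequence to (action_position, primitive_id)."""
    if 2 < len(actions) < 20:          # short trajectory: suck at the final position
        position = [0, 0]
        for a in actions:
            if a == 9:                 # stop token contributes no displacement
                break
            dx, dy = order2action(a)
            position[0] += dx
            position[1] += dy
        return position + position, 2
    if len(actions) >= 20:             # long trajectory: push from midpoint to end
        start, end = [0, 0], [0, 0]
        for i, a in enumerate(actions):
            dx, dy = order2action(a)
            if i < len(actions) / 2:   # first half moves both endpoints
                start[0] += dx
                start[1] += dy
            end[0] += dx
            end[1] += dy
        return start + end, 1
    return None, 0                     # too few actions: no manipulation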
Example #8
def test(rank):

    cnn_model_dir = os.path.abspath("../train/models/03_13_h3d_hybrid_cnn.pt")

    vqa_model_kwargs = {
        'vocab': load_vocab(args.vocab_json),
        'checkpoint_path': cnn_model_dir
    }
    vqa_model = VqaLstmCnnAttentionModel(**vqa_model_kwargs)
    vqa_checkpoint = torch.load(args.vqa_weight)  #load checkpoint weights
    vqa_model.load_state_dict(vqa_checkpoint['state'])
    print('--- vqa_model loaded checkpoint ---')

    res_model_dir = os.path.abspath("../train/models/resnet101.pth")
    my_map_cnn = mapCNN(checkpoint_path=res_model_dir)
    map_checkpoint = torch.load('mapcnn.pt',
                                map_location='cpu')  #load checkpoint weights
    my_map_cnn.load_state_dict(map_checkpoint['state'])  #create map model
    print('--- map_model loaded checkpoint ---')

    cnn_kwargs = {
        'num_classes': 191,
        'pretrained': True,
        'checkpoint_path': cnn_model_dir
    }
    cnn = MultitaskCNN(**cnn_kwargs)
    cnn.eval()

    vocab_dir = os.path.abspath("vocab.json")
    with open(vocab_dir, 'r', encoding='utf-8') as vocab_file:
        vocab = json.load(vocab_file)

    question = args.question
    print(question)
    questionTokens = tokenize(question,
                              punctToRemove=['?'],
                              addStartToken=False)

    encoded_question_raw = encode(questionTokens, vocab['questionTokenToIdx'])
    while (len(encoded_question_raw) < 10):
        encoded_question_raw.append(0)  #encode question
    encoded_question_raw = np.array(encoded_question_raw)
    encoded_question_tensor = _dataset_to_tensor(encoded_question_raw)
    encoded_question = Variable(encoded_question_tensor)
    encoded_question = encoded_question.unsqueeze(0)

    rgb_before = cv.imread(args.rgb_image_before_dir)
    rgb_after = cv.imread(args.rgb_image_after_dir)
    depth_after = cv.imread(args.depth_image_after_dir)
    depth_after = depth_after[0]
    depth_dim = depth_after.shape
    print(depth_dim)

    rgb_after_resize = cv.resize(rgb_after, (256, 256),
                                 interpolation=cv.INTER_AREA)
    # crop and add marking
    depth_after_resize = cv.resize(depth_after, (256, 256),
                                   interpolation=cv.INTER_AREA)
    # crop and add marking

    rgb_tensor, depth_tensor = rgbd2tensor(rgb_after_resize,
                                           depth_after_resize)  #output_heatmap
    heatmap_output = rgbd2heatmap(rgb_tensor, depth_tensor, my_map_cnn)
    with h5py.File(args.heatmap_output_dir, 'w') as f:
        f['heatmap'] = heatmap_output  # save the predicted heatmap and close the file

    cv.imwrite(args.rgb_image_after_dir, rgb_after_resize)      # overwrite with the resized RGB image
    cv.imwrite(args.depth_image_after_dir, depth_after_resize)  # overwrite with the resized depth image

    before_image_feat = data2input(rgb_before)
    after_image_feat = data2input(rgb_after_resize)

    input_image = [before_image_feat, after_image_feat]
    input_image_feats = Variable(torch.FloatTensor(input_image))
    # batch of 1, two frames (before/after), 3 channels, 224x224
    input_image_feats = input_image_feats.view(1, 2, 3, 224, 224)

    # print(input_image_feats.size())

    #print(input_image.size())
    #print(before_image_feat.size())

    scores, _ = vqa_model(input_image_feats, encoded_question)
    scores = scores.data.numpy()
    scores = scores[0]
    answer_predict = np.where(scores == np.max(scores))  # greedy argmax over answer scores
    answer_predict = answer_predict[0][0]
    answer_dic = vocab["answerTokenToIdx"]
    answer = [k for k, v in answer_dic.items() if v == answer_predict]  # invert the token -> index mapping

    print(answer[0])
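
    # A minimal sketch, assuming the same vocab["answerTokenToIdx"] layout
    # (token -> integer index): build the inverse mapping once instead of
    # scanning the dictionary for every prediction. idx_to_answer and
    # decode_answer are hypothetical names, not part of the model's API.
    idx_to_answer = {v: k for k, v in vocab["answerTokenToIdx"].items()}

    def decode_answer(answer_scores):
        """Return the answer token with the highest score (greedy argmax)."""
        return idx_to_answer.get(int(np.argmax(answer_scores)), '<unk>')

    print(decode_answer(scores))  # reproduces the lookup above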
Example #9
class EqaDataset(Dataset):
    def __init__(self,
                 questions_h5,
                 vocab,
                 num_frames=1,
                 split='train',
                 gpu_id=0,
                 input_type='ques',
                 max_threads_per_gpu=10,
                 map_resolution=1000):

        self.questions_h5 = questions_h5
        self.vocab = load_vocab(vocab)
        np.random.seed()
        
        self.split = split
        self.gpu_id = gpu_id
        self.num_frames = num_frames

        self.input_type = input_type

        self.max_threads_per_gpu = max_threads_per_gpu
        self.map_resolution = map_resolution


        print('Reading question data into memory')
        self.questions = _dataset_to_tensor(questions_h5['questions'])
        self.answers = _dataset_to_tensor(questions_h5['answers'])
        self.actions = _dataset_to_tensor(questions_h5['actions'])
        self.actions = self.actions.unsqueeze(2)
        self.robot_positions = _dataset_to_tensor(questions_h5['robot_positions'],dtype = np.float32)
        self.action_images = questions_h5['images']
        self.action_maps = questions_h5['heatmaps']
        self.action_lengths = _dataset_to_tensor(questions_h5['action_lengths'])
        self.action_masks = _dataset_to_tensor(questions_h5['mask'])


        cnn_kwargs = {'num_classes': 191, 'pretrained': True}
        self.cnn = MultitaskCNN(**cnn_kwargs)
        self.cnn.eval()
        self.cnn.cuda()




    
    def __getitem__(self, index):
        # [VQA] question-only
        if self.input_type in ['nomap']:

            idx = index
            question = self.questions[index]
            #answer = self.answers[index]
            answer = self.answers[index]
            if answer > 13:
                answer = answer - 1
            actions = self.actions[index]
            actions_masks = self.action_masks[index]
            robot_positions = self.robot_positions[index]
            action_lengths = self.action_lengths[index]


            if self.split in ['val', 'test']:    #return the data directly
                return (idx, question, answer, actions, robot_positions,action_lengths)  

            if self.split == 'train':  # get image features from the dataset
                planner_images = self.action_images[index][0]
                planner_var = Variable(torch.FloatTensor(planner_images)
                                 .cuda())
                planner_var = planner_var.unsqueeze(0)
                
                planner_img_feats = self.cnn(planner_var).data.cpu().numpy().copy()  
                                            
                actions_in = actions.clone()
                actions_out = actions[1:].clone()
                actions_masks = actions_masks[:39].clone().gt(0)
                robot_positions = robot_positions.clone()
                     
            return (idx, question, answer, planner_img_feats,
                    actions_in, actions_out,
                   robot_positions, actions_masks,action_lengths)

        elif self.input_type == 'addmap':

            idx = index
            question = self.questions[index]
            #answer = self.answers[index]
            answer = self.answers[index]
            if answer > 13:
                answer = answer - 1
            actions = self.actions[index]
            actions_masks = self.action_masks[index]
            robot_positions = self.robot_positions[index]
            action_lengths = self.action_lengths[index]


            if self.split in ['val', 'test']:    #return the data directly
                return (idx, question, answer, actions, robot_positions,action_lengths)  

            if self.split == 'train':  # get image features from the dataset
                planner_images = self.action_images[index][0]
                planner_var = Variable(torch.FloatTensor(planner_images)
                                 .cuda())
                planner_var = planner_var.unsqueeze(0)
                
                planner_img_feats = self.cnn(planner_var).data.cpu().numpy().copy()  

                planner_maps = self.action_maps[index][0]
                planner_maps_feats = Variable(torch.FloatTensor(planner_maps)
                                 .cuda())
                

                #planner_maps_feats = planner_maps_var.view(-1,32*32*20)
                                            
                actions_in = actions.clone()
                actions_out = actions[1:].clone()
                actions_masks = actions_masks[:39].clone().gt(0)
                robot_positions = robot_positions.clone()
                     
            return (idx, question, answer, planner_img_feats,
                   planner_maps_feats,actions_in, actions_out,
                   robot_positions, actions_masks,action_lengths)


        elif self.input_type == 'ques,image':
            idx = index
            question = self.questions[index]
            answer = self.answers[index]
            if answer > 13:
                answer = answer - 1

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            actions_in = actions[action_length - self.num_frames:action_length]
            actions_out = actions[action_length - self.num_frames + 1:
                                  action_length + 1]

            images = self.action_images[index][0:2].astype(np.float32)

            return (idx, question, answer, images, actions_in, actions_out,
                    action_length)




    def __len__(self):
        # the dataset length is the number of questions for every input_type
        return len(self.questions)
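

# A minimal usage sketch, assuming an HDF5 file that provides the datasets the
# constructor above reads ('questions', 'answers', 'actions', 'robot_positions',
# 'images', 'heatmaps', 'action_lengths', 'mask'). The file name and vocab path
# are placeholders, and a CUDA device is required because the constructor moves
# its MultitaskCNN to the GPU.
import h5py

questions_h5 = h5py.File('train_questions.h5', 'r')         # placeholder path
train_set = EqaDataset(questions_h5, vocab='vocab.json',     # placeholder vocab
                       split='train', input_type='addmap')

# Smoke-test a single sample; for batching, the dataset can be wrapped in a
# torch.utils.data.DataLoader (num_workers=0, since __getitem__ uses the GPU).
(idx, question, answer, planner_img_feats, planner_maps_feats,
 actions_in, actions_out, robot_positions, actions_masks,
 action_lengths) = train_set[0]
print(question.size(), planner_img_feats.shape)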