Example #1
    def __init__(self,
                 config: Dict[str, Union[int, str]],
                 dialogs_jsonpath: str,
                 dense_annotations_jsonpath: Optional[str] = None,
                 overfit: bool = False,
                 in_memory: bool = False):
        super().__init__()
        self.config = config
        self.dialogs_reader = DialogsReader(dialogs_jsonpath)

        if "val" in self.split and dense_annotations_jsonpath is not None:
            self.annotations_reader = DenseAnnotationsReader(dense_annotations_jsonpath)
        else:
            self.annotations_reader = None

        self.vocabulary = Vocabulary(
            config["word_counts_json"], min_count=config["vocab_min_count"]
        )

        # initialize image features reader according to split
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath, in_memory)

        # keep a list of image_ids as primary keys to access data
        self.image_ids = list(self.dialogs_reader.dialogs.keys())
        if overfit:
            self.image_ids = self.image_ids[:5]
Example #2
    def __init__(self,
                 visdial_jsonpath: str,
                 config: Dict[str, Union[int, str]],
                 overfit: bool = False,
                 in_memory: bool = False):
        super().__init__()
        self.config = config
        self.json_reader = VisDialJsonReader(visdial_jsonpath)
        self.vocabulary = Vocabulary(
            config["word_counts_json"], min_count=config["vocab_min_count"]
        )

        # initialize image features reader according to split
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.json_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.json_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath, in_memory)

        # keep a list of image_ids as primary keys to access data
        self.image_ids = list(self.json_reader.dialogs.keys())
        if overfit:
            self.image_ids = self.image_ids[:5]
Example #3
    def __init__(
        self,
        config: Dict[str, Any],
        dialogs_jsonpath: str,
        dense_annotations_jsonpath: Optional[str] = None,
        overfit: bool = False,
        in_memory: bool = False,
        return_options: bool = True,
        add_boundary_toks: bool = False,
        sample_flag: bool = False,
    ):
        super().__init__()
        self.config = config
        self.return_options = return_options
        self.add_boundary_toks = add_boundary_toks
        self.dialogs_reader = DialogsReader(dialogs_jsonpath)

        if "val" in self.split and dense_annotations_jsonpath is not None:
            self.annotations_reader = DenseAnnotationsReader(
                dense_annotations_jsonpath)
        else:
            self.annotations_reader = None

        self.vocabulary = Vocabulary(config["word_counts_json"],
                                     min_count=config["vocab_min_count"])

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath,
                                                 in_memory)
        # Keep a list of image_ids as primary keys to access data.
        if not sample_flag:
            self.image_ids = list(self.dialogs_reader.dialogs.keys())

        # Load question-type (qt) information and tokenize the type strings.
        with open('data/qt_count.json', 'r') as f:
            self.qt_file = json.load(f)
        self.qt_list = [word_tokenize(qt) for qt in self.qt_file.keys()]

        if sample_flag:
            # Restrict image_ids to the densely annotated training sample.
            with open('data/visdial_1.0_train_dense_sample.json', 'r') as samplefile:
                sample = json.load(samplefile)
            self.image_ids = [entry['image_id'] for entry in sample]

        if overfit:
            self.image_ids = self.image_ids[:5]
Example #4
    def __init__(
        self,
        config: Dict[str, Any],
        dialogs_jsonpath: str,
        dense_annotations_jsonpath: Optional[str] = None,
        return_adjusted_gt_relevance: bool = False,
        overfit: bool = False,
        in_memory: bool = False,
        return_options: bool = True,
        add_boundary_toks: bool = False,
    ):
        super().__init__()
        self.config = config
        self.return_options = return_options
        self.return_adjusted_gt_relevance = return_adjusted_gt_relevance
        self.add_boundary_toks = add_boundary_toks
        self.dialogs_reader = DialogsReader(dialogs_jsonpath)

        if (("val" in self.split or "dense" in self.split)
                and dense_annotations_jsonpath is not None):
            self.annotations_reader = DenseAnnotationsReader(
                dense_annotations_jsonpath)
        else:
            self.annotations_reader = None

        if config['word_embedding_type'] == 'glove':
            self.vocabulary = GloveVocabulary(
                word_counts_path=config['word_counts_json'],
                min_count=config['vocab_min_count'],
                glove_weight_path=config['glove_weight_txt'],
                vec_size=config['glove_emb_dim'],
                glove_vec_num=config['glove_vec_num'])
        else:
            self.vocabulary = Vocabulary(
                word_counts_path=config["word_counts_json"],
                min_count=config["vocab_min_count"])

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if ("val" in self.dialogs_reader.split
                and "fake" not in self.dialogs_reader.split):
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath,
                                                 in_memory)

        # Keep a list of image_ids as primary keys to access data.
        self.image_ids = list(self.dialogs_reader.dialogs.keys())
        if overfit:
            self.image_ids = self.image_ids[:5]
Example #5
    def __init__(
        self,
        config: Dict[str, Any],
        dialogs_jsonpath: str,
        dense_annotations_jsonpath: Optional[str] = None,
        overfit: bool = False,
        in_memory: bool = False,
        num_workers: int = 1,
        return_options: bool = True,
        add_boundary_toks: bool = False,
    ):
        super().__init__()
        self.config = config
        self.return_options = return_options
        self.add_boundary_toks = add_boundary_toks
        self.dialogs_reader = DialogsReader(
            dialogs_jsonpath,
            num_examples=(5 if overfit else None),
            num_workers=num_workers)

        if "val" in self.split and dense_annotations_jsonpath is not None:
            self.annotations_reader = DenseAnnotationsReader(
                dense_annotations_jsonpath)
        else:
            self.annotations_reader = None

        self.vocabulary = Vocabulary(config["word_counts_json"],
                                     min_count=config["vocab_min_count"])

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath,
                                                 in_memory)

        # Keep a list of image_ids as primary keys to access data.
        self.image_ids = list(self.dialogs_reader.dialogs.keys())
        if overfit:
            self.image_ids = self.image_ids[:5]
Example #6
def test_encoder():

    img_feat = torch.randn(4, 36, 2048)
    seq_size = 20
    ques = torch.randperm(seq_size).view(1, seq_size)  # (batch,seq_len)
    ques = ques.unsqueeze(1).repeat(4, 10, 1)
    ques_len = torch.LongTensor([6, 5, 4, 3]).unsqueeze(1).repeat(1, 10)
    # shapes: ques (4, 10, 20), ques_len (4, 10), img_feat (4, 36, 2048)

    config = {
        "use_hist": False,
        "use_bert": False,
        "img_feature_size": 2048,
        "word_embedding_size": 300,
        "bert_embedding_size": 768,
        "lstm_hidden_size": 512,
        "lstm_num_layers": 2,
        "dropout": 0.5,
        "word_counts_json": '../data/visdial_1.0_word_counts_train.json',
        "concat_history": False,
        "vocab_min_count": 5
    }

    vocabulary = Vocabulary(config["word_counts_json"],
                            min_count=config["vocab_min_count"])

    net = MCANImgOnlyEncoder(config, vocabulary)
    opts = {'img_feat': img_feat, 'ques': ques, 'ques_len': ques_len}

    fused_embedding = net(opts)
    print(fused_embedding.size())
Example #7
# ================================================================================================
#   INPUT ARGUMENTS AND CONFIG
# ================================================================================================

args = parser.parse_args(args=[])

# keys: {"dataset", "model", "solver"}
with open(args.config_yml) as config_file:
    config = yaml.safe_load(config_file)

# ================================================================================================
#   SETUP DATASET
# ================================================================================================

vocabulary = Vocabulary(
    config["dataset"]["word_counts_json"], min_count=config["dataset"]["vocab_min_count"]
)

def loadGloveModel(gloveFile):
    print("Loading pretrained word vectors...")
    with open(gloveFile,'r') as f:
        model = {}
        for line in f:
            splitLine = line.split()
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

glove = loadGloveModel(args.pretrained_txt)
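
A minimal sketch of how such a GloVe dictionary is typically packed into an embedding matrix aligned with the Vocabulary indices, reusing the glove dict and vocabulary built above. PAD_INDEX and to_indices appear in the snippets below; UNK_INDEX and the 300-d vector size are assumptions.

import numpy as np
import torch

def build_glove_weight_matrix(glove, vocabulary, emb_dim=300):
    # Words missing from GloVe keep a small random vector.
    weights = np.random.normal(scale=0.1, size=(len(vocabulary), emb_dim))
    for word, vector in glove.items():
        index = vocabulary.to_indices([word])[0]
        # Skip words that map to UNK (UNK_INDEX is an assumed attribute),
        # so UNK is not overwritten by whichever GloVe word hit it last.
        if index != vocabulary.UNK_INDEX:
            weights[index] = vector
    weights[vocabulary.PAD_INDEX] = 0.0  # padding row stays zero
    return torch.tensor(weights).float()

# e.g. weight = build_glove_weight_matrix(glove, vocabulary)
#      embedding = torch.nn.Embedding.from_pretrained(weight, padding_idx=vocabulary.PAD_INDEX)
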
class VisDialDataset(Dataset):
    def __init__(self,
                 visdial_jsonpath: str,
                 config: Dict[str, Union[int, str]],
                 overfit: bool = False,
                 in_memory: bool = False):
        super().__init__()
        self.config = config
        self.json_reader = VisDialJsonReader(visdial_jsonpath)
        self.vocabulary = Vocabulary(
            config["word_counts_json"], min_count=config["vocab_min_count"]
        )

        # initialize image features reader according to split
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.json_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.json_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath, in_memory)

        # keep a list of image_ids as primary keys to access data
        self.image_ids = list(self.json_reader.dialogs.keys())
        if overfit:
            self.image_ids = self.image_ids[:5]

    @property
    def split(self):
        return self.json_reader.split

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, index):
        # get image_id, which serves as a primary key for current instance
        image_id = self.image_ids[index]

        # get image features for this image_id using hdf reader
        image_features = self.hdf_reader[image_id]
        image_features = torch.tensor(image_features)
        # normalize image features at zero-th dimension (since there's no batch dimension)
        if self.config["img_norm"]:
            image_features = normalize(image_features, dim=0, p=2)

        # retrieve instance for this image_id using json reader
        visdial_instance = self.json_reader[image_id]
        caption = visdial_instance["caption"]
        dialog = visdial_instance["dialog"]

        # convert word tokens of caption, question, answer and answer options to integers
        caption = self.vocabulary.to_indices(caption)
        for i in range(len(dialog)):
            dialog[i]["question"] = self.vocabulary.to_indices(dialog[i]["question"])
            dialog[i]["answer"] = self.vocabulary.to_indices(dialog[i]["answer"])

            for j in range(len(dialog[i]["answer_options"])):
                dialog[i]["answer_options"][j] = self.vocabulary.to_indices(
                    dialog[i]["answer_options"][j]
                )

        questions, question_lengths = self._pad_sequences(
            [dialog_round["question"] for dialog_round in dialog]
        )
        history, history_lengths = self._get_history(
            caption,
            [dialog_round["question"] for dialog_round in dialog],
            [dialog_round["answer"] for dialog_round in dialog]
        )

        answer_options = []
        answer_option_lengths = []
        for dialog_round in dialog:
            options, option_lengths = self._pad_sequences(dialog_round["answer_options"])
            answer_options.append(options)
            answer_option_lengths.append(option_lengths)
        answer_options = torch.stack(answer_options, 0)

        if "test" not in self.split:
            answer_indices = [dialog_round["gt_index"] for dialog_round in dialog]

        # collect everything as tensors for ``collate_fn`` of dataloader to work seamlessly;
        # questions, history, etc. are converted to LongTensors, for nn.Embedding input
        item = {}
        item["img_ids"] = torch.tensor(image_id).long()
        item["img_feat"] = image_features
        item["ques"] = questions.long()
        item["hist"] = history.long()
        item["opt"] = answer_options.long()
        item["ques_len"] = torch.tensor(question_lengths).long()
        item["hist_len"] = torch.tensor(history_lengths).long()
        item["opt_len"] = torch.tensor(answer_option_lengths).long()
        item["num_rounds"] = torch.tensor(visdial_instance["num_rounds"]).long()
        if "test" not in self.split:
            item["ans_ind"] = torch.tensor(answer_indices).long()
        return item

    def _pad_sequences(self, sequences: List[List[int]]):
        """Given tokenized sequences (either questions, answers or answer options, tokenized
        in ``__getitem__``), padding them to maximum specified sequence length. Return as a
        tensor of size ``(*, max_sequence_length)``.

        This method is only called in ``__getitem__``, chunked out separately for readability.

        Parameters
        ----------
        sequences : List[List[int]]
            List of tokenized sequences, each sequence is typically a List[int].

        Returns
        -------
        torch.Tensor, torch.Tensor
            Tensor of sequences padded to max length, and length of sequences before padding.
        """

        for i in range(len(sequences)):
            sequences[i] = sequences[i][: self.config["max_sequence_length"] - 1]
        sequence_lengths = [len(sequence) for sequence in sequences]

        # pad all sequences to max_sequence_length
        maxpadded_sequences = torch.full(
            (len(sequences), self.config["max_sequence_length"]),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_sequences = pad_sequence(
            [torch.tensor(sequence) for sequence in sequences],
            batch_first=True, padding_value=self.vocabulary.PAD_INDEX
        )
        maxpadded_sequences[:, :padded_sequences.size(1)] = padded_sequences
        return maxpadded_sequences, sequence_lengths

    def _get_history(self,
                     caption: List[int],
                     questions: List[List[int]],
                     answers: List[List[int]]):
        # allow double length of caption, equivalent to a concatenated QA pair
        caption = caption[: self.config["max_sequence_length"] * 2 - 1]

        for i in range(len(questions)):
            questions[i] = questions[i][: self.config["max_sequence_length"] - 1]

        for i in range(len(answers)):
            answers[i] = answers[i][: self.config["max_sequence_length"] - 1]

        # history for first round is caption, else concatenated QA pair of previous round
        history = []
        history.append(caption)
        for question, answer in zip(questions, answers):
            history.append(question + answer + [self.vocabulary.EOS_INDEX])
        # drop last entry from history (there's no eleventh question)
        history = history[:-1]
        max_history_length = self.config["max_sequence_length"] * 2

        if self.config["concat_history"]:
            # concatenated_history has similar structure as history, except it contains
            # concatenated QA pairs from previous rounds
            concatenated_history = []
            concatenated_history.append(caption)
            for i in range(1, len(history)):
                concatenated_history.append([])
                for j in range(i + 1):
                    concatenated_history[i].extend(history[j])

            max_history_length = self.config["max_sequence_length"] * 2 * len(history)
            history = concatenated_history

        history_lengths = [len(round_history) for round_history in history]
        maxpadded_history = torch.full(
            (len(history), max_history_length),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_history = pad_sequence(
            [torch.tensor(round_history) for round_history in history],
            batch_first=True, padding_value=self.vocabulary.PAD_INDEX
        )
        maxpadded_history[:, :padded_history.size(1)] = padded_history
        return maxpadded_history, history_lengths
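
The class above drops straight into a standard PyTorch DataLoader, since every value in the returned item dict is already a tensor. A minimal usage sketch; the file paths are placeholders and the config keys are the ones the class reads (word_counts_json, vocab_min_count, image_features_*_h5, img_norm, max_sequence_length, concat_history):

from torch.utils.data import DataLoader

config = {
    "word_counts_json": "data/visdial_1.0_word_counts_train.json",   # placeholder paths
    "vocab_min_count": 5,
    "image_features_train_h5": "data/features_faster_rcnn_x101_train.h5",
    "image_features_val_h5": "data/features_faster_rcnn_x101_val.h5",
    "image_features_test_h5": "data/features_faster_rcnn_x101_test.h5",
    "img_norm": True,
    "max_sequence_length": 20,
    "concat_history": False,
}

dataset = VisDialDataset("data/visdial_1.0_train.json", config, overfit=True)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

batch = next(iter(loader))
# Every key is a stacked tensor, e.g. batch["ques"] has shape
# (batch_size, 10 rounds, max_sequence_length).
print({k: tuple(v.shape) for k, v in batch.items()})
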
Example #9
class VisDialDataset(Dataset):
    """
    A full representation of VisDial v1.0 (train/val/test) dataset. According
    to the appropriate split, it returns a dictionary of question, image,
    history, ground truth answer, answer options, dense annotations, etc.
    """
    def __init__(
        self,
        config: Dict[str, Any],
        dialogs_jsonpath: str,
        dense_annotations_jsonpath: Optional[str] = None,
        overfit: bool = False,
        in_memory: bool = False,
        return_options: bool = True,
        add_boundary_toks: bool = False,
        sample_flag: bool = False,
    ):
        super().__init__()
        self.config = config
        self.return_options = return_options
        self.add_boundary_toks = add_boundary_toks
        self.dialogs_reader = DialogsReader(dialogs_jsonpath)

        if "val" in self.split and dense_annotations_jsonpath is not None:
            self.annotations_reader = DenseAnnotationsReader(
                dense_annotations_jsonpath)
        else:
            self.annotations_reader = None

        self.vocabulary = Vocabulary(config["word_counts_json"],
                                     min_count=config["vocab_min_count"])

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath,
                                                 in_memory)
        # Keep a list of image_ids as primary keys to access data.
        if not sample_flag:
            self.image_ids = list(self.dialogs_reader.dialogs.keys())

        # Load question-type (qt) information and tokenize the type strings.
        with open('data/qt_count.json', 'r') as f:
            self.qt_file = json.load(f)
        self.qt_list = [word_tokenize(qt) for qt in self.qt_file.keys()]

        if sample_flag:
            # Restrict image_ids to the densely annotated training sample.
            with open('data/visdial_1.0_train_dense_sample.json', 'r') as samplefile:
                sample = json.load(samplefile)
            self.image_ids = [entry['image_id'] for entry in sample]

        if overfit:
            self.image_ids = self.image_ids[:5]

    @property
    def split(self):
        return self.dialogs_reader.split

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, index):
        # Get image_id, which serves as a primary key for current instance.
        image_id = self.image_ids[index]
        # Get image features for this image_id using hdf reader.
        image_features = self.hdf_reader[image_id]
        image_features = torch.tensor(image_features)
        # Normalize image features at zero-th dimension (since there's no batch
        # dimension).
        if self.config["img_norm"]:
            image_features = normalize(image_features, dim=0, p=2)

        # Retrieve instance for this image_id using json reader.
        # print(image_id)
        visdial_instance = self.dialogs_reader[image_id]
        caption = visdial_instance["caption"]
        dialog = visdial_instance["dialog"]
        opt_idx = visdial_instance['opt_list']

        # Convert word tokens of caption, question, answer and answer options
        # to integers.
        qt = torch.full([10, 1], 54)  # 55 question types in total; 54 is the fallback
        qt_len = torch.full([10, 1], 2)  # default length 2; types are at most ~6 tokens
        caption = self.vocabulary.to_indices(caption)
        for i in range(len(dialog)):
            question_tmp = dialog[i]["question"]
            for k in range(len(self.qt_list)):
                if self.qt_list[k] == question_tmp[0:len(self.qt_list[k])]:
                    qt[i] = k
                    qt_len[i] = len(self.qt_list[k])
            dialog[i]["question"] = self.vocabulary.to_indices(
                dialog[i]["question"])
            if self.add_boundary_toks:
                dialog[i]["answer"] = self.vocabulary.to_indices(
                    [self.vocabulary.SOS_TOKEN] + dialog[i]["answer"] +
                    [self.vocabulary.EOS_TOKEN])
            else:
                dialog[i]["answer"] = self.vocabulary.to_indices(
                    dialog[i]["answer"])

            if self.return_options:
                for j in range(len(dialog[i]["answer_options"])):
                    if self.add_boundary_toks:
                        dialog[i]["answer_options"][
                            j] = self.vocabulary.to_indices(
                                [self.vocabulary.SOS_TOKEN] +
                                dialog[i]["answer_options"][j] +
                                [self.vocabulary.EOS_TOKEN])
                    else:
                        dialog[i]["answer_options"][
                            j] = self.vocabulary.to_indices(
                                dialog[i]["answer_options"][j])

        questions, question_lengths = self._pad_sequences(
            [dialog_round["question"] for dialog_round in dialog])
        history, history_lengths = self._get_history(
            caption,
            [dialog_round["question"] for dialog_round in dialog],
            [dialog_round["answer"] for dialog_round in dialog],
        )
        answers_in, answer_lengths = self._pad_sequences(
            [dialog_round["answer"][:-1] for dialog_round in dialog])
        answers_out, _ = self._pad_sequences(
            [dialog_round["answer"][1:] for dialog_round in dialog])
        answers, _ = self._pad_sequences(
            [dialog_round["answer"] for dialog_round in dialog])
        # Collect everything as tensors for ``collate_fn`` of dataloader to
        # work seamlessly; questions, history, etc. are converted to
        # LongTensors for nn.Embedding input.
        item = {}
        item['opt_idx'] = torch.tensor(opt_idx).long()
        item["img_ids"] = torch.tensor(image_id).long()
        item["img_feat"] = image_features
        item["ques"] = questions.long()
        item["qt"] = qt.long()
        item["qt_len"] = qt_len.long()
        item["ans"] = answers.long()
        item["hist"] = history.long()
        item["ans_in"] = answers_in.long()
        item["ans_out"] = answers_out.long()
        item["ques_len"] = torch.tensor(question_lengths).long()
        item["hist_len"] = torch.tensor(history_lengths).long()
        item["ans_len"] = torch.tensor(answer_lengths).long()
        item["num_rounds"] = torch.tensor(
            visdial_instance["num_rounds"]).long()

        if self.return_options:
            if self.add_boundary_toks:
                answer_options_in, answer_options_out, answer_options = [], [], []
                answer_option_lengths = []
                for dialog_round in dialog:
                    options, option_lengths = self._pad_sequences([
                        option[:-1]
                        for option in dialog_round["answer_options"]
                    ])
                    answer_options_in.append(options)

                    options, _ = self._pad_sequences([
                        option[1:] for option in dialog_round["answer_options"]
                    ])
                    answer_options_out.append(options)

                    options, _ = self._pad_sequences([
                        option[:] for option in dialog_round["answer_options"]
                    ])
                    answer_options.append(options)

                    answer_option_lengths.append(option_lengths)

                answer_options_in = torch.stack(answer_options_in, 0)
                answer_options_out = torch.stack(answer_options_out, 0)
                answer_options = torch.stack(answer_options, 0)

                item["opt"] = answer_options.long()
                item["opt_in"] = answer_options_in.long()
                item["opt_out"] = answer_options_out.long()
                item["opt_len"] = torch.tensor(answer_option_lengths).long()
            else:
                answer_options = []
                answer_option_lengths = []
                for dialog_round in dialog:
                    options, option_lengths = self._pad_sequences(
                        dialog_round["answer_options"])
                    answer_options.append(options)
                    answer_option_lengths.append(option_lengths)
                answer_options = torch.stack(answer_options, 0)

                item["opt"] = answer_options.long()
                item["opt_len"] = torch.tensor(answer_option_lengths).long()

            if "test" not in self.split:
                answer_indices = [
                    dialog_round["gt_index"] for dialog_round in dialog
                ]
                item["ans_ind"] = torch.tensor(answer_indices).long()

        # Gather dense annotations.
        if "val" in self.split:
            dense_annotations = self.annotations_reader[image_id]
            item["relevance"] = torch.tensor(
                dense_annotations["gt_relevance"]).float()
            item["round_id"] = torch.tensor(
                dense_annotations["round_id"]).long()

        return item

    def _pad_sequences(self, sequences: List[List[int]]):
        """Given tokenized sequences (either questions, answers or answer
        options, tokenized in ``__getitem__``), padding them to maximum
        specified sequence length. Return as a tensor of size
        ``(*, max_sequence_length)``.

        This method is only called in ``__getitem__``, chunked out separately
        for readability.

        Parameters
        ----------
        sequences : List[List[int]]
            List of tokenized sequences, each sequence is typically a
            List[int].

        Returns
        -------
        torch.Tensor, torch.Tensor
            Tensor of sequences padded to max length, and length of sequences
            before padding.
        """

        for i in range(len(sequences)):
            sequences[i] = sequences[i][:self.config["max_sequence_length"] -
                                        1]
        sequence_lengths = [len(sequence) for sequence in sequences]

        # Pad all sequences to max_sequence_length.
        maxpadded_sequences = torch.full(
            (len(sequences), self.config["max_sequence_length"]),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_sequences = pad_sequence(
            [torch.tensor(sequence) for sequence in sequences],
            batch_first=True,
            padding_value=self.vocabulary.PAD_INDEX,
        )
        maxpadded_sequences[:, :padded_sequences.size(1)] = padded_sequences
        return maxpadded_sequences, sequence_lengths

    def _get_history(
        self,
        caption: List[int],
        questions: List[List[int]],
        answers: List[List[int]],
    ):
        # Allow double length of caption, equivalent to a concatenated QA pair.
        caption = caption[:self.config["max_sequence_length"] * 2 - 1]

        for i in range(len(questions)):
            questions[i] = questions[i][:self.config["max_sequence_length"] -
                                        1]

        for i in range(len(answers)):
            answers[i] = answers[i][:self.config["max_sequence_length"] - 1]

        # History for first round is caption, else concatenated QA pair of
        # previous round.
        history = []
        history.append(caption)
        for question, answer in zip(questions, answers):
            history.append(question + answer + [self.vocabulary.EOS_INDEX])
        # Drop last entry from history (there's no eleventh question).
        history = history[:-1]
        max_history_length = self.config["max_sequence_length"] * 2

        if self.config.get("concat_history", False):
            # Concatenated_history has similar structure as history, except it
            # contains concatenated QA pairs from previous rounds.
            concatenated_history = []
            concatenated_history.append(caption)
            for i in range(1, len(history)):
                concatenated_history.append([])
                for j in range(i + 1):
                    concatenated_history[i].extend(history[j])

            max_history_length = (self.config["max_sequence_length"] * 2 *
                                  len(history))
            history = concatenated_history

        history_lengths = [len(round_history) for round_history in history]
        maxpadded_history = torch.full(
            (len(history), max_history_length),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_history = pad_sequence(
            [torch.tensor(round_history) for round_history in history],
            batch_first=True,
            padding_value=self.vocabulary.PAD_INDEX,
        )
        maxpadded_history[:, :padded_history.size(1)] = padded_history
        return maxpadded_history, history_lengths
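
To make the round-wise history construction in _get_history concrete, here is a small self-contained sketch of the same logic on toy token lists; EOS = 2 is purely illustrative and truncation/padding are omitted:

EOS = 2  # illustrative EOS index

def toy_history(caption, questions, answers, concat=False):
    # Round 1 sees only the caption; round t sees Q_{t-1} + A_{t-1} + EOS.
    history = [caption]
    for q, a in zip(questions, answers):
        history.append(q + a + [EOS])
    history = history[:-1]  # there is no 11th question, so drop the last pair
    if concat:
        # Each round accumulates everything said so far.
        history = [sum(history[:i + 1], []) for i in range(len(history))]
    return history

caption, questions, answers = [7, 8, 9], [[10, 11], [12, 13]], [[20], [21, 22]]
print(toy_history(caption, questions, answers))
# [[7, 8, 9], [10, 11, 20, 2]]
print(toy_history(caption, questions, answers, concat=True))
# [[7, 8, 9], [7, 8, 9, 10, 11, 20, 2]]
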
Example #10
    def __init__(
            self,
            config: Dict[str, Any],
            dialogs_jsonpath: str,
            dense_annotations_jsonpath: Optional[str] = None,
            augment_dense_annotations_jsonpath: Optional[str] = None,
            use_pretrained_emb: bool = False,
            qa_emb_file_path: Optional[str] = None,  # SA: todo remove this
            hist_emb_file_path: Optional[str] = None,  # SA: todo remove this
            use_caption: bool = True,
            num_hist_turns: int = 10,
            finetune: bool = False,
            overfit: bool = False,
            in_memory: bool = False,
            num_workers: int = 1,
            return_options: bool = True,
            add_boundary_toks: bool = False):

        super().__init__()
        self.config = config

        # SA: embedding reader
        self.use_pretrained_emb = use_pretrained_emb

        self.return_options = return_options
        self.add_boundary_toks = add_boundary_toks
        self.dialogs_reader = DialogsReader(
            dialogs_jsonpath,
            num_examples=(5 if overfit else None),
            num_workers=num_workers,
            use_pretrained_emb=self.use_pretrained_emb)

        self.finetune = finetune
        self.use_caption = use_caption

        # SA: embedding reader
        if self.use_pretrained_emb:
            assert qa_emb_file_path, "Did you forget to set emb file path?"
            # @todo: for now coming through argparse
            self.qa_emb_file_path = qa_emb_file_path
            self.hist_emb_file_path = hist_emb_file_path
            # hist_emb_file_path = config["hist_emb_file_path"]
            # TransformerEmbeddingsHdfReader(embedding_path, in_memory)
            # self.embedding_reader = TransformerEmbeddingsHdfReader(hist_emb_file_path,
            #                                                        in_memory)
            self.question_reader = QuesEmbeddingsHdfReader(
                qa_emb_file_path, in_memory)
            self.ans_reader = AnswerEmbeddingsHdfReader(
                qa_emb_file_path, in_memory)
            self.caption_reader = CaptionEmbeddingsHdfReader(
                qa_emb_file_path, in_memory)

            # SA: we don't pass in_memory here because history is too big
            # SA: todo this key would change
            self.hist_reader = HistEmbeddingsHdfReader(hist_emb_file_path,
                                                       hdfs_key="hist")

        # SA: if finetuning, use train/val; otherwise just the validation set
        if self.finetune or ("val" in self.split
                             and dense_annotations_jsonpath is not None):
            self.annotations_reader = DenseAnnotationsReader(
                dense_annotations_jsonpath)
        else:
            self.annotations_reader = None

        if augment_dense_annotations_jsonpath is not None:
            self.augmented_annotations_reader = AugmentedDenseAnnotationsReader(
                augment_dense_annotations_jsonpath)
            self.use_augment_dense = True
        else:
            self.use_augment_dense = False

        self.vocabulary = Vocabulary(config["word_counts_json"],
                                     min_count=config["vocab_min_count"])

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath,
                                                 in_memory)

        # Keep a list of image_ids as primary keys to access data.
        # For finetuning we use only those image ids that have dense annotations.
        if self.finetune:
            self.image_ids = list(self.annotations_reader.keys)
        else:
            self.image_ids = list(self.dialogs_reader.dialogs.keys())

        if overfit:
            self.image_ids = self.image_ids[:5]
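
A minimal construction sketch for the finetuning path of this variant, assuming this __init__ belongs to the VisDialDataset class shown in the next snippet. The paths are placeholders; the nested config["dataset"] layout mirrors how the later snippets pass the dataset config:

dataset = VisDialDataset(
    config["dataset"],
    dialogs_jsonpath="data/visdial_1.0_train.json",                               # placeholder
    dense_annotations_jsonpath="data/visdial_1.0_train_dense_annotations.json",   # placeholder
    finetune=True,    # image_ids then come from the dense annotations reader
    in_memory=False,
)
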
Example #11
class VisDialDataset(Dataset):
    """
    A full representation of VisDial v1.0 (train/val/test) dataset. According
    to the appropriate split, it returns a dictionary of question, image,
    history, ground truth answer, answer options, dense annotations, etc.
    """
    def __init__(
            self,
            config: Dict[str, Any],
            dialogs_jsonpath: str,
            dense_annotations_jsonpath: Optional[str] = None,
            augment_dense_annotations_jsonpath: Optional[str] = None,
            use_pretrained_emb: bool = False,
            qa_emb_file_path: Optional[str] = None,  # SA: todo remove this
            hist_emb_file_path: Optional[str] = None,  # SA: todo remove this
            use_caption: bool = True,
            num_hist_turns: int = 10,
            finetune: bool = False,
            overfit: bool = False,
            in_memory: bool = False,
            num_workers: int = 1,
            return_options: bool = True,
            add_boundary_toks: bool = False):

        super().__init__()
        self.config = config

        # SA: embedding reader
        self.use_pretrained_emb = use_pretrained_emb

        self.return_options = return_options
        self.add_boundary_toks = add_boundary_toks
        self.dialogs_reader = DialogsReader(
            dialogs_jsonpath,
            num_examples=(5 if overfit else None),
            num_workers=num_workers,
            use_pretrained_emb=self.use_pretrained_emb)

        self.finetune = finetune
        self.use_caption = use_caption

        # SA: embedding reader
        if self.use_pretrained_emb:
            assert qa_emb_file_path, "Did you forget to set emb file path?"
            # @todo: for now coming through argparse
            self.qa_emb_file_path = qa_emb_file_path
            self.hist_emb_file_path = hist_emb_file_path
            # hist_emb_file_path = config["hist_emb_file_path"]
            # TransformerEmbeddingsHdfReader(embedding_path, in_memory)
            # self.embedding_reader = TransformerEmbeddingsHdfReader(hist_emb_file_path,
            #                                                        in_memory)
            self.question_reader = QuesEmbeddingsHdfReader(
                qa_emb_file_path, in_memory)
            self.ans_reader = AnswerEmbeddingsHdfReader(
                qa_emb_file_path, in_memory)
            self.caption_reader = CaptionEmbeddingsHdfReader(
                qa_emb_file_path, in_memory)

            # SA: we don't pass in_memory here because history is too big
            # SA: todo this key would change
            self.hist_reader = HistEmbeddingsHdfReader(hist_emb_file_path,
                                                       hdfs_key="hist")

        # SA: if finetuning, use train/val; otherwise just the validation set
        if self.finetune or ("val" in self.split
                             and dense_annotations_jsonpath is not None):
            self.annotations_reader = DenseAnnotationsReader(
                dense_annotations_jsonpath)
        else:
            self.annotations_reader = None

        if augment_dense_annotations_jsonpath is not None:
            self.augmented_annotations_reader = AugmentedDenseAnnotationsReader(
                augment_dense_annotations_jsonpath)
            self.use_augment_dense = True
        else:
            self.use_augment_dense = False

        self.vocabulary = Vocabulary(config["word_counts_json"],
                                     min_count=config["vocab_min_count"])

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath,
                                                 in_memory)

        # Keep a list of image_ids as primary keys to access data.
        # For finetuning we use only those image ids that have dense annotations.
        if self.finetune:
            self.image_ids = list(self.annotations_reader.keys)
        else:
            self.image_ids = list(self.dialogs_reader.dialogs.keys())

        if overfit:
            self.image_ids = self.image_ids[:5]

    @property
    def split(self):
        return self.dialogs_reader.split

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, index):
        # start = time.time()
        # Get image_id, which serves as a primary key for current instance.
        image_id = self.image_ids[index]

        # Get image features for this image_id using hdf reader.
        image_features = self.hdf_reader[image_id]
        image_features = torch.tensor(image_features)
        # Normalize image features at zero-th dimension (since there's no batch
        # dimension).
        if self.config["img_norm"]:
            image_features = normalize(image_features, dim=0, p=2)

        # Retrieve instance for this image_id using json reader.
        visdial_instance = self.dialogs_reader[image_id]
        caption = visdial_instance["caption"]
        dialog = visdial_instance["dialog"]

        # SA: reading embeddings here
        if self.use_pretrained_emb:
            # We need indexes to actually call the readers here now.
            dialog_with_index = visdial_instance["dialog_with_index"]
            original_index = visdial_instance["original_index"]
            assert len(dialog) == len(
                dialog_with_index
            ), "These should be equal => just saving the index instead of string"

        # ideally should be in if-else clause
        ques_embeddings = []
        ans_embeddings = []
        opts_embeddings = []

        # Convert word tokens of caption, question, answer and answer options
        # to integers.
        caption = self.vocabulary.to_indices(caption)
        for i in range(len(dialog)):

            # SA: using embeddings here in the same loop
            if self.use_pretrained_emb:

                # SA: todo We dont need caption embeddings when we already have history???
                # caption_embedding = self.caption_reader[original_index]
                ques_embeddings.append(
                    self.question_reader[dialog_with_index[i]["question"]])
                ans_embeddings.append(
                    self.ans_reader[dialog_with_index[i]["answer"]])

            # SA: original code
            dialog[i]["question"] = self.vocabulary.to_indices(
                dialog[i]["question"])
            if self.add_boundary_toks:
                dialog[i]["answer"] = self.vocabulary.to_indices(
                    [self.vocabulary.SOS_TOKEN] + dialog[i]["answer"] +
                    [self.vocabulary.EOS_TOKEN])
            else:
                dialog[i]["answer"] = self.vocabulary.to_indices(
                    dialog[i]["answer"])

            # for disc decoder
            if self.return_options:
                # Ideally should be in if-else clause
                opts_round_embeddings = []

                for j in range(len(dialog[i]["answer_options"])):

                    # SA: trying option encodings here now
                    if self.use_pretrained_emb:
                        opts_round_embeddings.append(self.ans_reader[
                            dialog_with_index[i]["answer_options"][j]])

                    if self.add_boundary_toks:
                        dialog[i]["answer_options"][
                            j] = self.vocabulary.to_indices(
                                [self.vocabulary.SOS_TOKEN] +
                                dialog[i]["answer_options"][j] +
                                [self.vocabulary.EOS_TOKEN])
                    else:
                        dialog[i]["answer_options"][
                            j] = self.vocabulary.to_indices(
                                dialog[i]["answer_options"][j])

                # Ideally should be in if-else clause
                opts_embeddings.append(opts_round_embeddings)

        questions, question_lengths = self._pad_sequences(
            [dialog_round["question"] for dialog_round in dialog])
        history, history_lengths = self._get_history(
            caption,
            [dialog_round["question"] for dialog_round in dialog],
            [dialog_round["answer"] for dialog_round in dialog],
        )
        answers_in, answer_lengths = self._pad_sequences(
            [dialog_round["answer"][:-1] for dialog_round in dialog])
        answers_out, _ = self._pad_sequences(
            [dialog_round["answer"][1:] for dialog_round in dialog])

        # Collect everything as tensors for ``collate_fn`` of dataloader to
        # work seamlessly; questions, history, etc. are converted to
        # LongTensors for nn.Embedding input.
        item = {}
        item["img_ids"] = torch.tensor(image_id).long()
        item["img_feat"] = image_features
        item["ques"] = questions.long()
        item["hist"] = history.long()
        item["ans_in"] = answers_in.long(
        )  # SA: probably useful for training gen
        item["ans_out"] = answers_out.long(
        )  # SA: probably useful for training gen
        item["ques_len"] = torch.tensor(question_lengths).long()
        item["hist_len"] = torch.tensor(history_lengths).long()
        item["ans_len"] = torch.tensor(answer_lengths).long()
        item["num_rounds"] = torch.tensor(
            visdial_instance["num_rounds"]).long()

        ## SA: pretrained embedding here
        if self.use_pretrained_emb:

            # See https://github.com/pytorch/pytorch/issues/13918
            item["ques_embeddings"] = torch.tensor(
                np.array(ques_embeddings)).float()
            # now (10, 20, 768) ==> will be (bs, 10, 20, 768) (bert embeddings)
            item["opts_embeddings"] = torch.tensor(
                np.array(opts_embeddings)).float()
            # ans_embeddings = torch.tensor(np.array(ans_embeddings)).float()
            # caption_embedding = torch.tensor(np.array(caption_embedding)).float()

            # SA: todo proxy hist embeddings
            # hist_embeddings = self._get_history_embedding(caption_embedding, item["ques_embeddings"],
            #                                               ans_embeddings)
            item["hist_embeddings"] = self.hist_reader[image_id]
            # (10, 100, 20, 768) ==> will be (bs, 10, 100, 20, 768) (bert embeddings)

        if self.return_options:
            if self.add_boundary_toks:
                answer_options_in, answer_options_out = [], []
                answer_option_lengths = []
                for dialog_round in dialog:
                    options, option_lengths = self._pad_sequences([
                        option[:-1]
                        for option in dialog_round["answer_options"]
                    ])
                    answer_options_in.append(options)

                    options, _ = self._pad_sequences([
                        option[1:] for option in dialog_round["answer_options"]
                    ])
                    answer_options_out.append(options)

                    answer_option_lengths.append(option_lengths)
                answer_options_in = torch.stack(answer_options_in, 0)
                answer_options_out = torch.stack(answer_options_out, 0)

                item["opt_in"] = answer_options_in.long()
                item["opt_out"] = answer_options_out.long()
                item["opt_len"] = torch.tensor(answer_option_lengths).long()
            else:
                answer_options = []
                answer_option_lengths = []
                for dialog_round in dialog:
                    options, option_lengths = self._pad_sequences(
                        dialog_round["answer_options"])
                    answer_options.append(options)
                    answer_option_lengths.append(option_lengths)
                answer_options = torch.stack(answer_options, 0)

                # used by disc model
                ## options_length SA: used by model to select non-zero options
                item["opt"] = answer_options.long()
                item["opt_len"] = torch.tensor(answer_option_lengths).long()

            if "test" not in self.split:
                answer_indices = [
                    dialog_round["gt_index"] for dialog_round in dialog
                ]
                item["ans_ind"] = torch.tensor(
                    answer_indices).long()  # Used by evaluate for ndcg

        # Gather dense annotations.
        if self.finetune or ("val" in self.split):
            dense_annotations = self.annotations_reader[image_id]

            # SA: have to do this because of changed dic key in train
            if "val" in self.split:
                item["gt_relevance"] = torch.tensor(
                    dense_annotations["gt_relevance"]).float()
            elif "train" in self.split:
                item["gt_relevance"] = torch.tensor(
                    dense_annotations["relevance"]).float()

            item["round_id"] = torch.tensor(
                dense_annotations["round_id"]).long()
        # end = time.time()
        # time_taken = end - start
        # print('Time for loading item: ',time_taken)

        if self.use_augment_dense:
            augmented_dense_annotations = self.augmented_annotations_reader[
                image_id]

            item["augmented_gt_relevance"] = torch.tensor(
                augmented_dense_annotations["augmented_gt_relevance"]).float()

        return item

    def _pad_sequences(self, sequences: List[List[int]]):
        """Given tokenized sequences (either questions, answers or answer
        options, tokenized in ``__getitem__``), padding them to maximum
        specified sequence length. Return as a tensor of size
        ``(*, max_sequence_length)``.

        This method is only called in ``__getitem__``, chunked out separately
        for readability.

        Parameters
        ----------
        sequences : List[List[int]]
            List of tokenized sequences, each sequence is typically a
            List[int].

        Returns
        -------
        torch.Tensor, torch.Tensor
            Tensor of sequences padded to max length, and length of sequences
            before padding.
        """

        for i in range(len(sequences)):
            sequences[i] = sequences[i][:self.config["max_sequence_length"] -
                                        1]
        sequence_lengths = [len(sequence) for sequence in sequences]

        # Pad all sequences to max_sequence_length.
        maxpadded_sequences = torch.full(
            (len(sequences), self.config["max_sequence_length"]),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_sequences = pad_sequence(
            [torch.tensor(sequence) for sequence in sequences],
            batch_first=True,
            padding_value=self.vocabulary.PAD_INDEX,
        )
        maxpadded_sequences[:, :padded_sequences.size(1)] = padded_sequences
        return maxpadded_sequences, sequence_lengths

    def _get_history(
        self,
        caption: List[int],
        questions: List[List[int]],
        answers: List[List[int]],
    ):
        # Allow double length of caption, equivalent to a concatenated QA pair.
        caption = caption[:self.config["max_sequence_length"] * 2 - 1]

        for i in range(len(questions)):
            questions[i] = questions[i][:self.config["max_sequence_length"] -
                                        1]

        for i in range(len(answers)):
            answers[i] = answers[i][:self.config["max_sequence_length"] - 1]

        # History for first round is caption, else concatenated QA pair of
        # previous round.
        history = []
        ## SA: appending EOS after caption
        caption = caption + [self.vocabulary.EOS_INDEX]
        if self.use_caption:
            history.append(caption)
        else:
            history.append([self.vocabulary.EOS_INDEX])
            # print("Not using caption in history.")
        for question, answer in zip(questions, answers):
            history.append(question + answer + [self.vocabulary.EOS_INDEX])
        # Drop last entry from history (there's no eleventh question).
        history = history[:-1]
        max_history_length = self.config["max_sequence_length"] * 2

        if self.config.get("concat_history", False):
            # Concatenated_history has similar structure as history, except it
            # contains concatenated QA pairs from previous rounds.
            concatenated_history = []
            concatenated_history.append(caption)
            for i in range(1, len(history)):
                concatenated_history.append([])
                for j in range(i + 1):
                    concatenated_history[i].extend(history[j])

            max_history_length = (self.config["max_sequence_length"] * 2 *
                                  len(history))
            history = concatenated_history

        history_lengths = [len(round_history) for round_history in history]
        maxpadded_history = torch.full(
            (len(history), max_history_length),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_history = pad_sequence(
            [torch.tensor(round_history) for round_history in history],
            batch_first=True,
            padding_value=self.vocabulary.PAD_INDEX,
        )
        maxpadded_history[:, :padded_history.size(1)] = padded_history
        return maxpadded_history, history_lengths

    def _get_history_embedding(self, caption, questions, answers):
        """
        only for one dialogue here
        num_rounds = 10

        :param caption: (40, 768) ==> cross check
        :param questions: (10, 20, 768)
        :param answers: (10, 20, 768)
        :return:
        """

        concatenated_qa_history = torch.cat([questions, answers], 1)
        # print(concatenated_qa_history.size())
        # Drop last
        concatenated_qa_history = concatenated_qa_history[:-1]
        caption = caption.unsqueeze(0)
        # Concatenate along batch now
        concatenated_qa_history = torch.cat([caption, concatenated_qa_history],
                                            0)  # shape (10, 40, 768)

        if self.config.get("concat_history", False):
            max_history_length = (self.config["max_sequence_length"] * 2 *
                                  len(concatenated_qa_history))  # 400

            history_list = []
            num_rounds, _, rep_size = concatenated_qa_history.size(
            )  # (10, 40, 768)

            # hist_tensor = concatenated_qa_history.view(-1, rep_size)  # (10*40, 768)
            # hist_tensor = hist_tensor.unsqueeze(0).repeat(num_rounds,1,1)  # (10, 400, 768)
            # zero_array =

            for i in range(1, num_rounds + 1):

                pad_array = torch.zeros(
                    max_history_length -
                    self.config["max_sequence_length"] * 2 * (i), rep_size)

                hist_array = concatenated_qa_history[:i].view(-1, rep_size)

                hist_round = torch.cat([hist_array, pad_array], 0)
                history_list.append(hist_round)

            history = torch.stack(history_list, 0)
        else:
            history = concatenated_qa_history

        return history

    def _get_combined_ques_caption_or_hist(self, caption: List[int],
                                           questions: List[List[int]],
                                           answers: List[List[int]]):
        # Allow double length of caption, equivalent to a concatenated QA pair.
        caption = caption[:self.config["max_sequence_length"] * 2 - 1]

        for i in range(len(questions)):
            questions[i] = questions[i][:self.config["max_sequence_length"] -
                                        1]

        for i in range(len(answers)):
            answers[i] = answers[i][:self.config["max_sequence_length"] - 1]
Example #12
                                 args.val_dense_json,
                                 overfit=args.overfit,
                                 in_memory=args.in_memory)
else:
    val_dataset = VisDialDataset(config["dataset"],
                                 args.test_json,
                                 caption_jsonpath=args.captions_test_json,
                                 overfit=args.overfit,
                                 in_memory=args.in_memory)
val_dataloader = DataLoader(val_dataset,
                            batch_size=config["solver"]["batch_size"],
                            num_workers=args.cpu_workers)

with open(config["dataset"]["glovepath"], "r") as glove_file:
    glove = json.load(glove_file)
glovevocabulary = Vocabulary(config["dataset"]["word_counts_json"],
                             min_count=config["dataset"]["vocab_min_count"])
KAT = []  # [word, vocabulary index] pairs
for key in glove.keys():
    keylist = [key]
    token = glovevocabulary.to_indices(keylist)
    key_and_token = keylist + token
    KAT.append(key_and_token)
glove_token = {}  # vocabulary index -> GloVe vector
for item in KAT:
    glove_token[item[1]] = glove[item[0]]

glove_list = []
for i in range(len(glovevocabulary)):
    if i in glove_token.keys():
        glove_list.append(glove_token[i])
    else:
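The example is cut off at the else branch above. Purely as an illustration, here is a hedged sketch of how such an index-aligned list of GloVe vectors is typically completed and turned into an embedding layer; the zero-vector fallback for words without a GloVe entry and the variable names are assumptions, not the original code.

import torch
import torch.nn as nn

glove_dim = len(next(iter(glove.values())))   # dimensionality of the loaded GloVe vectors

glove_weights = []
for i in range(len(glovevocabulary)):
    if i in glove_token:
        glove_weights.append(glove_token[i])
    else:
        # Assumed fallback: vocabulary tokens without a GloVe entry get a zero vector.
        glove_weights.append([0.0] * glove_dim)

# Initialize a fine-tunable embedding layer from the aligned matrix.
glove_embedding = nn.Embedding.from_pretrained(
    torch.tensor(glove_weights, dtype=torch.float), freeze=False)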
def test_encoder():

    img_feat = torch.randn(4, 36, 2048)
    seq_size = 20
    ques = torch.randperm(seq_size).view(1, seq_size)  # (batch,seq_len)
    ques = ques.unsqueeze(1).repeat(4, 10, 1)
    # ques = ques.repeat(4,1)
    ques_len = torch.LongTensor([6, 5, 4, 3]).unsqueeze(1).repeat(1, 10)
    # print(ques_len.size())
    #
    # print(ques.size())  # (4,10,20)
    # print(img_feat.size())

    config = {
        "use_hist": False,
        "use_bert": False,
        "img_feature_size": 2048,
        "word_embedding_size": 300,
        "bert_embedding_size": 768,
        "lstm_hidden_size": 512,
        "lstm_num_layers": 2,
        "dropout": 0.5,
        "word_counts_json":
        '/scratch/shubham/visdial2019/data/visdial_1.0_word_counts_train.json',
        "concat_history": False,
        "vocab_min_count": 5
    }

    vocabulary = Vocabulary(config["word_counts_json"],
                            min_count=config["vocab_min_count"])

    # net = MCANConcatHistBeforeImgEncoder(config, vocabulary)
    # opts = {
    #     'img_feat': img_feat,
    #     'ques': ques,
    #     'ques_len': ques_len
    # }
    #
    # fused_embedding = net(opts)
    # print(fused_embedding.size())

    # With history, not concatenated

    print("With history concat false")
    config["use_hist"] = True
    net = MCANImgMCANVQAHistAttnEncoder(config, vocabulary)

    seq_size = 400
    hist = torch.randperm(seq_size).view(1, seq_size)  # (batch,seq_len)
    hist = hist.unsqueeze(1).repeat(4, 10, 1)
    hist_len = torch.LongTensor([10, 15, 15, 19]).unsqueeze(1).repeat(1, 10)
    # hist_len = torch.LongTensor([20,54,43,32]).unsqueeze(1).repeat(1,10)

    opts = {
        'img_feat': img_feat,
        'ques': ques,
        'ques_len': ques_len,
        'hist': hist,
        'hist_len': hist_len
    }

    fused_embedding = net(opts)
    print(fused_embedding.size())
Example #14
0
class VisDialDataset(Dataset):
    """
    A full representation of the VisDial v1.0 (train/val/test) dataset. According to the
    appropriate split, it returns a dictionary of question, image, history, ground-truth
    answer, answer options, dense annotations, etc.
    """
    def __init__(self,
                 config: Dict[str, Any],
                 dialogs_jsonpath: str,
                 caption_jsonpath: str,
                 dense_annotations_jsonpath: Optional[str] = None,
                 overfit: bool = False,
                 in_memory: bool = False):
        super().__init__()
        self.config = config
        self.dialogs_reader = DialogsReader(dialogs_jsonpath)

        if "val" in self.split and dense_annotations_jsonpath is not None:
            self.annotations_reader = DenseAnnotationsReader(
                dense_annotations_jsonpath)
        else:
            self.annotations_reader = None

        self.vocabulary = Vocabulary(config["word_counts_json"],
                                     min_count=config["vocab_min_count"])

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath,
                                                 in_memory)

        # Keep a list of image_ids as primary keys to access data.
        self.image_ids = list(self.dialogs_reader.dialogs.keys())
        if overfit:
            self.image_ids = self.image_ids[:5]

        self.captions_reader = CaptionReader(caption_jsonpath)

    @property
    def split(self):
        return self.dialogs_reader.split

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, index):
        # Get image_id, which serves as a primary key for current instance.
        image_id = self.image_ids[index]

        # Get image features for this image_id using hdf reader.
        image_features, image_relation = self.hdf_reader[image_id]
        image_features = torch.tensor(image_features)
        image_relation = torch.tensor(image_relation)  # relation features
        # Normalize image features at zero-th dimension (since there's no batch dimension).
        if self.config["img_norm"]:
            image_features = normalize(image_features, dim=0, p=2)

        # Retrieve instance for this image_id using json reader.
        visdial_instance = self.dialogs_reader[image_id]
        caption = visdial_instance["caption"]
        dialog = visdial_instance["dialog"]

        # Convert word tokens of caption, question, answer and answer options to integers.
        caption = self.vocabulary.to_indices(caption)
        for i in range(len(dialog)):
            dialog[i]["question"] = self.vocabulary.to_indices(
                dialog[i]["question"])
            dialog[i]["answer"] = self.vocabulary.to_indices(
                dialog[i]["answer"])

            for j in range(len(dialog[i]["answer_options"])):
                dialog[i]["answer_options"][j] = self.vocabulary.to_indices(
                    dialog[i]["answer_options"][j])

        questions, question_lengths = self._pad_sequences(
            [dialog_round["question"] for dialog_round in dialog])
        history, history_lengths = self._get_history(
            caption, [dialog_round["question"] for dialog_round in dialog],
            [dialog_round["answer"] for dialog_round in dialog])

        answer_options = []
        answer_option_lengths = []
        for dialog_round in dialog:
            options, option_lengths = self._pad_sequences(
                dialog_round["answer_options"])
            answer_options.append(options)
            answer_option_lengths.append(option_lengths)
        answer_options = torch.stack(answer_options, 0)

        if "test" not in self.split:
            answer_indices = [
                dialog_round["gt_index"] for dialog_round in dialog
            ]

        captions_dic = self.captions_reader[image_id]
        captions_mul = captions_dic["captions"]
        captions_new = []
        for i in range(len(captions_mul)):
            captions_each = self.vocabulary.to_indices(captions_mul[i])
            captions_new.append(captions_each)

        captions_new, captions_len = self._pad_captions(captions_new)

        # Collect everything as tensors for ``collate_fn`` of dataloader to work seamlessly;
        # questions, history, etc. are converted to LongTensors, for nn.Embedding input.
        item = {}
        item["img_ids"] = torch.tensor(image_id).long()
        item["img_feat"] = image_features
        item["relations"] = image_relation
        item["ques"] = questions.long()
        item["hist"] = history.long()
        item["opt"] = answer_options.long()
        item["ques_len"] = torch.tensor(question_lengths).long()
        item["hist_len"] = torch.tensor(history_lengths).long()
        item["opt_len"] = torch.tensor(answer_option_lengths).long()
        item["num_rounds"] = torch.tensor(
            visdial_instance["num_rounds"]).long()
        if "test" not in self.split:
            item["ans_ind"] = torch.tensor(answer_indices).long()

        # Gather dense annotations.
        if "val" in self.split:
            dense_annotations = self.annotations_reader[image_id]
            item["gt_relevance"] = torch.tensor(
                dense_annotations["gt_relevance"]).float()
            item["round_id"] = torch.tensor(
                dense_annotations["round_id"]).long()

        # Caption-related parameters.
        item["captions_len"] = torch.tensor(captions_len).long()
        item["captions"] = captions_new.long()

        return item

    def _pad_sequences(self, sequences: List[List[int]]):
        """Given tokenized sequences (either questions, answers or answer options, tokenized
        in ``__getitem__``), padding them to maximum specified sequence length. Return as a
        tensor of size ``(*, max_sequence_length)``.

        This method is only called in ``__getitem__``, chunked out separately for readability.

        Parameters
        ----------
        sequences : List[List[int]]
            List of tokenized sequences, each sequence is typically a List[int].

        Returns
        -------
        torch.Tensor, torch.Tensor
            Tensor of sequences padded to max length, and length of sequences before padding.
        """

        for i in range(len(sequences)):
            sequences[i] = sequences[i][:self.config["max_sequence_length"] -
                                        1]
        sequence_lengths = [len(sequence) for sequence in sequences]

        # Pad all sequences to max_sequence_length.
        maxpadded_sequences = torch.full(
            (len(sequences), self.config["max_sequence_length"]),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_sequences = pad_sequence(
            [torch.tensor(sequence) for sequence in sequences],
            batch_first=True,
            padding_value=self.vocabulary.PAD_INDEX)
        maxpadded_sequences[:, :padded_sequences.size(1)] = padded_sequences
        return maxpadded_sequences, sequence_lengths

    def _get_history(self, caption: List[int], questions: List[List[int]],
                     answers: List[List[int]]):
        # Allow double length of caption, equivalent to a concatenated QA pair.
        caption = caption[:self.config["max_sequence_length"] * 2 - 1]

        for i in range(len(questions)):
            questions[i] = questions[i][:self.config["max_sequence_length"] -
                                        1]

        for i in range(len(answers)):
            answers[i] = answers[i][:self.config["max_sequence_length"] - 1]

        # History for first round is caption, else concatenated QA pair of previous round.
        history = []
        history.append(caption)
        for question, answer in zip(questions, answers):
            history.append(question + answer + [self.vocabulary.EOS_INDEX])
        # Drop last entry from history (there's no eleventh question).
        history = history[:-1]
        max_history_length = self.config["max_sequence_length"] * 2

        if self.config.get("concat_history", False):
            # concatenated_history has a similar structure to history, except that each
            # entry accumulates the caption and the QA pairs of all previous rounds.
            concatenated_history = []
            concatenated_history.append(caption)
            for i in range(1, len(history)):
                concatenated_history.append([])
                for j in range(i + 1):
                    concatenated_history[i].extend(history[j])

            max_history_length = self.config["max_sequence_length"] * 2 * len(
                history)
            history = concatenated_history

        history_lengths = [len(round_history) for round_history in history]
        maxpadded_history = torch.full(
            (len(history), max_history_length),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_history = pad_sequence(
            [torch.tensor(round_history) for round_history in history],
            batch_first=True,
            padding_value=self.vocabulary.PAD_INDEX)
        maxpadded_history[:, :padded_history.size(1)] = padded_history
        return maxpadded_history, history_lengths

    def _pad_captions(self, sequences: List[List[int]]):
        """Truncate or pad each caption to ``caption_maxlen_each`` tokens (padding with
        index 0), and drop or cyclically repeat captions so that exactly
        ``caption_round_num`` of them are returned as a tensor of size
        ``(caption_round_num, caption_maxlen_each)``, along with the caption lengths
        before padding.
        """

        LEN_S = len(sequences)
        if LEN_S > self.config["caption_round_num"]:
            for i in range(LEN_S - self.config["caption_round_num"]):
                sequences.pop(-1)
                #caption_len.pop(-1)

        caption_len = []
        for i in range(len(sequences)):
            LEN = len(sequences[i])
            if LEN < self.config["caption_maxlen_each"]:
                caption_len.append(len(sequences[i]))
                for j in range(self.config["caption_maxlen_each"] - LEN):
                    sequences[i].append(0)
            elif LEN > self.config["caption_maxlen_each"]:
                for j in range(LEN - self.config["caption_maxlen_each"]):
                    sequences[i].pop(-1)
                caption_len.append(len(sequences[i]))
            else:
                caption_len.append(len(sequences[i]))

        LEN_S = len(sequences)
        if LEN_S < self.config["caption_round_num"]:
            j = 0
            #LENS = len(sequences)
            for i in range(self.config["caption_round_num"] - LEN_S):
                if j >= LEN_S - 1:
                    j = 0
                else:
                    j += 1
                sequences.append(sequences[j])
                length_new = caption_len[j]
                caption_len.append(length_new)

        sequences = torch.tensor(sequences).view(
            self.config["caption_round_num"],
            self.config["caption_maxlen_each"])

        return sequences, caption_len
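A hedged usage sketch for this caption-augmented dataset class: the config keys below mirror the ones read in __init__ and __getitem__ above, but every path and value is a placeholder, not the project's actual configuration.

from torch.utils.data import DataLoader

# Placeholder config; the keys follow those accessed by VisDialDataset above.
config = {
    "image_features_train_h5": "data/features_train.h5",   # hypothetical path
    "image_features_val_h5": "data/features_val.h5",       # hypothetical path
    "image_features_test_h5": "data/features_test.h5",     # hypothetical path
    "word_counts_json": "data/visdial_1.0_word_counts_train.json",
    "vocab_min_count": 5,
    "img_norm": True,
    "max_sequence_length": 20,
    "concat_history": False,
    "caption_round_num": 5,       # assumed value
    "caption_maxlen_each": 20,    # assumed value
}

dataset = VisDialDataset(
    config,
    dialogs_jsonpath="data/visdial_1.0_val.json",                              # hypothetical path
    caption_jsonpath="data/captions_val.json",                                 # hypothetical path
    dense_annotations_jsonpath="data/visdial_1.0_val_dense_annotations.json",  # hypothetical path
    overfit=True,     # keeps only 5 images, handy for a quick smoke test
    in_memory=False,
)
loader = DataLoader(dataset, batch_size=2, num_workers=0)
batch = next(iter(loader))
print(batch["img_feat"].shape, batch["ques"].shape,
      batch["hist"].shape, batch["captions"].shape)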
Example #15
0
    def __init__(
        self,
        config: Dict[str, Any],
        dialogs_jsonpath: str,
        dense_annotations_jsonpath: str,
        fold_split: Optional[str] = 'train',
        fold: int = -1,
        return_adjusted_gt_relevance: bool = False,
        overfit: bool = False,
        in_memory: bool = False,
        return_options: bool = True,
        add_boundary_toks: bool = False,
    ):
        super().__init__()
        self.config = config
        self.return_options = return_options
        self.return_adjusted_gt_relevance = return_adjusted_gt_relevance
        self.add_boundary_toks = add_boundary_toks
        self.dialogs_reader = DialogsReader(dialogs_jsonpath)
        self.annotations_reader = DenseAnnotationsReader(
            dense_annotations_jsonpath
        )
        self.fold_split = fold_split
        self.fold = fold
        self.n_folds = config['n_folds']

        if config['word_embedding_type'] == 'glove':
            self.vocabulary = GloveVocabulary(
                word_counts_path=config['word_counts_json'],
                min_count=config['vocab_min_count'],
                glove_weight_path=config['glove_weight_txt'],
                vec_size=config['glove_emb_dim'],
                glove_vec_num=config['glove_vec_num']
            )
        else:
            self.vocabulary = Vocabulary(
                word_counts_path=config["word_counts_json"],
                min_count=config["vocab_min_count"]
            )

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if ("val" in self.dialogs_reader.split 
            and "fake" not in self.dialogs_reader.split):
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(
            image_features_hdfpath, in_memory
        )

        # Keep a list of image_ids as primary keys to access data.
        all_image_ids = np.array(list(self.dialogs_reader.dialogs.keys()))
        if fold < 0 or fold_split == 'test':
            self.image_ids = all_image_ids.tolist()
        else:
            kf = KFold(n_splits=self.n_folds, shuffle=False, random_state=606)
            train_index, val_index = list(kf.split(all_image_ids))[fold]
            if fold_split == 'train':
                self.image_ids = all_image_ids[train_index].tolist()
            elif fold_split == 'val':
                self.image_ids = all_image_ids[val_index].tolist()
            else:
                raise NotImplementedError()
        
        if overfit:
            self.image_ids = self.image_ids[:5]
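The fold selection above is a deterministic KFold over the image ids. A small standalone sketch of the same splitting logic with dummy ids follows; note that with shuffle=False the random_state argument has no effect (recent scikit-learn versions reject that combination), so it is omitted here.

import numpy as np
from sklearn.model_selection import KFold

all_image_ids = np.arange(100, 120)   # dummy ids standing in for dialogs_reader keys
n_folds, fold, fold_split = 5, 2, "val"

kf = KFold(n_splits=n_folds, shuffle=False)
train_index, val_index = list(kf.split(all_image_ids))[fold]
image_ids = (all_image_ids[train_index] if fold_split == "train"
             else all_image_ids[val_index]).tolist()
print(image_ids)   # fold 2 of 5 holds out ids 108..111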
Example #16
0
class KFoldVisDialDataset(Dataset):
    """
    A full representation of the VisDial v1.0 (train/val/test) dataset.
    According to the appropriate split, it returns a dictionary of question,
    image, history, ground-truth answer, answer options, dense annotations, etc.
    """

    def __init__(
        self,
        config: Dict[str, Any],
        dialogs_jsonpath: str,
        dense_annotations_jsonpath: str,
        fold_split: Optional[str] = 'train',
        fold: int = -1,
        return_adjusted_gt_relevance: bool = False,
        overfit: bool = False,
        in_memory: bool = False,
        return_options: bool = True,
        add_boundary_toks: bool = False,
    ):
        super().__init__()
        self.config = config
        self.return_options = return_options
        self.return_adjusted_gt_relevance = return_adjusted_gt_relevance
        self.add_boundary_toks = add_boundary_toks
        self.dialogs_reader = DialogsReader(dialogs_jsonpath)
        self.annotations_reader = DenseAnnotationsReader(
            dense_annotations_jsonpath
        )
        self.fold_split = fold_split
        self.fold = fold
        self.n_folds = config['n_folds']

        if config['word_embedding_type'] == 'glove':
            self.vocabulary = GloveVocabulary(
                word_counts_path=config['word_counts_json'],
                min_count=config['vocab_min_count'],
                glove_weight_path=config['glove_weight_txt'],
                vec_size=config['glove_emb_dim'],
                glove_vec_num=config['glove_vec_num']
            )
        else:
            self.vocabulary = Vocabulary(
                word_counts_path=config["word_counts_json"],
                min_count=config["vocab_min_count"]
            )

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if ("val" in self.dialogs_reader.split 
            and "fake" not in self.dialogs_reader.split):
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(
            image_features_hdfpath, in_memory
        )

        # Keep a list of image_ids as primary keys to access data.
        all_image_ids = np.array(list(self.dialogs_reader.dialogs.keys()))
        if fold < 0 or fold_split == 'test':
            self.image_ids = all_image_ids.tolist()
        else:
            kf = KFold(n_splits=self.n_folds, shuffle=False, random_state=606)
            train_index, val_index = list(kf.split(all_image_ids))[fold]
            if fold_split == 'train':
                self.image_ids = all_image_ids[train_index].tolist()
            elif fold_split == 'val':
                self.image_ids = all_image_ids[val_index].tolist()
            else:
                raise NotImplementedError()
        
        if overfit:
            self.image_ids = self.image_ids[:5]

    @property
    def split(self):
        return self.dialogs_reader.split

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, index):
        # Get image_id, which serves as a primary key for current instance.
        image_id = self.image_ids[index]

        # Get image features for this image_id using hdf reader.
        image_features = self.hdf_reader[image_id]
        image_features = torch.tensor(image_features)
        # Normalize image features at zero-th dimension (since there's no batch
        # dimension).
        if self.config["img_norm"]:
            image_features = normalize(image_features, dim=0, p=2)

        # Retrieve instance for this image_id using json reader.
        visdial_instance = self.dialogs_reader[image_id]
        caption = visdial_instance["caption"]
        dialog = visdial_instance["dialog"]

        # Convert word tokens of caption, question, answer and answer options
        # to integers.
        caption = self.vocabulary.to_indices(caption)
        for i in range(len(dialog)):
            dialog[i]["question"] = self.vocabulary.to_indices(
                dialog[i]["question"]
            )
            if self.add_boundary_toks:
                dialog[i]["answer"] = self.vocabulary.to_indices(
                    [self.vocabulary.SOS_TOKEN]
                    + dialog[i]["answer"]
                    + [self.vocabulary.EOS_TOKEN]
                )
            else:
                dialog[i]["answer"] = self.vocabulary.to_indices(
                    dialog[i]["answer"]
                )

            if self.return_options:
                for j in range(len(dialog[i]["answer_options"])):
                    if self.add_boundary_toks:
                        dialog[i]["answer_options"][
                            j
                        ] = self.vocabulary.to_indices(
                            [self.vocabulary.SOS_TOKEN]
                            + dialog[i]["answer_options"][j]
                            + [self.vocabulary.EOS_TOKEN]
                        )
                    else:
                        dialog[i]["answer_options"][
                            j
                        ] = self.vocabulary.to_indices(
                            dialog[i]["answer_options"][j]
                        )

        questions, question_lengths = self._pad_sequences(
            [dialog_round["question"] for dialog_round in dialog]
        )
        history, history_lengths = self._get_history(
            caption,
            [dialog_round["question"] for dialog_round in dialog],
            [dialog_round["answer"] for dialog_round in dialog],
        )
        answers_in, answer_lengths = self._pad_sequences(
            [dialog_round["answer"][:-1] for dialog_round in dialog]
        )
        answers_out, _ = self._pad_sequences(
            [dialog_round["answer"][1:] for dialog_round in dialog]
        )

        # Collect everything as tensors for ``collate_fn`` of dataloader to
        # work seamlessly; questions, history, etc. are converted to
        # LongTensors, for nn.Embedding input.
        item = {}
        item["img_ids"] = torch.tensor(image_id).long()
        item["img_feat"] = image_features
        item["ques"] = questions.long()
        item["hist"] = history.long()
        item["ans_in"] = answers_in.long()
        item["ans_out"] = answers_out.long()
        item["ques_len"] = torch.tensor(question_lengths).long()
        item["hist_len"] = torch.tensor(history_lengths).long()
        item["ans_len"] = torch.tensor(answer_lengths).long()
        item["num_rounds"] = torch.tensor(
            visdial_instance["num_rounds"]
        ).long()

        if self.return_options:
            if self.add_boundary_toks:
                answer_options_in, answer_options_out = [], []
                answer_option_lengths = []
                for dialog_round in dialog:
                    options, option_lengths = self._pad_sequences(
                        [
                            option[:-1]
                            for option in dialog_round["answer_options"]
                        ]
                    )
                    answer_options_in.append(options)

                    options, _ = self._pad_sequences(
                        [
                            option[1:]
                            for option in dialog_round["answer_options"]
                        ]
                    )
                    answer_options_out.append(options)

                    answer_option_lengths.append(option_lengths)
                answer_options_in = torch.stack(answer_options_in, 0)
                answer_options_out = torch.stack(answer_options_out, 0)

                item["opt_in"] = answer_options_in.long()
                item["opt_out"] = answer_options_out.long()
                item["opt_len"] = torch.tensor(answer_option_lengths).long()
            else:
                answer_options = []
                answer_option_lengths = []
                for dialog_round in dialog:
                    options, option_lengths = self._pad_sequences(
                        dialog_round["answer_options"]
                    )
                    answer_options.append(options)
                    answer_option_lengths.append(option_lengths)
                answer_options = torch.stack(answer_options, 0)

                item["opt"] = answer_options.long()
                item["opt_len"] = torch.tensor(answer_option_lengths).long()

            if "test" not in self.split:
                answer_indices = [
                    dialog_round["gt_index"] for dialog_round in dialog
                ]
                item["ans_ind"] = torch.tensor(answer_indices).long()

        # Gather dense annotations.
        if "val" in self.split or "dense" in self.split:
            dense_annotations = self.annotations_reader[image_id]
            item["gt_relevance"] = torch.tensor(
                dense_annotations["gt_relevance"]
            ).float()
            item["round_id"] = torch.tensor(
                dense_annotations["round_id"]
            ).long()
            if self.return_adjusted_gt_relevance:
                item['adjusted_gt_relevance'] = torch.tensor(
                    dense_annotations['adjusted_gt_relevance']
                ).float()

        return item

    def _pad_sequences(self, sequences: List[List[int]]):
        """Given tokenized sequences (either questions, answers or answer
        options, tokenized in ``__getitem__``), padding them to maximum
        specified sequence length. Return as a tensor of size
        ``(*, max_sequence_length)``.

        This method is only called in ``__getitem__``, chunked out separately
        for readability.

        Parameters
        ----------
        sequences : List[List[int]]
            List of tokenized sequences, each sequence is typically a
            List[int].

        Returns
        -------
        torch.Tensor, torch.Tensor
            Tensor of sequences padded to max length, and length of sequences
            before padding.
        """

        for i in range(len(sequences)):
            sequences[i] = sequences[i][
                : self.config["max_sequence_length"] - 1
            ]
        sequence_lengths = [len(sequence) for sequence in sequences]

        # Pad all sequences to max_sequence_length.
        maxpadded_sequences = torch.full(
            (len(sequences), self.config["max_sequence_length"]),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_sequences = pad_sequence(
            [torch.tensor(sequence) for sequence in sequences],
            batch_first=True,
            padding_value=self.vocabulary.PAD_INDEX,
        )
        maxpadded_sequences[:, : padded_sequences.size(1)] = padded_sequences
        return maxpadded_sequences, sequence_lengths

    def _get_history(
        self,
        caption: List[int],
        questions: List[List[int]],
        answers: List[List[int]],
    ):
        # Allow double length of caption, equivalent to a concatenated QA pair.
        caption = caption[: self.config["max_sequence_length"] * 2 - 1]

        for i in range(len(questions)):
            questions[i] = questions[i][
                : self.config["max_sequence_length"] - 1
            ]

        for i in range(len(answers)):
            answers[i] = answers[i][: self.config["max_sequence_length"] - 1]

        # History for first round is caption, else concatenated QA pair of
        # previous round.
        history = []
        history.append(caption)
        for question, answer in zip(questions, answers):
            history.append(question + answer + [self.vocabulary.EOS_INDEX])
        # Drop last entry from history (there's no eleventh question).
        history = history[:-1]
        max_history_length = self.config["max_sequence_length"] * 2

        if self.config.get("concat_history", False):
            # concatenated_history has a similar structure to history, except
            # that each entry accumulates the caption and the QA pairs of all
            # previous rounds.
            concatenated_history = []
            concatenated_history.append(caption)
            for i in range(1, len(history)):
                concatenated_history.append([])
                for j in range(i + 1):
                    concatenated_history[i].extend(history[j])

            max_history_length = (
                self.config["max_sequence_length"] * 2 * len(history)
            )
            history = concatenated_history

        history_lengths = [len(round_history) for round_history in history]
        maxpadded_history = torch.full(
            (len(history), max_history_length),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_history = pad_sequence(
            [torch.tensor(round_history) for round_history in history],
            batch_first=True,
            padding_value=self.vocabulary.PAD_INDEX,
        )
        maxpadded_history[:, : padded_history.size(1)] = padded_history
        return maxpadded_history, history_lengths
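When add_boundary_toks is set, each answer (and answer option) is wrapped in SOS/EOS tokens and then split into a decoder input (all tokens but the last) and a decoder target (all tokens but the first), each padded to max_sequence_length exactly as in _pad_sequences. A minimal standalone sketch of that shift-and-pad step, with dummy token ids and assumed special-token indices:

import torch
from torch.nn.utils.rnn import pad_sequence

PAD_INDEX, SOS_INDEX, EOS_INDEX = 0, 1, 2   # assumed special-token ids
max_sequence_length = 20

# Two dummy tokenized answers, already wrapped in boundary tokens.
answers = [[SOS_INDEX, 7, 8, 9, EOS_INDEX],
           [SOS_INDEX, 4, 5, EOS_INDEX]]

def pad_to_max(seqs):
    # Truncate, record lengths, pad to the longest sequence, then widen to max length.
    seqs = [s[:max_sequence_length - 1] for s in seqs]
    lengths = [len(s) for s in seqs]
    out = torch.full((len(seqs), max_sequence_length), PAD_INDEX)
    padded = pad_sequence([torch.tensor(s) for s in seqs],
                          batch_first=True, padding_value=PAD_INDEX)
    out[:, :padded.size(1)] = padded
    return out, lengths

ans_in, ans_len = pad_to_max([a[:-1] for a in answers])   # SOS ... last word
ans_out, _ = pad_to_max([a[1:] for a in answers])         # first word ... EOS
print(ans_in[0][:5].tolist(), ans_out[0][:5].tolist(), ans_len)
# [1, 7, 8, 9, 0] [7, 8, 9, 2, 0] [4, 3]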