Example #1
    def __init__(self,
                 config: Dict[str, Union[int, str]],
                 dialogs_jsonpath: str,
                 dense_annotations_jsonpath: Optional[str] = None,
                 overfit: bool = False,
                 in_memory: bool = False):
        super().__init__()
        self.config = config
        self.dialogs_reader = DialogsReader(dialogs_jsonpath)

        if "val" in self.split and dense_annotations_jsonpath is not None:
            self.annotations_reader = DenseAnnotationsReader(dense_annotations_jsonpath)
        else:
            self.annotations_reader = None

        self.vocabulary = Vocabulary(
            config["word_counts_json"], min_count=config["vocab_min_count"]
        )

        # initialize image features reader according to split
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath, in_memory)

        # keep a list of image_ids as primary keys to access data
        self.image_ids = list(self.dialogs_reader.dialogs.keys())
        if overfit:
            self.image_ids = self.image_ids[:5]
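
For context, a hedged usage sketch of how a constructor like this is typically called. The class name VisDialDataset, the file paths, and the config values are assumptions modeled on the keys the constructor reads, not taken from the source.

# Hypothetical construction: all names and paths below are placeholders.
config = {
    "word_counts_json": "data/visdial_1.0_word_counts_train.json",
    "vocab_min_count": 5,
    "image_features_train_h5": "data/features_train.h5",
    "image_features_val_h5": "data/features_val.h5",
    "image_features_test_h5": "data/features_test.h5",
}
dataset = VisDialDataset(  # assumed name of the enclosing class
    config,
    dialogs_jsonpath="data/visdial_1.0_val.json",
    dense_annotations_jsonpath="data/visdial_1.0_val_dense_annotations.json",
    overfit=True,  # keep only the first 5 image ids for quick debugging
)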
Example #2
    def __init__(
        self,
        config: Dict[str, Any],
        dialogs_jsonpath: str,
        dense_annotations_jsonpath: str,
        fold_split: Optional[str] = 'train',
        fold: int = -1,
        return_adjusted_gt_relevance: bool = False,
        overfit: bool = False,
        in_memory: bool = False,
        return_options: bool = True,
        add_boundary_toks: bool = False,
        proj_to_senq_id: bool = False,
    ):
        super().__init__()
        self.config = config
        self.return_options = return_options
        self.return_adjusted_gt_relevance = return_adjusted_gt_relevance
        self.add_boundary_toks = add_boundary_toks
        self.proj_to_senq_id = proj_to_senq_id
        self.dialogs_reader = DialogsReader(dialogs_jsonpath, use_bert=True)
        self.annotations_reader = DenseAnnotationsReader(
            dense_annotations_jsonpath)
        self.fold_split = fold_split
        self.fold = fold
        self.n_folds = config['n_folds']

        assert config['word_embedding_type'] == 'bert'
        self.vocabulary = BertVocabulary(
            token_counts_path=config['bert_counts_json'],
            min_count=config['vocab_min_count'])

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if ("val" in self.dialogs_reader.split
                and "fake" not in self.dialogs_reader.split):
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath,
                                                 in_memory)

        # Keep a list of image_ids as primary keys to access data.
        all_image_ids = np.array(list(self.dialogs_reader.dialogs.keys()))
        if fold < 0 or fold_split == 'test':
            self.image_ids = all_image_ids.tolist()
        else:
            # NOTE: scikit-learn rejects a random_state when shuffle=False;
            # without shuffling the folds are contiguous, deterministic
            # slices anyway, so no seed is needed.
            kf = KFold(n_splits=self.n_folds, shuffle=False)
            train_index, val_index = list(kf.split(all_image_ids))[fold]
            if fold_split == 'train':
                self.image_ids = all_image_ids[train_index].tolist()
            elif fold_split == 'val':
                self.image_ids = all_image_ids[val_index].tolist()
            else:
                raise NotImplementedError()

        if overfit:
            self.image_ids = self.image_ids[:5]
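
The fold logic is easy to check in isolation. A minimal, runnable sketch with toy ids invented for the demo: with shuffle=False, KFold yields deterministic, contiguous folds, which is why no seed is required.

import numpy as np
from sklearn.model_selection import KFold

# Toy stand-ins for real image ids.
all_image_ids = np.arange(100, 120)
kf = KFold(n_splits=4, shuffle=False)
train_index, val_index = list(kf.split(all_image_ids))[0]  # fold 0
print(all_image_ids[train_index].tolist())  # the 15 training ids
print(all_image_ids[val_index].tolist())    # [100, 101, 102, 103, 104]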
Example #3
    def __init__(
        self,
        config: Dict[str, Any],
        dialogs_jsonpath: str,
        dense_annotations_jsonpath: Optional[str] = None,
        overfit: bool = False,
        in_memory: bool = False,
        return_options: bool = True,
        add_boundary_toks: bool = False,
        sample_flag: bool = False,
    ):
        super().__init__()
        self.config = config
        self.return_options = return_options
        self.add_boundary_toks = add_boundary_toks
        self.dialogs_reader = DialogsReader(dialogs_jsonpath, config)

        if "val" in self.split and dense_annotations_jsonpath is not None:
            self.annotations_reader = DenseAnnotationsReader(
                dense_annotations_jsonpath
            )
        else:
            self.annotations_reader = None

        self.vocabulary = Vocabulary(
            config["word_counts_json"], min_count=config["vocab_min_count"]
        )

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(
            image_features_hdfpath, in_memory
        )
        # Keep a list of image_ids as primary keys to access data. When
        # sample_flag is set, restrict to the image ids in the dense-sample
        # file, used for answer-score sampling during fine-tuning.
        if sample_flag:
            with open('data/visdial_1.0_train_dense_sample.json', 'r') as samplefile:
                sample = json.load(samplefile)
            self.image_ids = [entry['image_id'] for entry in sample]
        else:
            self.image_ids = list(self.dialogs_reader.dialogs.keys())

        if overfit:
            self.image_ids = self.image_ids[:5]
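
The sampling branch boils down to pulling image_id values out of a JSON list. A self-contained sketch, with an inline JSON string standing in for the real sample file and invented ids:

import json

# Inline stand-in for data/visdial_1.0_train_dense_sample.json.
sample_json = '[{"image_id": 185565}, {"image_id": 284024}]'
sample = json.loads(sample_json)
image_ids = [entry["image_id"] for entry in sample]
print(image_ids)  # [185565, 284024]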
Example #4
    def __init__(
        self,
        config: Dict[str, Any],
        dialogs_jsonpath: str,
        dense_annotations_jsonpath: Optional[str] = None,
        return_adjusted_gt_relevance: bool = False,
        overfit: bool = False,
        in_memory: bool = False,
        return_options: bool = True,
        add_boundary_toks: bool = False,
    ):
        super().__init__()
        self.config = config
        self.return_options = return_options
        self.return_adjusted_gt_relevance = return_adjusted_gt_relevance
        self.add_boundary_toks = add_boundary_toks
        self.dialogs_reader = DialogsReader(dialogs_jsonpath)

        if (("val" in self.split or "dense" in self.split)
                and dense_annotations_jsonpath is not None):
            self.annotations_reader = DenseAnnotationsReader(
                dense_annotations_jsonpath)
        else:
            self.annotations_reader = None

        if config['word_embedding_type'] == 'glove':
            self.vocabulary = GloveVocabulary(
                word_counts_path=config['word_counts_json'],
                min_count=config['vocab_min_count'],
                glove_weight_path=config['glove_weight_txt'],
                vec_size=config['glove_emb_dim'],
                glove_vec_num=config['glove_vec_num'])
        else:
            self.vocabulary = Vocabulary(
                word_counts_path=config["word_counts_json"],
                min_count=config["vocab_min_count"])

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if ("val" in self.dialogs_reader.split
                and "fake" not in self.dialogs_reader.split):
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath,
                                                 in_memory)

        # Keep a list of image_ids as primary keys to access data.
        self.image_ids = list(self.dialogs_reader.dialogs.keys())
        if overfit:
            self.image_ids = self.image_ids[:5]
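
One detail worth isolating is the "fake" guard in the split check: a split such as fake_val deliberately falls through to the train features. A runnable sketch of that mapping (the function name and config values are invented for the demo):

def select_features_path(split: str, config: dict) -> str:
    # Mirrors the split-to-features mapping used above.
    path = config["image_features_train_h5"]
    if "val" in split and "fake" not in split:
        path = config["image_features_val_h5"]
    elif "test" in split:
        path = config["image_features_test_h5"]
    return path

cfg = {
    "image_features_train_h5": "train.h5",
    "image_features_val_h5": "val.h5",
    "image_features_test_h5": "test.h5",
}
print(select_features_path("val", cfg))       # val.h5
print(select_features_path("fake_val", cfg))  # train.h5 (fake guard)
print(select_features_path("test", cfg))      # test.h5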
Example #5
    def __init__(
        self,
        config: Dict[str, Any],
        dialogs_jsonpath: str,
        dense_annotations_jsonpath: Optional[str] = None,
        overfit: bool = False,
        in_memory: bool = False,
        num_workers: int = 1,
        return_options: bool = True,
        add_boundary_toks: bool = False,
    ):
        super().__init__()
        self.config = config
        self.return_options = return_options
        self.add_boundary_toks = add_boundary_toks
        self.dialogs_reader = DialogsReader(
            dialogs_jsonpath,
            num_examples=(5 if overfit else None),
            num_workers=num_workers)

        if "val" in self.split and dense_annotations_jsonpath is not None:
            self.annotations_reader = DenseAnnotationsReader(
                dense_annotations_jsonpath)
        else:
            self.annotations_reader = None

        self.vocabulary = Vocabulary(config["word_counts_json"],
                                     min_count=config["vocab_min_count"])

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath,
                                                 in_memory)

        # Keep a list of image_ids as primary keys to access data.
        self.image_ids = list(self.dialogs_reader.dialogs.keys())
        if overfit:
            self.image_ids = self.image_ids[:5]
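
This variant additionally threads num_workers into DialogsReader and caps num_examples at 5 when overfitting. A hedged pairing with a PyTorch DataLoader; config is a dict like the one sketched under Example #1, and VisDialDataset is an assumed class name:

from torch.utils.data import DataLoader

# Hypothetical usage; names and paths are assumptions, not from the source.
dataset = VisDialDataset(
    config,
    dialogs_jsonpath="data/visdial_1.0_train.json",
    num_workers=4,  # parallelizes dialog preprocessing in DialogsReader
)
loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)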
Example #6
    def __init__(
            self,
            config: Dict[str, Any],
            dialogs_jsonpath: str,
            dense_annotations_jsonpath: Optional[str] = None,
            augment_dense_annotations_jsonpath: Optional[str] = None,
            use_pretrained_emb: bool = False,
            qa_emb_file_path: Optional[str] = None,  # SA: todo remove this
            hist_emb_file_path: Optional[str] = None,  # SA: todo remove this
            use_caption: bool = True,
            num_hist_turns: int = 10,
            finetune: bool = False,
            overfit: bool = False,
            in_memory: bool = False,
            num_workers: int = 1,
            return_options: bool = True,
            add_boundary_toks: bool = False):

        super().__init__()
        self.config = config

        # SA: embedding reader
        self.use_pretrained_emb = use_pretrained_emb

        self.return_options = return_options
        self.add_boundary_toks = add_boundary_toks
        self.dialogs_reader = DialogsReader(
            dialogs_jsonpath,
            num_examples=(5 if overfit else None),
            num_workers=num_workers,
            use_pretrained_emb=self.use_pretrained_emb)

        self.finetune = finetune
        self.use_caption = use_caption

        # SA: embedding reader
        if self.use_pretrained_emb:
            assert qa_emb_file_path, "Did you forget to set emb file path?"
            # @todo: for now coming through argparse
            self.qa_emb_file_path = qa_emb_file_path
            self.hist_emb_file_path = hist_emb_file_path
            # hist_emb_file_path = config["hist_emb_file_path"]
            # TransformerEmbeddingsHdfReader(embedding_path, in_memory)
            # self.embedding_reader = TransformerEmbeddingsHdfReader(hist_emb_file_path,
            #                                                        in_memory)
            self.question_reader = QuesEmbeddingsHdfReader(
                qa_emb_file_path, in_memory)
            self.ans_reader = AnswerEmbeddingsHdfReader(
                qa_emb_file_path, in_memory)
            self.caption_reader = CaptionEmbeddingsHdfReader(
                qa_emb_file_path, in_memory)

            # SA: we don't pass in_memory here because the history file is too big
            # SA: todo this key would change
            self.hist_reader = HistEmbeddingsHdfReader(hist_emb_file_path,
                                                       hdfs_key="hist")

        # SA: when fine-tuning, dense annotations are needed for train/val;
        # otherwise they are loaded only for the validation split.
        if self.finetune or ("val" in self.split
                             and dense_annotations_jsonpath is not None):
            self.annotations_reader = DenseAnnotationsReader(
                dense_annotations_jsonpath)
        else:
            self.annotations_reader = None

        if augment_dense_annotations_jsonpath is not None:
            self.augmented_annotations_reader = AugmentedDenseAnnotationsReader(
                augment_dense_annotations_jsonpath)
            self.use_augment_dense = True
        else:
            self.use_augment_dense = False

        self.vocabulary = Vocabulary(config["word_counts_json"],
                                     min_count=config["vocab_min_count"])

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath,
                                                 in_memory)

        # Keep a list of image_ids as primary keys to access data.
        # For finetune we use only those image id where we have dense annotations
        if self.finetune:
            self.image_ids = list(self.annotations_reader.keys)
        else:
            self.image_ids = list(self.dialogs_reader.dialogs.keys())

        if overfit:
            self.image_ids = self.image_ids[:5]
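
The various *EmbeddingsHdfReader helpers above reduce to indexing datasets inside an HDF5 file. A self-contained h5py sketch; the file name, key, and shapes are invented for the demo:

import h5py
import numpy as np

# Write a tiny demo file, then read it back the way an embeddings
# reader would: open the file and index the "hist" dataset by row.
with h5py.File("demo_hist_emb.h5", "w") as f:
    f.create_dataset("hist", data=np.random.rand(3, 768).astype("float32"))

with h5py.File("demo_hist_emb.h5", "r") as f:
    hist_emb = f["hist"][0]  # embedding for the first dialog history
print(hist_emb.shape)  # (768,)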