def collate_data(self, data):
    """Collate a list of per-sample feature dicts into batched tensors.

    Args:
        data (list[dict]): one dict per sample, holding per-expert feature
            arrays, ``<expert>_ind`` availability flags, ``<expert>_sz``
            lengths for the variable-size experts, plus ``text``/``text_sz``.

    Returns:
        dict: ``{"text": ..., "experts": OrderedDict, "ind": dict}``.
    """
    # Availability indicators, one vector per expert, as tensors.
    ind = {}
    for expert in self.ordered_experts:
        flags = np.array([sample[f"{expert}_ind"] for sample in data])
        ind[expert] = ensure_tensor(flags)

    experts = OrderedDict()
    for expert in self.ordered_experts:
        feats = [sample[expert] for sample in data]
        if expert in {"audio", "ocr"}:
            # Variable-length experts: trim every sample to the largest
            # size present in this batch (the shortest length that fits
            # all of them) before stacking.
            pad_to = max(sample[f"{expert}_sz"] for sample in data)
            stacked = np.stack([feat[:pad_to] for feat in feats], axis=0)
        else:
            stacked = np.vstack(feats)
        experts[expert] = th.from_numpy(stacked).float()

    # The text features are likewise trimmed to the largest caption size
    # seen in the batch.
    pad_to = max(sample["text_sz"] for sample in data)
    text = th.from_numpy(
        np.array([sample["text"][:pad_to] for sample in data])).float()
    text = self.unsqueeze_text(text)

    return {"text": text, "experts": experts, "ind": ind}
 def get_retrieval_data(self):
     """Return the retrieval-split tensors together with evaluation metadata."""
     # Cast every expert's retrieval features to float tensors, preserving
     # the canonical expert ordering.
     experts = OrderedDict()
     for expert in self.ordered_experts:
         experts[expert] = th.from_numpy(self.retrieval[expert]).float()

     retrieval_data = {
         "text": ensure_tensor(self.text_retrieval).float(),
         "experts": experts,
         "ind": self.test_ind,
     }
     # Metadata used for evaluation/visualisation rather than the model input.
     meta = {
         "query_masks": self.query_masks,
         "raw_captions": self.raw_captions_retrieval,
         "paths": self.video_path_retrieval,
     }
     return retrieval_data, meta
# --- Example #3 (scraped sample separator; original marker: "示例#3") ---
    def get_retrieval_data(self):
        """Assemble the retrieval-split tensors and their evaluation metadata.

        Returns:
            tuple: ``(retrieval_data, meta)`` where ``retrieval_data`` holds
            the text features, per-expert video features and availability
            indicators, and ``meta`` carries the query masks, raw captions
            and video paths used for evaluation/visualisation.
        """
        # Cast each expert's retrieval features to float tensors, preserving
        # the canonical expert ordering.
        experts = OrderedDict(
            (expert, th.from_numpy(self.retrieval[expert]).float())
            for expert in self.ordered_experts
        )

        retrieval_data = {
            "text": ensure_tensor(self.text_retrieval).float(),
            "experts": experts,
            "ind": self.test_ind,
        }
        meta = {
            "query_masks": self.query_masks,
            "raw_captions": self.raw_captions_retrieval,
            "paths": self.video_path_retrieval,
        }
        # NOTE: a dead `if False:` debugging block (pickle comparison against a
        # hard-coded /tmp path plus an `ipdb.set_trace()`) was removed here.
        return retrieval_data, meta
    def collate_data(self, data):
        """Collate per-sample feature dicts into a task-dependent minibatch.

        Builds zero-initialised numpy buffers for fixed- and variable-size
        experts, fills them sample by sample, then converts everything to
        torch tensors.  The returned dict always holds ``experts`` and
        ``ind``; text tensors are added for retrieval tasks, and labels /
        video names for classification tasks.
        """
        batch_size = len(data)
        tensors = {}
        # Fixed-size experts: experts listed in trn_config get an extra
        # temporal axis of length trn_config[expert]; the rest are flat
        # (batch, dim) buffers.
        for expert in self.tensor_storage["fixed"]:
            if expert in self.trn_config.keys():
                tensors[expert] = np.zeros(
                    (batch_size, self.trn_config[expert],
                     self.raw_input_dims[expert]))
            else:
                tensors[expert] = np.zeros(
                    (batch_size, self.raw_input_dims[expert]))

        # Track which indices of each modality are available in the present batch
        ind = {expert: np.zeros(batch_size) for expert in self.experts}
        # Variable-size experts are padded up to max_tokens[expert].
        tensors.update({
            expert: np.zeros((batch_size, self.max_tokens[expert],
                              self.raw_input_dims[expert]))
            for expert in self.tensor_storage["variable"]
        })

        # Task-dependent output buffers.  NOTE(review): membership tests
        # ("classification" in self.task) are used here, but equality tests
        # (self.task == "classification") are used below when filling/returning
        # — confirm these never disagree for the task strings actually used.
        if "retrieval" in self.task:
            text_tensor = np.zeros((batch_size, self.captions_per_video,
                                    self.max_tokens["text"], self.text_dim))
            text_token_mask = np.zeros((batch_size, self.captions_per_video))
        elif "classification" in self.task and self.class_type == "single_label":
            label_tensor = np.zeros(batch_size)
            vid_name = []
        elif "classification" in self.task and self.class_type == "multi_label":
            label_tensor = np.zeros((batch_size, self.num_classes))
            vid_name = []

        for ii, _ in enumerate(data):
            datum = data[ii]
            # Per-sample availability flags for every expert.
            for expert in self.experts:
                ind[expert][ii] = datum[f"{expert}_ind"]
            for expert in self.tensor_storage["fixed"]:
                tensors[expert][ii] = datum[expert]
            for expert in self.tensor_storage["variable"]:
                if ind[expert][ii]:
                    # Truncate to the token budget; leave zero padding beyond.
                    keep = min(len(datum[expert]), self.max_tokens[expert])
                    if keep:
                        tensors[expert][ii, :keep, :] = datum[expert][:keep]
                else:
                    # Missing modality: fill with the sentinel missing value.
                    tensors[expert][ii, :, :] = self.MISSING_VAL

            if "retrieval" in self.task:
                text = datum["text"]
                for jj in range(self.captions_per_video):
                    # Mask stores the number of valid (unpadded) tokens.
                    keep = min(len(text[jj]), self.max_tokens["text"])
                    text_tensor[ii, jj, :keep, :] = text[jj][:keep]
                    text_token_mask[ii, jj] = keep
            elif self.task == "classification":
                if self.cls_partition != 'test':
                    label_tensor[ii] = datum["labels"]
                vid_name.append(datum["vid"])

        ind = {key: ensure_tensor(val) for key, val in ind.items()}
        experts = OrderedDict((expert, th.from_numpy(tensors[expert]).float())
                              for expert in self.ordered_experts)

        # Binarised experts: any sample whose first element is non-NaN is
        # replaced with all-ones.  NOTE(review): the [:, 0, 0] indexing assumes
        # a 3-d tensor, which holds for variable-size and trn_config experts
        # but not for flat (batch, dim) fixed experts — confirm binarise is
        # only ever configured for 3-d experts.
        for expert in self.experts:
            if self.feat_aggregation[expert].get("binarise", False):
                replace = np.logical_not(th.isnan(experts[expert][:, 0, 0]))
                experts[expert][replace] = th.ones_like(
                    experts[expert][replace])

        minibatch = {"experts": experts, "ind": ind}
        if "retrieval" in self.task:
            minibatch["text"] = th.from_numpy(text_tensor).float()
            minibatch["text_token_mask"] = th.from_numpy(text_token_mask)
        elif self.task == "classification":
            if self.cls_partition != 'test':
                minibatch["labels"] = th.from_numpy(label_tensor).float()
            if self.cls_partition != "train":
                # we only pass the video names for visualisation and making predictions
                # on the val/test set
                minibatch["vid_name"] = vid_name
        return minibatch
# --- Example #5 (scraped sample separator; original marker: "示例#5") ---
    def collate_data(self, data):
        """Collate per-sample feature dicts into a retrieval minibatch.

        Args:
            data (list[dict]): one dict per sample, holding per-expert
                feature arrays, ``<expert>_ind`` availability flags and a
                ``text`` entry with ``captions_per_video`` caption arrays.

        Returns:
            dict: ``{"text": ..., "experts": OrderedDict, "ind": dict}``.
        """
        batch_size = len(data)

        # Track which indices of each modality are available in the present batch
        ind = {expert: np.zeros(batch_size) for expert in self.experts}

        # rgb carries an extra "shots" axis, so it is allocated separately
        # from the other fixed-size experts.
        tensors = {expert: np.zeros((batch_size, self.raw_input_dims[expert]))
                   for expert in self.fixed_sz_experts if expert != "rgb"}
        tensors["rgb"] = np.zeros((batch_size, self.rgb_shots,
                                   self.raw_input_dims["rgb"]))
        # Variable-size experts are zero-padded up to max_expert_tokens.
        tensors.update({expert: np.zeros(
            (batch_size, self.max_expert_tokens, self.raw_input_dims[expert])
        ) for expert in self.variable_sz_experts})

        text_tensor = np.zeros((batch_size, self.captions_per_video,
                                self.max_text_words, self.text_dim))

        for ii, _ in enumerate(data):
            datum = data[ii]
            for expert in self.experts:
                ind[expert][ii] = datum[f"{expert}_ind"]

            # It is preferable to explicitly pass NaNs into the network as
            # missing values, over simply zeros, to avoid silent failures
            for expert in self.fixed_sz_experts:
                tensors[expert][ii] = datum[expert]
            for expert in self.variable_sz_experts:
                # Truncate to the token budget; leave zero padding beyond.
                keep = min(len(datum[expert]), self.max_expert_tokens)
                if keep:
                    tensors[expert][ii, :keep, :] = datum[expert][:keep]
            text = datum["text"]
            for jj in range(self.captions_per_video):
                keep = min(len(text[jj]), self.max_text_words)
                text_tensor[ii, jj, :keep, :] = text[jj][:keep]

        ind = {key: ensure_tensor(val) for key, val in ind.items()}
        text = th.from_numpy(text_tensor).float()
        experts = OrderedDict(
            (expert, th.from_numpy(tensors[expert]).float())
            for expert in self.ordered_experts)

        # NOTE: a dead `if False:` sanity-check block (pickle comparison
        # against a hard-coded /tmp path plus an `ipdb.set_trace()`) was
        # removed here.
        return {"text": text, "experts": experts, "ind": ind}