def __getitem__(self, idx):
    # get query id and corresponding video id
    qid = str(self.qids[idx])
    vid = self.anns[qid]["video_id"]
    timestamp = self.anns[qid]["timestamps"]
    duration = self.anns[qid]["duration"]

    # get query labels
    if self.in_memory:
        q_label = self.query_labels[qid]
    else:
        query_labels = h5py.File(self.paths["query_labels"], "r")
        q_label = query_labels[qid][:]
    q_leng = self.query_lengths[qid]

    # get grounding label
    if self.in_memory:
        start_pos = self.s_pos[qid]
        end_pos = self.e_pos[qid]
    else:
        grd_info = io_utils.load_hdf5(self.paths["grounding_info"], False)
        start_pos = grd_info["start_pos/" + qid][()]
        end_pos = grd_info["end_pos/" + qid][()]

    # get video features
    if self.in_memory:
        vid_feat_all = self.feats[vid]
    else:
        vid_feat_all = io_utils.load_hdf5(
            self.feat_hdf5, verbose=False)[vid]["c3d_features"]
    vid_feat, nfeats, start_index, end_index = self.get_fixed_length_feat(
        vid_feat_all, self.S, start_pos, end_pos)

    # get video masks
    vid_mask = np.zeros((self.S, 1))
    vid_mask[:nfeats] = 1

    # get attention mask
    if self.in_memory:
        att_mask = self.att_mask[qid]
    else:
        att_mask = grd_info["att_mask/" + qid][:]

    instance = {
        "vids": vid,
        "qids": qid,
        "timestamps": timestamp,  # GT location [s, e] (seconds)
        "duration": duration,     # video span (seconds)
        "query_lengths": q_leng,
        "query_labels": torch.LongTensor(q_label).unsqueeze(0),        # [1,L_q_max]
        "query_masks": (torch.FloatTensor(q_label) > 0).unsqueeze(0),  # [1,L_q_max]
        "grounding_start_pos": torch.FloatTensor([start_pos]),  # [1]; normalized
        "grounding_end_pos": torch.FloatTensor([end_pos]),      # [1]; normalized
        "grounding_att_masks": torch.FloatTensor(att_mask),     # [L_v]
        "nfeats": torch.FloatTensor([nfeats]),
        "video_feats": torch.FloatTensor(vid_feat),  # [L_v,D_v]
        "video_masks": torch.ByteTensor(vid_mask),   # [L_v,1]
    }

    return instance
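
# NOTE (hedged sketch): get_fixed_length_feat is not shown here; from its
# call sites it is assumed to uniformly subsample sequences longer than S
# segments, zero-pad shorter ones, and map the normalized start/end
# positions to segment indices. A minimal standalone version consistent
# with that signature -- the repo's actual implementation may differ:
import numpy as np

def get_fixed_length_feat_sketch(feats, S, start_pos, end_pos):
    nfeats_all = feats.shape[0]
    if nfeats_all > S:
        # uniform temporal subsampling down to S segments
        idxs = np.linspace(0, nfeats_all - 1, num=S).astype(int)
        feats, nfeats = feats[idxs], S
    else:
        # zero-pad up to S segments; only the first nfeats rows are valid
        pad = np.zeros((S - nfeats_all, feats.shape[1]), dtype=feats.dtype)
        feats, nfeats = np.concatenate([feats, pad], axis=0), nfeats_all
    start_index = min(int(start_pos * nfeats), nfeats - 1)
    end_index = min(int(end_pos * nfeats), nfeats - 1)
    return feats, nfeats, start_index, end_index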
def __init__(self, config): super(self.__class__, self).__init__(config) # get options self.S = config.get("num_segment", 128) self.split = config.get("split", "train") self.data_dir = config.get("data_dir", "") self.feature_type = config.get("feature_type", "C3D") self.in_memory = config.get("in_memory", False) self.feat_hdf5 = config.get( "video_feature_path", "data/ActivityNet/feats/sub_activitynet_v1-3.c3d.hdf5") # cropping augmentation settings self.cropping_augmentation = config.get("cropping_augmentation", False) self.cropping_prob = config.get("cropping_prob", 0.5) self.cropping_factor = config.get("cropping_factor", 0.5) self.no_aug = False # get paths for proposals and captions paths = self._get_data_path(config) # create labels (or load existing one) ann_path = config.get( "annotation_path", "data/ActivityNet/captions/annotations/train.json") self.anns, self.qids, self.vids = self._load_annotation(ann_path) if not self._exist_data(paths): self.generate_labels(config) # load features if use in_memory if self.in_memory: self.feats = {} h = io_utils.load_hdf5(self.feat_hdf5, verbose=False) for k in tqdm(self.vids, desc="In-Memory: vid_feat"): self.feats[k] = h[k]["c3d_features"][:] self.s_pos, self.e_pos, self.att_mask = {}, {}, {} grd_info = io_utils.load_hdf5(self.paths["grounding_info"], False) for k in tqdm(self.qids, desc="In-Memory: grounding"): self.s_pos[k] = grd_info["start_pos/" + k][()] self.e_pos[k] = grd_info["end_pos/" + k][()] self.att_mask[k] = grd_info["att_mask/" + k][()] self.query_labels = {} query_labels = h5py.File(self.paths["query_labels"], "r") for k in tqdm(self.qids, desc="In-Memory: query"): self.query_labels[k] = query_labels[k][:] # load and prepare json files query_info = io_utils.load_json(self.paths["query_info"]) self.wtoi = query_info["wtoi"] self.itow = query_info["itow"] self.query_lengths = query_info["query_lengths"] self.batch_size = config.get("batch_size", 64) self.num_instances = len(self.qids)
def __init__(self, config): super(self.__class__, self).__init__(config) # get options self.S = config.get("num_segment", 128) self.split = config.get("split", "train") self.data_dir = config.get("data_dir", "data/charades") self.feature_type = config.get("feature_type", "I3D") self.in_memory = config.get("in_memory", False) self.feat_hdf5 = config.get( "video_feature_path", "data/charades/features/i3d_finetuned/i3d_finetuned.h5") self.feat_path = config.get("video_feature_path", "data/charades/features/i3d_finetuned.h5") # get paths for proposals and captions paths = self._get_data_path(config) # create labels (or load existing one) ann_path = "data/charades/annotations/charades_sta_{}.txt".format( self.split) aux_ann_path = "data/charades/annotations/Charades_v1_{}.csv".format( self.split) self.anns, self.qids, self.vids = self._load_annotation( ann_path, aux_ann_path) if not self._exist_data(paths): self.generate_labels(config) # load features if use in_memory if self.in_memory: self.feats = {} h = io_utils.load_hdf5(self.feat_hdf5, verbose=False) for vid in tqdm(self.vids, desc="In-Memory: vid_feat"): self.feats[vid] = h[vid][()] h.close() self.s_pos, self.e_pos, self.att_mask = {}, {}, {} grd_info = io_utils.load_hdf5(self.paths["grounding_info"], False) for k in tqdm(self.qids, desc="In-Memory: grounding"): self.s_pos[k] = grd_info["start_pos/" + k][()] self.e_pos[k] = grd_info["end_pos/" + k][()] self.att_mask[k] = grd_info["att_mask/" + k][()] self.query_labels = {} query_labels = h5py.File(self.paths["query_labels"], "r") for k in tqdm(self.qids, desc="In-Memory: query"): self.query_labels[k] = query_labels[k][:] # load query information query_info = io_utils.load_json(self.paths["query_info"]) self.wtoi = query_info["wtoi"] self.itow = query_info["itow"] self.query_lengths = query_info["query_lengths"] self.batch_size = config.get("batch_size", 64) self.num_instances = len(self.qids)
def __getitem__(self, idx): """ Retrun a data (images, question_label, question length and answers) Returns: img (or feat): image (or feature) qst_label: question label qst_length: question length answer: answer for questions """ # obtain image (as raw or feature) img_filename = self.json_file["image_filenames"][idx] if self.use_img: img_path = os.path.join(self.img_dir, img_filename) img = Image.open(img_path).convert("RGB") img = self.prepro(img) else: feat_path = os.path.join(self.feat_dir, img_filename.replace(".png", ".npy")) img = np.load(feat_path) img = torch.Tensor(img) # obtain question label and its length hdf5_file = io_utils.load_hdf5(self.hdf5_path, verbose=False) qst_label = torch.from_numpy(hdf5_file["question_labels"][idx]) qst_length = hdf5_file["question_length"][idx] # obtain answer label answer = hdf5_file["answer_labels"][idx] answer = torch.from_numpy(np.asarray([answer])).long() hdf5_file.close() # obtain img info (question id) qst_id = self.json_file["question_ids"][idx] # prepare batch output out = [img, qst_label, qst_length] if self.assignment_path != "": # NOTE: DEPRECATED # obtain assignment label assignment_file = io_utils.load_hdf5(self.assignment_path, verbose=False) assignments = torch.from_numpy(assignment_file["assignments"][idx]) out.append(assignments) if self.base_logits_path != "": # obtain assignment label base_logits = io_utils.load_hdf5(self.base_logits_path, verbose=False) base_logits = torch.from_numpy(base_logits["base_logits"][idx]) out.append(base_logits) out.append(answer) if self.vis_mode: out.append(img_filename) else: out.append(qst_id) return out
def __init__(self, config):
    # get config options
    print(json.dumps(config, indent=4))
    self.hdf5_path = utils.get_value_from_dict(
        config, "encoded_hdf5_path",
        "data/CLEVR_v1.0/preprocess/encoded_qa/vocab_train_raw/"
        + "all_questions_use_zero_token/qa_train.h5")
    self.json_path = utils.get_value_from_dict(
        config, "encoded_json_path",
        "data/CLEVR_v1.0/preprocess/encoded_qa/vocab_train_raw/"
        + "all_questions_use_zero_token/qa_train.json")
    self.img_size = utils.get_value_from_dict(config, "img_size", 224)
    self.batch_size = utils.get_value_from_dict(config, "batch_size", 32)
    self.use_img = utils.get_value_from_dict(config, "use_img", False)
    self.use_gpu = utils.get_value_from_dict(config, "use_gpu", True)
    if self.use_img:
        self.img_dir = utils.get_value_from_dict(
            config, "img_dir", "data/CLEVR_v1.0/images")
        self.prepro = trn.Compose([
            trn.Resize(self.img_size),
            trn.CenterCrop(self.img_size),
            trn.ToTensor(),
            trn.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    else:
        self.feat_dir = utils.get_value_from_dict(
            config, "feat_dir", "data/CLEVR_v1.0/feats")

    # load hdf5 file containing question_labels, question_length,
    # and answer_labels
    hdf5_file = io_utils.load_hdf5(self.hdf5_path)
    self.max_time_steps = hdf5_file["question_labels"].shape[1]
    hdf5_file.close()

    # load json file containing wtoi, itow, atoi, itoa, splits, vocab_info,
    # question_ids, and image_filenames
    self.json_file = io_utils.load_json(self.json_path)

    # set path of pre-computed assignments
    # NOTE: DEPRECATED
    self.assignment_path = utils.get_value_from_dict(
        config, "assignment_path", "")

    # set path of pre-computed logits of base models
    self.base_logits_path = utils.get_value_from_dict(
        config, "base_logits_path", "")

    self.fetching_answer_option = "simple"
    self.vis_mode = config.get("vis_mode", False)
def ensemble(config):
    """ Build data loader """
    dset = dataset.DataSet(config["test_loader"])
    L = data.DataLoader(
        dset, batch_size=config["test_loader"]["batch_size"],
        num_workers=config["num_workers"],
        shuffle=False, collate_fn=dataset.collate_fn)

    """ Load assignments if they exist """
    with_assignment = False
    if config["assignment_path"] != "None":
        with_assignment = True
        assignment_file = io_utils.load_hdf5(
            config["assignment_path"], verbose=False)
        assignments = assignment_file["assignments"][:]
        cnt_mapping = np.zeros((3, 3))

    """ Build networks """
    nets = []
    net_configs = []
    for i in range(len(config["checkpoint_paths"])):
        net_configs.append(io_utils.load_yaml(config["config_paths"][i]))
        net_configs[i] = M.override_config_from_loader(net_configs[i], dset)
        nets.append(M(net_configs[i]))
        nets[i].bring_loader_info(dset)
        apply_cc_after = utils.get_value_from_dict(
            net_configs[i]["model"], "apply_curriculum_learning_after", -1)
        # load checkpoint if it exists
        nets[i].load_checkpoint(config["checkpoint_paths"][i])
        start_epoch = int(utils.get_filename_from_path(
            config["checkpoint_paths"][i]).split("_")[-1])
        # if the checkpoint was trained with curriculum learning, apply it
        if (apply_cc_after > 0) and (start_epoch >= apply_cc_after):
            nets[i].apply_curriculum_learning()

    # ship networks to GPU
    if config["use_gpu"]:
        for i in range(len(nets)):
            nets[i].gpu_mode()
    for i in range(len(nets)):
        nets[i].eval_mode()

    # initialize counters for each temperature (tau)
    metrics = ["top1-avg", "top1-max", "oracle"]
    for i in range(len(nets)):
        metrics.append("M{}".format(i))
    tau = [1.0, 1.2, 1.5, 2.0, 5.0, 10.0, 50.0, 100.0]
    counters = OrderedDict()
    for T in tau:
        tau_name = "tau-" + str(T)
        counters[tau_name] = OrderedDict()
        for mt in metrics:
            counters[tau_name][mt] = accumulator.Accumulator(mt)

    """ Evaluate the ensemble """
    itoa = dset.get_itoa()
    predictions = []
    for batch in tqdm(L):
        B = batch[0][0].size(0)
        if isinstance(batch[0][-1], list):
            gt = batch[0][-1][0]
        else:
            gt = batch[0][-1]

        # forward each network once; reuse the logits for every tau
        prob_list = []
        for i in range(len(nets)):
            outputs = nets[i].evaluate(batch)
            prob_list.append(outputs[1])  # m*[B,A]
        if config["save_logits"]:
            # TODO: saving logits is not implemented yet
            pass

        for T in tau:
            tau_name = "tau-" + str(T)
            probs = [net_utils.get_data(F.softmax(logits / T, dim=1))
                     for logits in prob_list]  # m*[B,A]

            # count correct answers for each model
            for i in range(len(nets)):
                val, idx = probs[i].max(dim=1)
                correct = torch.eq(idx, gt)
                num_correct = torch.sum(correct)
                counters[tau_name]["M{}".format(i)].add(num_correct, B)
                # accumulate per-example correctness over models
                if i == 0:
                    oracle_correct = correct
                else:
                    oracle_correct = oracle_correct + correct

            # top1-max accuracy for the ensemble
            ens_probs, ens_idx = torch.stack(probs, 0).max(0)  # [B,A]
            max_val, max_idx = ens_probs.max(dim=1)
            num_correct = torch.sum(torch.eq(max_idx, gt))
            counters[tau_name]["top1-max"].add(num_correct, B)

            # top1-avg accuracy for the ensemble
            ens_probs = sum(probs)  # [B,A]
            max_val, max_idx = ens_probs.max(dim=1)
            num_correct = torch.sum(torch.eq(max_idx, gt))
            counters[tau_name]["top1-avg"].add(num_correct, B)

            # oracle accuracy (at least one model is correct)
            num_oracle_correct = torch.sum(torch.ge(oracle_correct, 1))
            counters[tau_name]["oracle"].add(num_oracle_correct, B)

        # attach predictions (top1-avg indices from the last tau)
        for i in range(len(batch[1])):
            qid = batch[1][i]
            predictions.append({
                "question_id": qid,
                "answer": utils.label2string(itoa, max_idx[i])
            })
    # epoch done

    # print accuracy
    for cnt_k, cnt_v in counters.items():
        txt = cnt_k + " "
        for k, v in cnt_v.items():
            txt += ", {} = {:.5f}".format(v.get_name(), v.get_average())
        print(txt)

    save_dir = os.path.join("results", "ensemble_predictions")
    io_utils.check_and_create_dir(save_dir)
    io_utils.write_json(
        os.path.join(save_dir, config["out"] + ".json"), predictions)
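
# NOTE (hedged example): the tau sweep above divides logits by a
# temperature before the softmax; tau > 1 flattens each model's
# distribution (so averaging weighs runner-up answers more), while
# tau < 1 sharpens it. A self-contained toy illustration:
import torch
import torch.nn.functional as F

logits_a = torch.tensor([[2.0, 1.0, 0.1]])  # model A: one sample, 3 answers
logits_b = torch.tensor([[0.5, 2.5, 0.0]])  # model B
for T in [1.0, 10.0]:
    avg = (F.softmax(logits_a / T, dim=1) + F.softmax(logits_b / T, dim=1)) / 2
    print("tau={}: pred={}, confidence={:.3f}".format(
        T, avg.argmax(dim=1).item(), avg.max().item()))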
def __getitem__(self, idx):
    # get query id and corresponding video id
    qid = str(self.qids[idx])
    vid = self.anns[qid]["video_id"]
    timestamp = self.anns[qid]["timestamps"]
    duration = self.anns[qid]["duration"]

    # get query labels
    if self.in_memory:
        q_label = self.query_labels[qid]
    else:
        query_labels = h5py.File(self.paths["query_labels"], "r")
        q_label = query_labels[qid][:]
    q_leng = self.query_lengths[qid]

    # get grounding label
    if self.in_memory:
        start_pos = self.s_pos[qid]
        end_pos = self.e_pos[qid]
    else:
        grd_info = io_utils.load_hdf5(self.paths["grounding_info"], False)
        start_pos = grd_info["start_pos/" + qid][()]
        end_pos = grd_info["end_pos/" + qid][()]

    # get video features
    if self.in_memory:
        vid_feat_all = self.feats[vid]
    else:
        vid_feat_all = np.load(self.feat_path.format(vid)).squeeze()

    # fix defective annotations
    if timestamp[1] > duration:
        duration = timestamp[1]
    if timestamp[0] > timestamp[1]:
        timestamp = [timestamp[1], timestamp[0]]

    # cropping augmentation, part 1: randomly trim the video outside the
    # ground-truth span, then renormalize duration/timestamp/positions
    cropping = random()
    do_crop = self.cropping_augmentation and self.split == "train" and (
        not self.no_aug) and (cropping < self.cropping_prob)
    if do_crop:
        # sample how much to cut before the start and after the end
        cut_start = random() * timestamp[0] * self.cropping_factor
        cut_end = random() * (duration - timestamp[1]) * self.cropping_factor
        # trim the feature sequence accordingly
        nfeats_all = vid_feat_all.shape[0]
        keep = utils.timestamp_to_featstamp(
            [cut_start, duration - cut_end], nfeats_all, duration)
        vid_feat_all = vid_feat_all[keep[0]:keep[1] + 1]
        # update duration, timestamp, and grounding labels
        duration = duration - cut_start - cut_end
        timestamp = [timestamp[0] - cut_start, timestamp[1] - cut_start]
        start_pos = timestamp[0] / duration
        end_pos = timestamp[1] / duration

    # adjust video features to a fixed length
    vid_feat, nfeats, start_index, end_index = self.get_fixed_length_feat(
        vid_feat_all, self.S, start_pos, end_pos)

    # cropping augmentation, part 2
    if do_crop:
        # when cropping, rebuild the attention mask from the new timestamp
        fs = utils.timestamp_to_featstamp(timestamp, nfeats, duration)
        att_mask = np.zeros((self.S))
        att_mask[fs[0]:fs[1] + 1] = 1
    else:
        # otherwise, use the precomputed attention mask
        if self.in_memory:
            att_mask = self.att_mask[qid]
        else:
            att_mask = grd_info["att_mask/" + qid][:]

    # get video masks
    vid_mask = np.zeros((self.S, 1))
    vid_mask[:nfeats] = 1

    instance = {
        "vids": vid,
        "qids": qid,
        "timestamps": timestamp,  # GT location [s, e] (seconds)
        "duration": duration,     # video span (seconds)
        "query_lengths": q_leng,
        "query_labels": torch.LongTensor(q_label).unsqueeze(0),        # [1,L_q_max]
        "query_masks": (torch.FloatTensor(q_label) > 0).unsqueeze(0),  # [1,L_q_max]
        "grounding_start_pos": torch.FloatTensor([start_pos]),  # [1]; normalized
        "grounding_end_pos": torch.FloatTensor([end_pos]),      # [1]; normalized
        "grounding_att_masks": torch.FloatTensor(att_mask),     # [L_v]
        "nfeats": torch.FloatTensor([nfeats]),
        "video_feats": torch.FloatTensor(vid_feat),  # [L_v,D_v]
        "video_masks": torch.ByteTensor(vid_mask),   # [L_v,1]
    }

    return instance
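
# NOTE (hedged sketch): utils.timestamp_to_featstamp is assumed to map a
# [start, end] span in seconds to inclusive feature indices by linear
# scaling, matching how its result is used above (feat[fs[0]:fs[1]+1]).
# A minimal version under that assumption:
def timestamp_to_featstamp_sketch(timestamp, nfeats, duration):
    start, end = timestamp
    start_idx = max(min(int(round(start / duration * nfeats)), nfeats - 1), 0)
    end_idx = max(min(int(round(end / duration * nfeats)), nfeats - 1), start_idx)
    return [start_idx, end_idx]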
def __init__(self, config): super(self.__class__, self).__init__(config) # get options self.S = config.get("num_segment", 128) self.split = config.get("split", "train") self.data_dir = config.get("data_dir", "data/charades") self.feature_type = config.get("feature_type", "I3D") self.in_memory = config.get("in_memory", False) if self.feature_type == "I3D": self.feat_path = config.get( "video_feature_path", "data/charades/features/i3d_finetuned/{}.npy") else: raise ValueError("Wrong feature_type") self.num_captions_per_segment = 5 # number of captions per segment. for example, if there are 4 back-translated per an original sentence, it should be 5. # cropping augmentation settings self.cropping_augmentation = config.get("cropping_augmentation", False) self.cropping_prob = config.get("cropping_prob", 0.5) self.cropping_factor = config.get("cropping_factor", 0.5) self.no_aug = False # CR settings self.cr_compare_all = config.get("cr_compare_all", True) # get paths for proposals and captions paths = self._get_data_path(config) # create labels (or load existing one) ann_path = "data/charades/annotations/charades_sta_{}.txt".format( self.split) aux_ann_path = "data/charades/annotations/Charades_v1_{}.csv".format( self.split) self.anns, self.qids, self.vids = self._load_annotation( ann_path, aux_ann_path) if not self._exist_data(paths): self.generate_labels(config) # load features if use in_memory if self.in_memory: self.feats = {} for vid in tqdm(self.vids, desc="In-Memory: vid_feat"): self.feats[vid] = np.load(self.feat_path.format(vid)).squeeze() self.s_pos, self.e_pos, self.att_mask = {}, {}, {} grd_info = io_utils.load_hdf5(self.paths["grounding_info"], False) for k in tqdm(self.qids, desc="In-Memory: grounding"): self.s_pos[k] = grd_info["start_pos/" + k][()] self.e_pos[k] = grd_info["end_pos/" + k][()] self.att_mask[k] = grd_info["att_mask/" + k][()] self.query_labels = {} query_labels = h5py.File(self.paths["query_labels"], "r") for k in tqdm(self.qids, desc="In-Memory: query"): self.query_labels[k] = query_labels[k][:] # load query information query_info = io_utils.load_json(self.paths["query_info"]) self.wtoi = query_info["wtoi"] self.itow = query_info["itow"] self.query_lengths = query_info["query_lengths"] self.batch_size = config.get("batch_size", 64) if self.split == 'train': self.num_instances = len( self.qids) // self.num_captions_per_segment else: self.num_instances = len(self.qids)
def generate_labels(self, config):
    """ Generate and save labels for temporal language grounding:
        1) query_info (.json), with
            - wtoi: word-to-index dictionary (vocabulary)
            - itow: index-to-word dictionary (vocabulary)
            - query_lengths: lengths of the queries
        2) query_labels (.h5): qid -> label
        3) grounding_labels (.h5): qid -> label
    """

    """ Query information """
    if not os.path.exists(self.paths["query_labels"]):
        # build vocabulary from training data
        train_ann_path = "data/ActivityNet/captions/annotations/train.json"
        train_anns, _, _ = self._load_annotation(train_ann_path)
        wtoi = self._build_vocab(train_anns)
        itow = {v: k for k, v in wtoi.items()}

        # encode queries and save labels (+ lengths)
        L = config.get("max_length", 25)
        encoded = self._encode_query(self.anns, wtoi, L)
        query_labels = io_utils.open_hdf5(self.paths["query_labels"], "w")
        for qid in tqdm(encoded["query_lengths"].keys(), desc="Saving query"):
            _ = query_labels.create_dataset(
                str(qid), data=encoded["query_labels"][qid])
        query_labels.close()

        # save vocabulary and query lengths
        query_info = {
            "wtoi": wtoi,
            "itow": itow,
            "query_lengths": encoded["query_lengths"],
        }
        io_utils.write_json(self.paths["query_info"], query_info)

    """ Grounding information """
    if not os.path.exists(self.paths["grounding_info"]):
        if self.feature_type == "C3D":
            features = io_utils.load_hdf5(self.feat_hdf5)
        grd_dataset = io_utils.open_hdf5(self.paths["grounding_info"], "w")
        start_pos = grd_dataset.create_group("start_pos")
        end_pos = grd_dataset.create_group("end_pos")
        att_masks = grd_dataset.create_group("att_mask")

        for qid, ann in tqdm(self.anns.items(), desc="Gen. Grd. Labels"):
            # get normalized starting/ending positions
            ts = ann["timestamps"]
            vid_d = ann["duration"]
            start = ts[0] / vid_d
            end = ts[1] / vid_d

            # get attention calibration mask
            vid = ann["video_id"]
            nfeats = features[vid]["c3d_features"][:].shape[0]
            nfeats = min(nfeats, self.S)
            fs = utils.timestamp_to_featstamp(ts, nfeats, vid_d)
            att_mask = np.zeros((self.S))
            att_mask[fs[0]:fs[1] + 1] = 1

            _ = start_pos.create_dataset(qid, data=start, dtype="float")
            _ = end_pos.create_dataset(qid, data=end, dtype="float")
            _ = att_masks.create_dataset(qid, data=att_mask, dtype="float")

        # save the encoded grounding labels
        grd_dataset.close()
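
# NOTE (hedged sketch): _build_vocab and _encode_query are not shown; from
# how their outputs are used above, _encode_query is assumed to tokenize
# each annotation's sentence (the key name and tokenization here are
# assumptions), map tokens through wtoi with unknowns sent to a reserved
# index, and zero-pad to the max length L. A minimal version of that
# contract:
import numpy as np

def encode_query_sketch(anns, wtoi, L):
    labels, lengths = {}, {}
    for qid, ann in anns.items():
        tokens = ann["query"].lower().split()[:L]  # "query" key is assumed
        idxs = [wtoi.get(t, wtoi.get("<unk>", 0)) for t in tokens]
        lengths[qid] = len(idxs)
        labels[qid] = np.asarray(idxs + [0] * (L - len(idxs)), dtype=np.int64)
    return {"query_labels": labels, "query_lengths": lengths}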