def get_video_ids(query_txt_db):
    if os.path.exists(f'{query_txt_db}/query2video.json'):
        q2v = load_json(f'{query_txt_db}/query2video.json')
        qids = load_json(f'{query_txt_db}/id2len.json').keys()
        video_ids = list(set(q2v[qid] for qid in qids))
    else:
        video_ids = load_json(f'{query_txt_db}/video_ids.json')
    return video_ids
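
# Usage sketch for get_video_ids (the DB path below is hypothetical; the two
# JSON layouts are the ones the branches above read):
#   query2video.json -> {"<query_id>": "<vid_name>", ...}   (query DBs)
#   video_ids.json   -> ["<vid_name>", ...]                 (plain DBs)
# e.g.:
#   video_ids = get_video_ids("txt_db/tvr_val.db")
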
def combine(video_name_split_path, video_duration_path, save_path):
    video_name_split = load_json(video_name_split_path)
    video_duration_dict = load_json(video_duration_path)
    combined_dict = {}
    for split_name, split_video_names in video_name_split.items():
        combined_dict[split_name] = {
            vid_name: video_duration_dict[vid_name]
            for vid_name in split_video_names
        }
    save_json(combined_dict, save_path)
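
# Shapes combine() expects, inferred from the dict comprehension above
# (hypothetical file names and values):
#   video_name_split.json -> {"val": ["vid_a", "vid_b"], "test": ["vid_c"]}
#   video_duration.json   -> {"vid_a": 61.2, "vid_b": 30.5, "vid_c": 45.0}
#   saved output          -> {"val": {"vid_a": 61.2, "vid_b": 30.5},
#                             "test": {"vid_c": 45.0}}
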
def load_transform_data(data_path):
    data = load_json(data_path)
    transformed_data = []
    for v_id, cap in data.items():
        cap["v_id"] = v_id
        transformed_data.append(cap)
    return transformed_data
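
# load_transform_data flattens a {video_id: caption_dict} mapping into a list
# of caption dicts, each carrying its own "v_id". Hypothetical example:
#   {"vid_a": {"desc": "..."}}  ->  [{"desc": "...", "v_id": "vid_a"}]
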
def save_training_meta(args):
    # Commented out: `rank` is not saved into `args`; the training scripts
    # already guard this call so only rank 0 reaches it.
    # if args.rank > 0:
    #     return

    # args is an EasyDict object, treat it the same as a normal dict
    os.makedirs(join(args.output_dir, 'log'), exist_ok=True)
    os.makedirs(join(args.output_dir, 'ckpt'), exist_ok=True)

    # training args
    save_args_path = join(args.output_dir, 'log', 'hps.json')
    save_json(vars(args), save_args_path, save_pretty=True)

    # model args
    model_config = load_json(args.model_config)
    save_model_config_path = join(args.output_dir, 'log', 'model_config.json')
    save_json(model_config, save_model_config_path, save_pretty=True)

    # git info
    try:
        LOGGER.info("Waiting on git info....")
        c = subprocess.run(["git", "rev-parse", "--abbrev-ref", "HEAD"],
                           timeout=10, stdout=subprocess.PIPE)
        git_branch_name = c.stdout.decode().strip()
        LOGGER.info("Git branch: %s", git_branch_name)
        c = subprocess.run(["git", "rev-parse", "HEAD"],
                           timeout=10, stdout=subprocess.PIPE)
        git_sha = c.stdout.decode().strip()
        LOGGER.info("Git SHA: %s", git_sha)
        git_dir = abspath(dirname(__file__))
        git_status = subprocess.check_output(
            ['git', 'status', '--short'],
            cwd=git_dir, universal_newlines=True).strip()
        with open(join(args.output_dir, 'log', 'git_info.json'),
                  'w') as writer:
            json.dump({'branch': git_branch_name,
                       'is_dirty': bool(git_status),
                       'status': git_status,
                       'sha': git_sha},
                      writer, indent=4)
    except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as e:
        LOGGER.exception(e)
        LOGGER.warning("Git info not found. Saving code into zip instead...")
        # save a copy of the codebase
        # NOTE: do not store heavy files in the codebase when using this.
        code_dir = dirname(dirname(realpath(__file__)))
        code_zip_filename = os.path.join(args.output_dir, "code.zip")
        LOGGER.info(f"Saving code from {code_dir} to {code_zip_filename}...")
        make_zipfile(code_dir, code_zip_filename,
                     enclosing_dir="code",
                     exclude_dirs_substring="results",
                     exclude_dirs=["results", "debug_results", "__pycache__"],
                     exclude_extensions=[".pyc", ".ipynb", ".swap"])
        LOGGER.info("Saving code done.")
def eval_language_metrics(checkpoint, eval_data_loader, opt,
                          model=None, eval_mode="val"):
    """eval_mode can only be set to `val` here, as setting it to `test` is cheating
    0, run inference
    1, Get METEOR, BLEU1-4, CIDEr scores
    2, Get vocab size, sentence length
    """
    translator = Translator(opt, checkpoint, model=model)
    json_res = run_translate(eval_data_loader, translator, opt=opt)
    res_filepath = os.path.abspath(
        opt.save_model + "_tmp_greedy_pred_{}.json".format(eval_mode))
    save_jsonl(json_res, res_filepath)

    # COCO language evaluation
    reference_path = os.path.abspath(opt.reference_path)
    metrics_path = res_filepath.replace(".json", "_lang_metrics.json")
    eval_cmd = ["python", "evaluate.py", "-s", res_filepath,
                "-o", metrics_path, "-r", reference_path]
    subprocess.call(eval_cmd, cwd="standalone_eval")
    metrics = load_json(metrics_path)
    return metrics, [res_filepath, metrics_path]
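
# The subprocess call above is equivalent to running the standalone scorer by
# hand from standalone_eval/ (file names here are placeholders):
#   python evaluate.py -s <preds>.json -o <preds>_lang_metrics.json -r <refs>
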
def load_model(opts, device):
    hps_file = f'{opts.output_dir}/log/hps.json'
    model_opts = Struct(load_json(hps_file))
    model_config = f'{opts.output_dir}/log/model_config.json'

    # Prepare model
    if exists(opts.checkpoint):
        ckpt_file = opts.checkpoint
    else:
        ckpt_file = f'{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt'
    checkpoint = torch.load(ckpt_file)
    img_pos_embed_weight_key = (
        "v_encoder.f_encoder.img_embeddings.position_embeddings.weight")
    assert img_pos_embed_weight_key in checkpoint
    max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])

    model = HeroForVcmr.from_pretrained(
        model_config,
        state_dict=checkpoint,
        vfeat_dim=VFEAT_DIM,
        max_frm_seq_len=max_frm_seq_len,
        lw_neg_ctx=model_opts.lw_neg_ctx,
        lw_neg_q=model_opts.lw_neg_q,
        lw_st_ed=0,
        ranking_loss_type=model_opts.ranking_loss_type,
        use_hard_negative=False,
        hard_pool_size=model_opts.hard_pool_size,
        margin=model_opts.margin,
        use_all_neg=model_opts.use_all_neg,
        drop_svmr_prob=model_opts.drop_svmr_prob)
    model.to(device)
    if opts.fp16:
        model = amp.initialize(model, enabled=opts.fp16, opt_level='O2')
    return model, model_opts
def load_external_vr_res2(external_vr_res_path, top_n_vr_videos=5):
    """Return a mapping from desc_id to top retrieved video info."""
    external_vr_res = load_json(external_vr_res_path)
    external_vr_res = get_submission_top_n(
        external_vr_res, top_n=top_n_vr_videos)["VR"]
    query2video = {e["desc_id"]: e["predictions"] for e in external_vr_res}
    return query2video
def load_saved_res(pred_path):
    if pred_path.endswith(".json"):
        pred = load_json(pred_path)
    else:
        pred = torch.load(pred_path)
    vcmr_res = {e["desc_id"]: e for e in pred["VCMR"]}
    video2idx = pred["video2idx"]
    return vcmr_res, video2idx
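
# load_saved_res accepts either a plain .json submission or a torch-pickled
# dict with the same "VCMR"/"video2idx" keys, e.g. (hypothetical paths):
#   vcmr_res, video2idx = load_saved_res("results/vcmr_preds.json")
#   vcmr_res, video2idx = load_saved_res("results/vcmr_preds.pt")
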
def __init__(self, ctx_mode, data_path, sub_meta_path,
             vid_h5_path_or_handler, word2idx_path, max_cap_len, max_v_len,
             max_sub_len, h5driver=None, clip_length=1.5,
             normalize_vfeat=True, is_eval=False, data_ratio=1.0):
    self.ctx_mode = ctx_mode
    self.use_video = "video" in ctx_mode
    self.use_sub = "sub" in ctx_mode
    self.is_eval = is_eval
    self.data_ratio = data_ratio
    self.word2idx = load_json(word2idx_path)
    self.idx2word = {int(v): k for k, v in self.word2idx.items()}
    self.clip_length = clip_length
    self.sub_meta_path = sub_meta_path
    self.max_v_len = max_v_len
    self.max_cap_len = max_cap_len  # sen
    self.max_sub_len = max_sub_len
    self.normalize_vfeat = normalize_vfeat

    if self.use_video:
        if isinstance(vid_h5_path_or_handler, h5py.File):
            self.vid_h5 = vid_h5_path_or_handler
        else:
            self.vid_h5 = h5py.File(vid_h5_path_or_handler, "r",
                                    driver=h5driver)
    if self.use_sub:
        self.sub_meta_dict = load_process_sub_meta(sub_meta_path, clip_length)
    self.data = self._load_data(data_path)
def load_external_vr_res_with_scores(external_vr_res_path, top_n_vr_videos=5):
    """Return a mapping from desc_id to top retrieved (vid_name, score)."""
    external_vr_res = load_json(external_vr_res_path)
    external_vr_res = get_submission_top_n(
        external_vr_res, top_n=top_n_vr_videos)["VR"]
    query2video = {
        e["desc_id"]: [[sub_e[0], sub_e[3]] for sub_e in e["predictions"]]
        for e in external_vr_res
    }
    return query2video
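
# Judging from the indexing above (and in load_external_vr_res2), each entry
# in "predictions" is a 4-element list whose first element identifies the
# video and whose fourth is the retrieval score; a hypothetical entry:
#   {"desc_id": 90200, "predictions": [[7, 0.0, 6.0, 12.4], ...]}
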
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    else:
        raise ValueError('Found existing DB. Please explicitly remove '
                         'for re-processing')
    meta = vars(opts)
    meta['tokenizer'] = opts.toker
    toker = RobertaTokenizer.from_pretrained(opts.toker)
    tokenizer = roberta_tokenize(toker)
    meta['BOS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['EOS'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['SEP'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['CLS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['PAD'] = toker.convert_tokens_to_ids(['<pad>'])[0]
    meta['MASK'] = toker.convert_tokens_to_ids(['<mask>'])[0]
    meta['UNK'] = toker.convert_tokens_to_ids(['<unk>'])[0]
    meta['v_range'] = (toker.convert_tokens_to_ids(['.'])[0],
                       toker.convert_tokens_to_ids(['<|endoftext|>'])[0] + 1)
    save_json(meta, f'{opts.output}/meta.json', save_pretty=True)

    open_db = curry(open_lmdb, opts.output, readonly=False)
    with open_db() as db:
        sub_info_cache_path = f'{opts.output}/sub_info.json'
        try:
            vid2nframe = load_json(opts.vid2nframe)
        except Exception:
            vid2nframe = None
        if not os.path.exists(sub_info_cache_path):
            video2sub_info = load_process_sub_meta(
                opts.annotation, vid2nframe,
                frame_length=opts.frame_length)
            save_json(video2sub_info, sub_info_cache_path)
        else:
            video2sub_info = load_json(sub_info_cache_path)
        with open(opts.annotation) as ann:
            vid2len, vid2max_frame_sub_len = process_tv_subtitles(
                ann, video2sub_info, db, tokenizer, meta['SEP'])
    save_json(vid2len, f'{opts.output}/vid2len.json')
    save_json(vid2max_frame_sub_len,
              f'{opts.output}/vid2max_frame_sub_len.json')
def __init__(self, dset_name, data_path, desc_bert_path_or_handler,
             sub_bert_path_or_handler, max_desc_len, max_ctx_len,
             vid_feat_path_or_handler, clip_length, ctx_mode="video",
             normalize_vfeat=True, normalize_tfeat=True, h5driver=None,
             data_ratio=1.0, video_duration_idx_path=None,
             eval_split_name=None):
    self.dset_name = dset_name
    self.data_path = data_path
    self.data_ratio = data_ratio
    self.desc_bert_path_or_handler = desc_bert_path_or_handler
    self.max_desc_len = max_desc_len
    self.sub_bert_path_or_handler = sub_bert_path_or_handler
    self.max_ctx_len = max_ctx_len
    self.vid_feat_path_or_handler = vid_feat_path_or_handler
    self.clip_length = clip_length
    self.ctx_mode = ctx_mode

    # prepare desc data
    self.data = load_jsonl(data_path)
    if self.data_ratio != 1:
        n_examples = int(len(self.data) * data_ratio)
        self.data = self.data[:n_examples]
        logger.info("Using {}% of the data: {} examples".format(
            data_ratio * 100, n_examples))

    self.use_video = "video" in self.ctx_mode
    self.use_sub = "sub" in self.ctx_mode
    self.use_tef = "tef" in self.ctx_mode

    if self.use_video:
        if isinstance(vid_feat_path_or_handler, h5py.File):
            self.vid_feat_h5 = vid_feat_path_or_handler
        else:  # str path
            self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r",
                                         driver=h5driver)
    if isinstance(desc_bert_path_or_handler, h5py.File):
        self.desc_bert_h5 = desc_bert_path_or_handler
    else:
        self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r",
                                      driver=h5driver)
    if self.use_sub:
        if isinstance(sub_bert_path_or_handler, h5py.File):
            self.sub_bert_h5 = sub_bert_path_or_handler
        else:  # str path
            self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r",
                                         driver=h5driver)

    self.normalize_vfeat = normalize_vfeat
    self.normalize_tfeat = normalize_tfeat

    if video_duration_idx_path is not None:
        video_data = load_json(video_duration_idx_path)[eval_split_name]
        self.video_data = [{"vid_name": k, "duration": v[0]}
                           for k, v in video_data.items()]
        self.video2idx = {k: v[1] for k, v in video_data.items()}
def __init__(self, dset_name, eval_split_name, data_path=None,
             desc_bert_path_or_handler=None, max_desc_len=None,
             max_ctx_len=None, sub_bert_path_or_handler=None,
             vid_feat_path_or_handler=None, video_duration_idx_path=None,
             clip_length=None, ctx_mode="video", data_mode="context",
             h5driver=None, data_ratio=1.0, normalize_vfeat=True,
             normalize_tfeat=True):
    self.dset_name = dset_name
    self.eval_split_name = eval_split_name
    self.ctx_mode = ctx_mode
    self.load_gt_video = False
    self.data_ratio = data_ratio  # only affects query data
    self.normalize_vfeat = normalize_vfeat
    self.normalize_tfeat = normalize_tfeat

    self.data_mode = None
    self.set_data_mode(data_mode)

    self.max_desc_len = max_desc_len
    self.max_ctx_len = max_ctx_len
    self.data_path = data_path
    self.query_data = load_jsonl(data_path)
    if data_ratio != 1:
        n_examples = int(len(self.query_data) * data_ratio)
        self.query_data = self.query_data[:n_examples]
        logger.info("Using {}% of the data: {} examples".format(
            data_ratio * 100, n_examples))

    if isinstance(desc_bert_path_or_handler, h5py.File):
        self.desc_bert_h5 = desc_bert_path_or_handler
    else:
        self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r",
                                      driver=h5driver)

    video_data = load_json(video_duration_idx_path)[self.eval_split_name]
    self.video_data = {k: v[0] for k, v in video_data.items()}
    self.video2idx = {k: v[1] for k, v in video_data.items()}
    self.clip_length = clip_length

    self.use_video = "video" in self.ctx_mode
    self.use_sub = "sub" in self.ctx_mode
    self.use_tef = "tef" in self.ctx_mode

    if self.use_video:
        if isinstance(vid_feat_path_or_handler, h5py.File):
            self.vid_feat_h5 = vid_feat_path_or_handler
        else:  # str path
            self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r",
                                         driver=h5driver)
    if self.use_sub:
        if isinstance(sub_bert_path_or_handler, h5py.File):
            self.sub_bert_h5 = sub_bert_path_or_handler
        else:  # str path
            self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r",
                                         driver=h5driver)
def build_dataloader(opts):
    # Load ground truth, query db and video db
    hps_file = f'{opts.output_dir}/log/hps.json'
    model_opts = Struct(load_json(hps_file))
    video_ids = get_video_ids(opts.query_txt_db)
    video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                      model_opts.vfeat_interval, model_opts)
    assert opts.split in opts.query_txt_db, (opts.split, opts.query_txt_db)
    q_txt_db = QueryTokLmdb(opts.query_txt_db, -1)

    eval_dataset = VcmrFullEvalDataset(
        video_ids, video_db, q_txt_db,
        distributed=model_opts.distributed_eval)
    eval_dataloader = DataLoader(eval_dataset, batch_size=opts.batch_size,
                                 num_workers=opts.n_workers,
                                 pin_memory=opts.pin_mem,
                                 collate_fn=vcmr_full_eval_collate)
    eval_dataloader = PrefetchLoader(eval_dataloader)
    return eval_dataloader
def main_convert():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--src_h5_file", type=str,
                        help="subtitle word-level feature .h5 file")
    parser.add_argument("--vid_clip_h5_file", type=str,
                        help="video clip-level feature .h5 file")
    parser.add_argument("--sub_meta_path", type=str,
                        help="processed subtitle .jsonl path")
    parser.add_argument("--tgt_h5_file", type=str,
                        help=".h5 path to store the converted data")
    parser.add_argument("--pool_type", type=str, default="max",
                        choices=["max", "avg"],
                        help="how to aggregate frame features")
    parser.add_argument("--clip_length", type=float, default=1.5)
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    sub_info_cache_path = args.tgt_h5_file.replace(".h5", "_sub_info.json")
    if not os.path.exists(sub_info_cache_path):
        video2sub_info = load_process_sub_meta(
            args.sub_meta_path, clip_length=args.clip_length)
        save_json(video2sub_info, sub_info_cache_path)
    else:
        video2sub_info = load_json(sub_info_cache_path)

    with h5py.File(args.src_h5_file, "r") as src_h5, \
            h5py.File(args.vid_clip_h5_file, "r") as vid_clip_h5, \
            h5py.File(args.tgt_h5_file, "w") as tgt_h5:
        convert_h5(src_h5, vid_clip_h5, tgt_h5, video2sub_info,
                   pool_type=args.pool_type, debug=args.debug)
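
# Example invocation (script name and paths are hypothetical):
#   python convert_sub_feats.py --src_h5_file sub_words.h5 \
#       --vid_clip_h5_file vid_clips.h5 --sub_meta_path tvr_subs.jsonl \
#       --tgt_h5_file sub_clips.h5 --pool_type max --clip_length 1.5
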
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    opts.n_gpu = n_gpu
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), opts.fp16))
    if hvd.rank() != 0:
        LOGGER.disabled = True
    set_random_seed(opts.seed)

    # train_examples = None
    LOGGER.info(f"Loading the whole video dataset {opts.sub_txt_db}, "
                f"{opts.vfeat_db}")
    if opts.task != "didemo_video_only":
        video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                          opts.vfeat_interval, opts)
    else:
        txt_meta = load_json(join(opts.train_query_txt_db, "meta.json"))
        video_db = load_video_only_dataset(opts.vfeat_db, txt_meta,
                                           opts.vfeat_interval, opts)

    # data loaders
    # train
    video_ids = get_video_ids(opts.train_query_txt_db)
    train_q_txt_db = QueryTokLmdb(opts.train_query_txt_db, opts.max_txt_len)
    train_dataloaders = build_downstream_dataloaders(
        [opts.task], video_db, video_ids, True, opts,
        shuffle=True, q_txt_db=train_q_txt_db)
    meta_loader = MetaLoader(train_dataloaders,
                             accum_steps=opts.gradient_accumulation_steps,
                             distributed=n_gpu > 1)
    meta_loader = PrefetchLoader(meta_loader)

    # val
    video_ids = get_video_ids(opts.val_query_txt_db)
    val_q_txt_db = QueryTokLmdb(opts.val_query_txt_db, -1)
    val_dataloaders = build_downstream_dataloaders(
        [opts.task], video_db, video_ids, False, opts,
        q_txt_db=val_q_txt_db)

    if opts.task != "didemo_video_only":
        inf_dataset = VcmrFullEvalDataset
    else:
        inf_dataset = VcmrVideoOnlyFullEvalDataset
    LOGGER.info(f"Loading Inference Dataset {opts.val_query_txt_db} (val)")
    val_dset = inf_dataset(video_ids, video_db, val_q_txt_db,
                           distributed=opts.distributed_eval)
    inf_loader_val = DataLoader(val_dset,
                                batch_size=opts.vcmr_eval_q_batch_size,
                                num_workers=opts.n_workers,
                                pin_memory=opts.pin_mem,
                                collate_fn=vcmr_full_eval_collate)
    inf_loader_val = PrefetchLoader(inf_loader_val)
    if opts.test_query_txt_db:
        LOGGER.info(
            f"Loading Inference Dataset {opts.test_query_txt_db} (test)")
        video_ids = get_video_ids(opts.test_query_txt_db)
        test_q_txt_db = QueryTokLmdb(opts.test_query_txt_db, -1)
        test_dset = inf_dataset(video_ids, video_db, test_q_txt_db,
                                distributed=opts.distributed_eval)
        inf_loader_test = DataLoader(test_dset,
                                     batch_size=opts.vcmr_eval_q_batch_size,
                                     num_workers=opts.n_workers,
                                     pin_memory=opts.pin_mem,
                                     collate_fn=vcmr_full_eval_collate)
        inf_loader_test = PrefetchLoader(inf_loader_test)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    img_pos_embed_weight_key = ("v_encoder.f_encoder.img_embeddings"
                                ".position_embeddings.weight")
    if img_pos_embed_weight_key in checkpoint:
        max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])
    else:
        max_frm_seq_len = MAX_FRM_SEQ_LEN

    model = HeroForVcmr.from_pretrained(
        opts.model_config,
        state_dict=checkpoint,
        vfeat_dim=VFEAT_DIM,
        max_frm_seq_len=max_frm_seq_len,
        lw_neg_ctx=opts.lw_neg_ctx,
        lw_neg_q=opts.lw_neg_q,
        lw_st_ed=0,
        ranking_loss_type=opts.ranking_loss_type,
        use_hard_negative=False,
        hard_pool_size=opts.hard_pool_size,
        margin=opts.margin,
        use_all_neg=opts.use_all_neg,
        drop_svmr_prob=opts.drop_svmr_prob)
    model.to(device)
    # make sure every process has the same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())}
    model, optimizer = amp.initialize(model, optimizer,
                                      num_losses=len(task2scaler),
                                      enabled=opts.fp16, opt_level='O2')
    restorer = TrainingRestorer(opts, model, optimizer)
    global_step = restorer.global_step
    TB_LOGGER.global_step = global_step
    if hvd.rank() == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        if not exists(join(opts.output_dir, 'results')):
            # store tvr predictions
            os.makedirs(join(opts.output_dir, 'results'))
        if opts.nms_thd != -1:
            # store tvr-nms predictions
            if not exists(join(opts.output_dir, 'results_nms')):
                os.makedirs(join(opts.output_dir, 'results_nms'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        pbar = NoOp()
        model_saver = NoOp()
        restorer = NoOp()

    if global_step > 0:
        pbar.update(global_step)
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    task2loss = {task: RunningMeter(f'loss/{task}')
                 for task in train_dataloaders.keys()}
    for obj in (f'{opts.task}_st_ed', f'{opts.task}_neg_ctx',
                f'{opts.task}_neg_q'):
        task2loss[obj] = RunningMeter(f'loss/{obj}')

    model.train()
    n_examples = defaultdict(int)
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    if global_step == 0:
        optimizer.step()
    for step, (task, batch) in enumerate(meta_loader):
        if len(opts.hard_negtiave_start_step) > 0:
            for i, hn_step in enumerate(opts.hard_negtiave_start_step):
                if global_step >= hn_step and hn_step != -1:
                    model.set_hard_negative(True, opts.hard_pool_size[i],
                                            opts.hard_neg_weights[i])
        if opts.train_span_start_step != -1 and\
                global_step >= opts.train_span_start_step:
            model.set_train_st_ed(opts.lw_st_ed)

        n_examples[task] += opts.train_batch_size

        loss = model(batch, task=task, compute_loss=True)
        loss_st_ed, loss_neg_ctx, loss_neg_q = loss
        loss = loss_st_ed + loss_neg_ctx + loss_neg_q
        for n, ls, w in (('st_ed', loss_st_ed, opts.lw_st_ed),
                         ('neg_ctx', loss_neg_ctx, opts.lw_neg_ctx),
                         ('neg_q', loss_neg_q, opts.lw_neg_q)):
            ls = ls.item()
            if w:
                ls /= w
            task2loss[f'{task}_{n}'](ls)

        loss = loss.mean()
        task2loss[task](loss.item())

        delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
        with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale,
                            loss_id=task2scaler[task]) as scaled_loss:
            scaled_loss.backward()
            if not delay_unscale:
                # gather gradients from every process
                # do this before unscaling to make sure every process uses
                # the same gradient scale
                grads = [p.grad.data for p in model.parameters()
                         if p.requires_grad and p.grad is not None]
                all_reduce_and_rescale_tensors(grads, float(1))

        if (step + 1) % opts.gradient_accumulation_steps == 0:
            global_step += 1

            # learning rate scheduling
            lr_this_step = get_lr_sched(global_step, opts)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

            # log loss
            TB_LOGGER.log_scaler_dict({temp_loss.name: temp_loss.val
                                       for temp_loss in task2loss.values()
                                       if temp_loss.val is not None})
            TB_LOGGER.step()

            # update model params
            if opts.grad_norm != -1:
                grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                            opts.grad_norm)
                TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
            optimizer.step()
            optimizer.zero_grad()
            pbar.update(1)

            if global_step % 100 == 0:
                # monitor training throughput
                LOGGER.info('-------------------------------------------')
                LOGGER.info(f'Step {global_step}:')
                for t in train_dataloaders.keys():
                    tot_ex = sum(all_gather_list(n_examples[t]))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f'{t}: {tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec,
                                         global_step)

            if global_step % opts.valid_steps == 0:
                LOGGER.info('===========================================')
                LOGGER.info(f"Step {global_step}: start running validation")
                validate(model, val_dataloaders, opts)
                if hvd.rank() == 0 or opts.distributed_eval:
                    log, results = validate_full_vcmr(model, inf_loader_val,
                                                      'val', opts,
                                                      model_opts=opts)
                    save_json(results,
                              f'{opts.output_dir}/results/'
                              f'val_results_{global_step}'
                              f'_rank{hvd.rank()}.json')
                    TB_LOGGER.log_scaler_dict(log)
                    if opts.test_query_txt_db:
                        log, results = validate_full_vcmr(
                            model, inf_loader_test, 'test', opts,
                            model_opts=opts)
                        save_json(results,
                                  f'{opts.output_dir}/results/'
                                  f'test_results_{global_step}'
                                  f'_rank{hvd.rank()}.json')
                        TB_LOGGER.log_scaler_dict(log)
                LOGGER.info('===========================================')
                model_saver.save(model, global_step)

        # step restorer in the end to prevent missing validation checkpoint
        restorer.step()
        if global_step >= opts.num_train_steps:
            break

    LOGGER.info('===========================================')
    if global_step % opts.valid_steps != 0:
        if hvd.rank() == 0 or opts.distributed_eval:
            log, results = validate_full_vcmr(model, inf_loader_val,
                                              'val', opts, model_opts=opts)
            save_json(results,
                      f'{opts.output_dir}/results/'
                      f'val_results_{global_step}'
                      f'_rank{hvd.rank()}_final.json')
            TB_LOGGER.log_scaler_dict(log)
            if opts.test_query_txt_db:
                log, results = validate_full_vcmr(model, inf_loader_test,
                                                  'test', opts,
                                                  model_opts=opts)
                save_json(results,
                          f'{opts.output_dir}/results/'
                          f'test_results_{global_step}'
                          f'_rank{hvd.rank()}.json')
                TB_LOGGER.log_scaler_dict(log)
        model_saver.save(model, f'{global_step}_final')
def parse(self): if not self.initialized: self.initialize() opt = self.parser.parse_args() if opt.debug: opt.results_root = os.path.sep.join( opt.results_root.split(os.path.sep)[:-1] + [ "debug_results", ]) opt.no_core_driver = True opt.num_workers = 0 opt.eval_query_bsz = 100 if isinstance(self, TestOptions): # modify model_dir to absolute path opt.model_dir = os.path.join( os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir) saved_options = load_json( os.path.join(opt.model_dir, self.saved_option_filename)) for arg in saved_options: # use saved options to overwrite all BaseOptions args. if arg not in [ "results_root", "num_workers", "nms_thd", "debug", "eval_split_name", "eval_path", "eval_query_bsz", "eval_context_bsz", "max_pred_l", "min_pred_l", "external_inference_vr_res_path" ]: setattr(opt, arg, saved_options[arg]) # opt.no_core_driver = True else: if opt.exp_id is None: raise ValueError( "--exp_id is required for at a training option!") if opt.clip_length is None: opt.clip_length = ProposalConfigs[opt.dset_name]["clip_length"] print("Loaded clip_length {} from proposal config file".format( opt.clip_length)) opt.results_dir = os.path.join( opt.results_root, "-".join([ opt.dset_name, opt.ctx_mode, opt.exp_id, time.strftime("%Y_%m_%d_%H_%M_%S") ])) mkdirp(opt.results_dir) # save a copy of current code code_dir = os.path.dirname(os.path.realpath(__file__)) code_zip_filename = os.path.join(opt.results_dir, "code.zip") make_zipfile( code_dir, code_zip_filename, enclosing_dir="code", exclude_dirs_substring="results", exclude_dirs=["results", "debug_results", "__pycache__"], exclude_extensions=[".pyc", ".ipynb", ".swap"], ) self.display_save(opt) if "sub" in opt.ctx_mode: assert opt.dset_name == "tvr", "sub is only supported for tvr dataset" if opt.hard_negtiave_start_epoch != -1: if opt.hard_pool_size > opt.bsz: print("[WARNING] hard_pool_size is larger than bsz") assert opt.stop_task in opt.eval_tasks_at_training opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename) opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename) opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename) opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir) opt.device = torch.device( "cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu") opt.h5driver = None if opt.no_core_driver else "core" # num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5 opt.num_workers = 1 if opt.no_core_driver else opt.num_workers opt.pin_memory = not opt.no_pin_memory if "video" in opt.ctx_mode and opt.vid_feat_size > 3000: # 3072, the normalized concatenation of resnet+i3d assert opt.no_norm_vfeat if "tef" in opt.ctx_mode and "video" in opt.ctx_mode: opt.vid_feat_size += 2 if "tef" in opt.ctx_mode and "sub" in opt.ctx_mode: opt.sub_feat_size += 2 if "video" not in opt.ctx_mode or "sub" not in opt.ctx_mode: opt.no_merge_two_stream = True opt.no_cross_att = True self.opt = opt return opt
def __init__(self, dset_name, data_path, desc_bert_path, sub_bert_path,
             max_desc_len, vid_feat_path, clip_length, vid_feat_size,
             sub_feat_size=0, ctx_mode="video_tef", pos_iou_thd=0.7,
             neg_iou_thd=0.3, h5driver=None, data_ratio=1.0,
             normalize_vfeat=True, normalize_tfeat=True, model_type="cal",
             external_train_vr_res_path=None, video_duration_idx_path=None):
    self.dset_name = dset_name
    self.model_type = model_type
    self.pool_local = model_type == "mcn"  # pool local feature
    self.data_path = data_path
    self.data_ratio = data_ratio
    self.desc_bert_path = desc_bert_path
    self.max_desc_len = max_desc_len
    self.sub_bert_path = sub_bert_path
    self.vid_feat_path = vid_feat_path
    self.clip_length = clip_length
    self.ctx_mode = ctx_mode
    self.pos_iou_thd = pos_iou_thd
    self.neg_iou_thd = neg_iou_thd
    self.vid_feat_output_size = (2 * vid_feat_size * ("video" in ctx_mode)
                                 + 2 * ("tef" in ctx_mode))
    self.sub_feat_output_size = (2 * sub_feat_size * ("sub" in ctx_mode)
                                 + 2 * ("tef" in ctx_mode))

    # prepare desc data
    self.data = load_jsonl(data_path)
    if self.data_ratio != 1:
        n_examples = int(len(self.data) * data_ratio)
        self.data = self.data[:n_examples]
        logger.info("Using {}% of the data: {} examples".format(
            data_ratio * 100, n_examples))

    self.proposal_fn = get_proposal_interface(dset_name)

    if self.ctx_mode != "tef":
        self.vid_feat_h5 = h5py.File(self.vid_feat_path, "r",
                                     driver=h5driver)
    self.desc_bert_h5 = h5py.File(self.desc_bert_path, "r", driver=h5driver)
    if "sub" in self.ctx_mode:
        self.sub_bert_h5 = h5py.File(self.sub_bert_path, "r",
                                     driver=h5driver)

    self.normalize_vfeat = normalize_vfeat
    self.normalize_tfeat = normalize_tfeat
    self.use_video = "video" in self.ctx_mode
    self.use_sub = "sub" in self.ctx_mode
    self.use_tef = "tef" in self.ctx_mode

    if external_train_vr_res_path is not None:
        video_data = load_json(video_duration_idx_path)["train"]
        # {video_idx: [vid_name, vid_duration]}
        video_idx2name_dur_pair = {v[1]: [k, v[0]]
                                   for k, v in video_data.items()}
        external_vr_res = load_json(external_train_vr_res_path)
        # {desc_id: [(vid_name, vid_duration), ...]}, ordered
        self.desc_id2video_names_dur_pairs = {
            e["desc_id"]: [video_idx2name_dur_pair[int(sub_e[0])]
                           for sub_e in e["predictions"]]
            for e in external_vr_res["VR"]}
def parse(self): if not self.initialized: self.initialize() opt = self.parser.parse_args() if opt.debug: opt.results_root = os.path.sep.join( opt.results_root.split(os.path.sep)[:-1] + [ "debug_results", ]) opt.no_core_driver = True opt.num_workers = 0 if isinstance(self, TestOptions): # modify model_dir to absolute path opt.model_dir = os.path.join( os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir) saved_options = load_json( os.path.join(opt.model_dir, self.saved_option_filename)) for arg in saved_options: # use saved options to overwrite all BaseOptions args. if arg not in [ "results_root", "num_workers", "nms_thd", "debug", "eval_split_name", "eval_path", "use_intermediate", "external_inference_vr_res_path" ]: setattr(opt, arg, saved_options[arg]) # opt.no_core_driver = True else: if opt.exp_id is None: raise ValueError( "--exp_id is required for at a training option!") if opt.clip_length is None: opt.clip_length = ProposalConfigs[opt.dset_name]["clip_length"] opt.results_dir = os.path.join( opt.results_root, "-".join([ opt.dset_name, opt.model_type, opt.ctx_mode, opt.exp_id, time.strftime("%Y_%m_%d_%H_%M_%S") ])) mkdirp(opt.results_dir) # save a copy of current code code_dir = os.path.dirname(os.path.realpath(__file__)) code_zip_filename = os.path.join(opt.results_dir, "code.zip") make_zipfile( code_dir, code_zip_filename, enclosing_dir="code", exclude_dirs_substring="results", exclude_dirs=["results", "debug_results", "__pycache__"], exclude_extensions=[".pyc", ".ipynb", ".swap"]) self.save_args(opt) if "sub" in opt.ctx_mode: assert opt.dset_name == "tvr", "sub is only supported for tvr dataset" if "video" in opt.ctx_mode and opt.vid_feat_size > 3000: # 3072, the normalized concatenation of resnet+i3d assert opt.no_norm_vfeat opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename) opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename) opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename) opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir) opt.device = torch.device( "cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu") opt.h5driver = None if opt.no_core_driver else "core" # num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5 opt.pin_memory = not opt.no_pin_memory opt.num_workers = 1 if opt.no_core_driver else opt.num_workers # Display settings print("------------ Options -------------\n{}\n-------------------". format({str(k): str(v) for k, v in sorted(vars(opt).items())})) self.opt = opt return opt
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), opts.fp16))
    if hvd.rank() != 0:
        LOGGER.disabled = True

    hps_file = f'{opts.output_dir}/log/hps.json'
    model_opts = Struct(load_json(hps_file))
    model_config = f'{opts.output_dir}/log/model_config.json'

    # load DBs and image dirs
    video_ids = get_video_ids(opts.query_txt_db)
    if opts.task != "didemo_video_only":
        video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                          model_opts.vfeat_interval,
                                          model_opts)
    else:
        txt_meta = load_json(os.path.join(opts.query_txt_db, "meta.json"))
        video_db = load_video_only_dataset(opts.vfeat_db, txt_meta,
                                           model_opts.vfeat_interval,
                                           model_opts)
    assert opts.split in opts.query_txt_db
    q_txt_db = QueryTokLmdb(opts.query_txt_db, -1)
    if opts.task != "didemo_video_only":
        inf_dataset = VcmrFullEvalDataset
    else:
        inf_dataset = VcmrVideoOnlyFullEvalDataset
    eval_dataset = inf_dataset(video_ids, video_db, q_txt_db,
                               distributed=model_opts.distributed_eval)

    # Prepare model
    if exists(opts.checkpoint):
        ckpt_file = opts.checkpoint
    else:
        ckpt_file = f'{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt'
    checkpoint = torch.load(ckpt_file)
    img_pos_embed_weight_key = (
        "v_encoder.f_encoder.img_embeddings.position_embeddings.weight")
    assert img_pos_embed_weight_key in checkpoint
    max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])

    model = HeroForVcmr.from_pretrained(
        model_config,
        state_dict=checkpoint,
        vfeat_dim=VFEAT_DIM,
        max_frm_seq_len=max_frm_seq_len,
        lw_neg_ctx=model_opts.lw_neg_ctx,
        lw_neg_q=model_opts.lw_neg_q,
        lw_st_ed=0,
        ranking_loss_type=model_opts.ranking_loss_type,
        use_hard_negative=False,
        hard_pool_size=model_opts.hard_pool_size,
        margin=model_opts.margin,
        use_all_neg=model_opts.use_all_neg,
        drop_svmr_prob=model_opts.drop_svmr_prob)
    model.to(device)
    if opts.fp16:
        model = amp.initialize(model, enabled=opts.fp16, opt_level='O2')

    eval_dataloader = DataLoader(eval_dataset, batch_size=opts.batch_size,
                                 num_workers=opts.n_workers,
                                 pin_memory=opts.pin_mem,
                                 collate_fn=vcmr_full_eval_collate)
    eval_dataloader = PrefetchLoader(eval_dataloader)

    _, results = validate_full_vcmr(model, eval_dataloader,
                                    opts.split, opts, model_opts)

    result_dir = f'{opts.output_dir}/results_{opts.split}'
    if not exists(result_dir) and rank == 0:
        os.makedirs(result_dir)

    all_results_list = all_gather_list(results)
    if hvd.rank() == 0:  # save only once
        all_results = {"video2idx": all_results_list[0]["video2idx"]}
        for rank_id in range(hvd.size()):
            for key, val in all_results_list[rank_id].items():
                if key == "video2idx":
                    continue
                if key not in all_results:
                    all_results[key] = []
                all_results[key].extend(all_results_list[rank_id][key])
        LOGGER.info('All results joined......')
        # save_vr(all_results,
        #         f'{result_dir}/results_{opts.checkpoint}_{opts.split}'
        #         f'_vr.json')
        # save_vcmr_base_on_vr(
        #     all_results,
        #     f'{result_dir}/results_{opts.checkpoint}_{opts.split}'
        #     f'_vcmr_base_on_vr.json')
        save_vcmr(all_results,
                  f'{result_dir}/results_{opts.checkpoint}_{opts.split}'
                  f'_vcmr.json')
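
# This entry point is meant to run under Horovod; a typical launch on 4 GPUs
# might look like (script name and option values are hypothetical):
#   horovodrun -np 4 python eval_vcmr.py --query_txt_db txt_db/tvr_val.db \
#       --split val --output_dir /path/to/exp --checkpoint 5000 --fp16
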