def preprocess_subtitles_from_dir(srt_dir, save_path):
    """Parse all .srt files under `srt_dir` and save them as a single JSONL
    file at `save_path`. Each line is a dict with keys `vid_name` (the video
    name) and `sub` (a list of dicts holding the cleaned sentence `text` and
    its `start`/`end` times in seconds).
    """
    assert not os.path.exists(save_path), "File {} already exists".format(save_path)
    print("Loading srt files from %s ..." % srt_dir)
    srt_paths = glob.glob(os.path.join(srt_dir, "*.srt"))
    srt_datalist = []
    for sub_path in tqdm(srt_paths, desc="Loop over subtitle files"):
        subs = pysrt.open(sub_path, encoding="iso-8859-1")
        if len(subs) == 0:  # fall back to the default encoding
            subs = pysrt.open(sub_path)
        sub_data = []
        for cur_sub in subs:
            sub_data.append(
                dict(text=clean_single_sub_sentence(cur_sub.text),
                     start=convert_sub_time_to_seconds(cur_sub.start),
                     end=convert_sub_time_to_seconds(cur_sub.end)))
        srt_datalist.append(
            dict(vid_name=os.path.splitext(os.path.basename(sub_path))[0],
                 sub=sub_data))
    save_jsonl(srt_datalist, save_path)
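# The helpers above (`convert_sub_time_to_seconds`, `clean_single_sub_sentence`,
# `save_jsonl`) are called but not defined in this file. Minimal sketches of
# plausible implementations follow, assuming pysrt's SubRipTime fields and
# one-JSON-object-per-line output; the repo's real versions may differ.
import json


def convert_sub_time_to_seconds(sub_time):
    """Convert a pysrt SubRipTime to seconds as a float."""
    return (sub_time.hours * 3600 + sub_time.minutes * 60
            + sub_time.seconds + 0.001 * sub_time.milliseconds)


def clean_single_sub_sentence(sub_text):
    """Collapse newlines and repeated whitespace inside one subtitle block."""
    return " ".join(sub_text.split())


def save_jsonl(data, filepath):
    """Write a list of dicts to `filepath`, one JSON object per line."""
    with open(filepath, "w") as f:
        f.write("\n".join(json.dumps(e) for e in data))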
def get_tokenized_text(args, tokenizer, prefix=""):
    """Many of the extraction args are inherited from evaluation."""
    extract_dataset = load_and_cache_examples(tokenizer, args.train_data_file,
                                              args.block_size,
                                              sub_data_file=args.sub_data_file,
                                              filter_file_path=None,
                                              load_query=True,
                                              debug=args.debug)

    # Tokenize!
    logger.info("***** Running Tokenization {} *****".format(prefix))
    logger.info("  Num examples = %d", len(extract_dataset))

    # each element is a dict {id: str, text: raw text string,
    #                         tokens: tokenized text}
    tokenized_desc_data = []
    for d in tqdm(extract_dataset):
        tokenized_d = dict(
            id=d["id"],
            text=d["text"],
            tokens=tokenizer.convert_ids_to_tokens(d["text_ids"],
                                                   skip_special_tokens=True),
            tokenized_text=tokenizer.decode(d["text_ids"],
                                            skip_special_tokens=True,
                                            clean_up_tokenization_spaces=False))
        tokenized_desc_data.append(tokenized_d)

    output_extraction_file = os.path.join(args.output_dir,
                                          args.extracted_file_name)
    save_jsonl(tokenized_desc_data, output_extraction_file)
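# Hypothetical usage sketch; the `args` fields mirror what the function reads
# (train_data_file, block_size, sub_data_file, output_dir,
# extracted_file_name, debug) and the tokenizer choice is an assumption:
#
#   tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
#   get_tokenized_text(args, tokenizer, prefix="train")
#
# With a byte-level BPE tokenizer (GPT-2/RoBERTa style), each output JSONL
# line then looks like (values illustrative):
#   {"id": "...", "text": "raw text", "tokens": ["raw", "Ġtext"],
#    "tokenized_text": "raw text"}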
def eval_language_metrics(checkpoint, eval_data_loader, opt, model=None,
                          eval_mode="val"):
    """`eval_mode` can only be set to `val` here, as setting it to `test`
    would be cheating.
    0) run inference
    1) get METEOR, BLEU1-4, CIDEr scores
    2) get vocab size, sentence length
    """
    translator = Translator(opt, checkpoint, model=model)
    json_res = run_translate(eval_data_loader, translator, opt=opt)
    res_filepath = os.path.abspath(
        opt.save_model + "_tmp_greedy_pred_{}.json".format(eval_mode))
    save_jsonl(json_res, res_filepath)

    # COCO language evaluation
    reference_path = os.path.abspath(opt.reference_path)
    metrics_path = res_filepath.replace(".json", "_lang_metrics.json")
    eval_cmd = ["python", "evaluate.py", "-s", res_filepath,
                "-o", metrics_path, "-r", reference_path]
    subprocess.call(eval_cmd, cwd="standalone_eval")
    metrics = load_json(metrics_path)
    return metrics, [res_filepath, metrics_path]
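# Hypothetical caller sketch: use the returned metrics to keep the best
# checkpoint by CIDEr on the val split. The "CIDEr" key is an assumption based
# on the docstring; the actual JSON schema is produced by
# standalone_eval/evaluate.py.
#
#   metrics, (pred_path, metrics_path) = eval_language_metrics(
#       checkpoint, val_loader, opt, model=model, eval_mode="val")
#   if metrics["CIDEr"] > best_cider:
#       best_cider = metrics["CIDEr"]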
def main(opts):
    hvd.init()
    # load the tokenizer on rank 0 first to avoid a multi-process download
    # collision; all_gather_list(None) acts as a barrier
    if hvd.rank() == 0:
        toker = RobertaTokenizer.from_pretrained('roberta-base')
        all_gather_list(None)
    else:
        all_gather_list(None)
        toker = RobertaTokenizer.from_pretrained('roberta-base')

    model_opts = Struct(json.load(open(f"{opts.model_dir}/log/hps.json")))
    model_config = f"{opts.model_dir}/log/model_config.json"

    video_db = load_video_sub_dataset(model_opts.vfeat_db,
                                      model_opts.sub_txt_db,
                                      model_opts.vfeat_interval,
                                      model_opts)
    dset = TvcEvalDataset(video_db, opts.target_clip)
    loader = build_dataloader(dset, opts.batch_size,
                              TvcEvalDataset.collate, False, opts)

    checkpoint = torch.load(f"{opts.model_dir}/ckpt/"
                            f"model_step_{opts.ckpt_step}.pt")
    img_pos_embed_weight_key = ("v_encoder.f_encoder.img_embeddings"
                                ".position_embeddings.weight")
    if img_pos_embed_weight_key in checkpoint:
        max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])
    else:
        max_frm_seq_len = MAX_FRM_SEQ_LEN
    model = HeroForTvc.from_pretrained(model_config,
                                       state_dict=checkpoint,
                                       vfeat_dim=VFEAT_DIM,
                                       max_frm_seq_len=max_frm_seq_len,
                                       lsr=model_opts.lsr)
    model.cuda()
    model = amp.initialize(model, enabled=opts.fp16, opt_level='O2')

    bos = toker.convert_tokens_to_ids(['<s>'])[0]
    eos = toker.convert_tokens_to_ids(['</s>'])[0]
    model.eval()
    generator = TvcGenerator(model, opts.max_gen_step, bos, eos, opts.fp16)
    results = decode(loader, generator, toker)
    save_jsonl(results, opts.output)

    # evaluate the score if the target file contains reference descriptions
    if (hvd.rank() == 0
            and 'descs' in json.loads(next(iter(open(opts.target_clip))))):
        evaluator = TVCEval(opts.target_clip)
        score = evaluator(results)
        print(score)
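# `Struct` (used above to wrap hps.json) is not defined in this file. A
# minimal sketch, assuming all it needs to do is expose dict keys as
# attributes (e.g. model_opts.vfeat_db):
class Struct(object):
    def __init__(self, dict_):
        self.__dict__.update(dict_)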
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    else:
        raise ValueError('Found existing DB. Please explicitly remove '
                         'for re-processing')
    meta = vars(opts)
    meta['tokenizer'] = opts.toker
    toker = RobertaTokenizer.from_pretrained(opts.toker)
    tokenizer = roberta_tokenize(toker)
    meta['BOS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['EOS'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['SEP'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['CLS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['PAD'] = toker.convert_tokens_to_ids(['<pad>'])[0]
    meta['MASK'] = toker.convert_tokens_to_ids(['<mask>'])[0]
    meta['UNK'] = toker.convert_tokens_to_ids(['<unk>'])[0]
    meta['v_range'] = (toker.convert_tokens_to_ids(['.'])[0],
                       toker.convert_tokens_to_ids(['<|endoftext|>'])[0] + 1)
    save_json(vars(opts), f'{opts.output}/meta.json', save_pretty=True)

    open_db = curry(open_lmdb, opts.output, readonly=False)
    with open_db() as db:
        with open(opts.annotation, "r") as ann:
            if opts.task == "tvr":
                id2lens, query2video, query_data = process_tvr(
                    ann, db, tokenizer)
            elif opts.task == "tvqa":
                id2lens, query2video, query_data = process_tvqa(
                    ann, db, tokenizer)
            elif opts.task == "violin":
                id2lens, query2video, query_data = process_violin(
                    ann, db, tokenizer)
            else:
                raise NotImplementedError(
                    f"prepro for {opts.task} not implemented")

    save_json(id2lens, f'{opts.output}/id2len.json')
    save_json(query2video, f'{opts.output}/query2video.json')
    save_jsonl(query_data, f'{opts.output}/query_data.jsonl')
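# `curry` above pre-binds `open_lmdb`'s arguments so the DB is opened lazily
# by the later `open_db()` call. The repo may use a library helper (e.g.
# cytoolz.curry); for this call pattern, functools.partial is an equivalent
# minimal sketch:
from functools import partial as curry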
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu,
                                              hvd.rank(), opts.fp16))
    if hvd.rank() != 0:
        LOGGER.disabled = True

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)
    opts.task = 'tvc'

    LOGGER.info(f"Loading the whole video dataset {opts.sub_txt_db}, "
                f"{opts.vfeat_db}")
    video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                      opts.vfeat_interval, opts)

    # data loaders
    # train
    LOGGER.info(f"Loading train dataset {opts.train_db}")
    train_cap = CaptionTokLmdb(opts.train_db, opts.max_txt_len)
    train_dset = TvcTrainDataset(video_db, train_cap, opts.max_cap_per_vid)
    LOGGER.info(f"{sum(all_gather_list(len(train_dset)))} samples loaded")
    train_loader = build_dataloader(train_dset, opts.train_batch_size,
                                    TvcTrainDataset.collate, True, opts)

    # val
    LOGGER.info(f"Loading val dataset {opts.val_db}")
    val_cap = CaptionTokLmdb(opts.val_db, -1)
    val_dset = TvcValDataset(video_db, val_cap, -1)
    val_loader = build_dataloader(val_dset, opts.val_batch_size,
                                  TvcValDataset.collate, False, opts)
    if hvd.rank() == 0:
        evaluator = TVCEval(opts.val_ref)
    else:
        evaluator = NoOp()

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    img_pos_embed_weight_key = ("v_encoder.f_encoder.img_embeddings"
                                ".position_embeddings.weight")
    if img_pos_embed_weight_key in checkpoint:
        max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])
    else:
        max_frm_seq_len = MAX_FRM_SEQ_LEN

    model = HeroForTvc.from_pretrained(opts.model_config,
                                       state_dict=checkpoint,
                                       vfeat_dim=VFEAT_DIM,
                                       max_frm_seq_len=max_frm_seq_len,
                                       lsr=opts.lsr)
    model.to(device)
    # make sure every process has the same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    model, optimizer = amp.initialize(model, optimizer,
                                      enabled=opts.fp16, opt_level='O2')

    # assumes roberta tokenizer only
    if hvd.local_rank() == 0:
        # quick hack to prevent multi-process download collision
        toker = RobertaTokenizer.from_pretrained('roberta-base')
        all_gather_list(None)
    else:
        all_gather_list(None)
        toker = RobertaTokenizer.from_pretrained('roberta-base')
    bos = toker.convert_tokens_to_ids(['<s>'])[0]
    eos = toker.convert_tokens_to_ids(['</s>'])[0]
    generator = TvcGenerator(model, opts.max_gen_step, bos, eos, opts.fp16)

    global_step = 0
    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        os.makedirs(join(opts.output_dir, 'results'))  # store val predictions
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    train_loss = RunningMeter('loss')
    n_vid = 0
    n_cap = 0
    n_epoch = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    model.train()
    while True:
        for step, batch in enumerate(train_loader):
            n_vid += opts.train_batch_size
            n_cap += batch['cap_input_ids'].size(0)
            loss = model(batch, compute_loss=True)
            loss = loss.mean()
            train_loss(loss.item())

            delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
            with amp.scale_loss(loss, optimizer,
                                delay_unscale=delay_unscale) as scaled_loss:
                scaled_loss.backward()
                if not delay_unscale:
                    # gather gradients from every process
                    # do this before unscaling to make sure every process uses
                    # the same gradient scale
                    grads = [p.grad.data for p in model.parameters()
                             if p.requires_grad and p.grad is not None]
                    all_reduce_and_rescale_tensors(grads, float(1))

            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1

                # learning rate scheduling
                lr_this_step = get_lr_sched(global_step, opts)
                for i, param_group in enumerate(optimizer.param_groups):
                    if i == 0 or i == 1:
                        param_group['lr'] = lr_this_step * opts.lr_mul
                    elif i == 2 or i == 3:
                        param_group['lr'] = lr_this_step
                    else:
                        raise ValueError()
                TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

                # log loss
                TB_LOGGER.add_scalar(train_loss.name, train_loss.val,
                                     global_step)
                TB_LOGGER.step()

                # update model params
                if opts.grad_norm != -1:
                    grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
                optimizer.step()
                optimizer.zero_grad()
                pbar.update(1)

                if global_step % 100 == 0:
                    # monitor training throughput
                    LOGGER.info('-------------------------------------------')
                    LOGGER.info(f'Step {global_step}:')
                    tot_vid = sum(all_gather_list(n_vid))
                    vid_per_sec = int(tot_vid / (time() - start))
                    LOGGER.info(f'{tot_vid} videos trained at '
                                f'{vid_per_sec} vid/s')
                    tot_cap = sum(all_gather_list(n_cap))
                    cap_per_sec = int(tot_cap / (time() - start))
                    TB_LOGGER.add_scalar('perf/vid_per_s', vid_per_sec,
                                         global_step)
                    TB_LOGGER.add_scalar('perf/cap_per_s', cap_per_sec,
                                         global_step)

                if global_step % opts.valid_steps == 0:
                    LOGGER.info('===========================================')
                    LOGGER.info(f"Step {global_step}: start validation")
                    val_log, results = validate(val_loader, generator,
                                                toker, evaluator)
                    if hvd.rank() == 0:
                        save_jsonl(results,
                                   f"{opts.output_dir}/results/"
                                   f"results_{global_step}.jsonl")
                    TB_LOGGER.log_scaler_dict(val_log)
                    LOGGER.info('===========================================')
                    model_saver.save(model, global_step)
            if global_step >= opts.num_train_steps:
                break
        n_epoch += 1
        LOGGER.info(f"finished {n_epoch} epochs")
        if global_step >= opts.num_train_steps:
            break

    LOGGER.info('===========================================')
    if global_step % opts.valid_steps != 0:
        val_log, results = validate(val_loader, generator, toker, evaluator)
        if hvd.rank() == 0:
            save_jsonl(results,
                       f"{opts.output_dir}/results/"
                       f"results_{global_step}.jsonl")
        TB_LOGGER.log_scaler_dict(val_log)
        model_saver.save(model, global_step)
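# The LR scheduling above assumes `build_optimizer` returns exactly four
# param groups: groups 0-1 are scaled by `lr_mul` (typically the newly
# initialized decoder/head) and groups 2-3 use the base LR. A minimal sketch
# that satisfies this contract; the grouping rule (`n.startswith('decoder')`)
# and the `opts.weight_decay`/`opts.learning_rate` fields are assumptions,
# not the repo's actual implementation:
import torch


def build_optimizer_sketch(model, opts):
    no_decay = ['bias', 'LayerNorm.weight']
    head, base = [], []
    for n, p in model.named_parameters():
        (head if n.startswith('decoder') else base).append((n, p))
    groups = []
    for named in (head, base):  # groups 0-1 (lr_mul), then groups 2-3 (base)
        groups.append({'params': [p for n, p in named
                                  if not any(nd in n for nd in no_decay)],
                       'weight_decay': opts.weight_decay})
        groups.append({'params': [p for n, p in named
                                  if any(nd in n for nd in no_decay)],
                       'weight_decay': 0.0})
    return torch.optim.AdamW(groups, lr=opts.learning_rate)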
def main():
    parser = argparse.ArgumentParser(description="translate.py")
    parser.add_argument("-eval_split_name", choices=["val", "test_public"])
    parser.add_argument("-eval_path", type=str, help="Path to eval data")
    parser.add_argument("-reference_path", type=str, default=None,
                        help="Path to reference")
    parser.add_argument("-res_dir", required=True,
                        help="path to dir containing model .pt file")
    parser.add_argument("-batch_size", type=int, default=100,
                        help="batch size")

    # beam search configs
    parser.add_argument("-use_beam", action="store_true",
                        help="use beam search, otherwise greedy search")
    parser.add_argument("-beam_size", type=int, default=2, help="beam size")
    parser.add_argument("-n_best", type=int, default=1,
                        help="stop searching when get n_best from beam search")
    parser.add_argument("-min_sen_len", type=int, default=8,
                        help="minimum length of the decoded sentences")
    parser.add_argument("-max_sen_len", type=int, default=25,
                        help="maximum length of the decoded sentences")
    parser.add_argument("-block_ngram_repeat", type=int, default=0,
                        help="block repetition of ngrams during decoding.")
    parser.add_argument("-length_penalty_name", default="none",
                        choices=["none", "wu", "avg"],
                        help="length penalty to use.")
    parser.add_argument("-length_penalty_alpha", type=float, default=0.,
                        help="Google NMT length penalty parameter "
                             "(higher = longer generation)")
    parser.add_argument("-no_cuda", action="store_true")
    parser.add_argument("-seed", default=2019, type=int)
    parser.add_argument("-debug", action="store_true")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # random seed
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    checkpoint = torch.load(os.path.join(opt.res_dir, "model.chkpt"))

    decoding_strategy = "beam{}_lp_{}_la_{}".format(
        opt.beam_size, opt.length_penalty_name,
        opt.length_penalty_alpha) if opt.use_beam else "greedy"
    save_json(vars(opt),
              os.path.join(opt.res_dir,
                           "{}_eval_cfg.json".format(decoding_strategy)),
              save_pretty=True)

    # add some of the train configs
    train_opt = checkpoint["opt"]
    for k in train_opt.__dict__:
        if k not in opt.__dict__:
            setattr(opt, k, getattr(train_opt, k))
    if "ctx_mode" not in opt:
        # temp hack, since the first experiment does not have such a setting
        opt.ctx_mode = "video_sub"

    eval_data_loader = get_data_loader(opt)

    # setup model
    translator = Translator(opt, checkpoint)

    pred_file = os.path.join(
        opt.res_dir,
        "{}_pred_{}.jsonl".format(decoding_strategy, opt.eval_split_name))
    pred_file = os.path.abspath(pred_file)
    if not os.path.exists(pred_file):
        json_res = run_translate(eval_data_loader, translator, opt=opt)
        save_jsonl(json_res, pred_file)
    else:
        print("Using existing prediction file at {}".format(pred_file))

    if opt.reference_path:
        # COCO language evaluation
        reference_path = os.path.abspath(opt.reference_path)
        metrics_path = pred_file.replace(".jsonl", "_lang_metrics.json")
        eval_cmd = ["python", "evaluate.py", "-s", pred_file,
                    "-o", metrics_path, "-r", reference_path]
        subprocess.call(eval_cmd, cwd="standalone_eval")

    print("[Info] Finished {}.".format(opt.eval_split_name))
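# Example invocation (paths are placeholders, not from the original repo):
#
#   python translate.py -eval_split_name val -eval_path data/val.jsonl \
#       -reference_path data/val_ref.jsonl -res_dir results/model_dir \
#       -use_beam -beam_size 2
#
# With -use_beam, predictions are written to
# results/model_dir/beam2_lp_none_la_0.0_pred_val.jsonl; with greedy search,
# to results/model_dir/greedy_pred_val.jsonl.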