Example #1
def preprocess_subtitles_from_dir(srt_dir, save_path):
    """
    return: A python dict, the keys are the video names, the entries are lists,
            each contains all the text from a .srt file
    sub_times are the start time of the sentences.
    """
    assert not os.path.exists(save_path), "File {} already exists".format(
        save_path)

    print("Loading srt files from %s ..." % srt_dir)
    srt_paths = glob.glob(os.path.join(srt_dir, "*.srt"))
    srt_datalist = []
    for sub_path in tqdm(srt_paths, desc="Loop over subtitle files"):
        subs = pysrt.open(sub_path, encoding="iso-8859-1")
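        # fall back to pysrt's default encoding (utf-8) if nothing parsed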
        if len(subs) == 0:
            subs = pysrt.open(sub_path)

        sub_data = []
        for cur_sub in subs:
            sub_data.append(
                dict(text=clean_single_sub_sentence(cur_sub.text),
                     start=convert_sub_time_to_seconds(cur_sub.start),
                     end=convert_sub_time_to_seconds(cur_sub.end)))

        srt_datalist.append(
            dict(vid_name=os.path.splitext(os.path.basename(sub_path))[0],
                 sub=sub_data))
    save_jsonl(srt_datalist, save_path)
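`save_jsonl` and the two helpers are imported from elsewhere in the project; a minimal sketch of plausible implementations (the `pysrt` time fields are real, the exact cleaning rules are assumptions):

import json
import re

def save_jsonl(data, filename):
    """Write a list of dicts as one JSON object per line."""
    with open(filename, "w") as f:
        f.write("\n".join(json.dumps(e) for e in data))

def convert_sub_time_to_seconds(sub_time):
    """`sub_time` is a pysrt SubRipTime; flatten it to float seconds."""
    return (3600 * sub_time.hours + 60 * sub_time.minutes
            + sub_time.seconds + 0.001 * sub_time.milliseconds)

def clean_single_sub_sentence(sub_text):
    """Strip line breaks and <i>-style markup commonly found in .srt files."""
    sub_text = re.sub(r"<[^>]+>", "", sub_text.replace("\n", " "))
    return re.sub(r"\s+", " ", sub_text).strip()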
Example #2
def get_tokenized_text(args, tokenizer, prefix=""):
    """Many of the extraction args are inherited from evaluation"""
    extract_dataset = load_and_cache_examples(tokenizer, args.train_data_file, args.block_size,
                                              sub_data_file=args.sub_data_file,
                                              filter_file_path=None,
                                              load_query=True,
                                              debug=args.debug)

    # Tokenize!
    logger.info("***** Running Tokenization {} *****".format(prefix))
    logger.info("  Num examples = %d", len(extract_dataset))

    tokenized_desc_data = []  # each element is a dict {id: str, text: raw text string, tokens: tokenized text}
    for d in tqdm(extract_dataset):
        tokenized_d = dict(
            id=d["id"],
            text=d["text"],
            tokens=tokenizer.convert_ids_to_tokens(d["text_ids"], skip_special_tokens=True),
            tokenized_text=tokenizer.decode(d["text_ids"],
                                            skip_special_tokens=True,
                                            clean_up_tokenization_spaces=False))
        tokenized_desc_data.append(tokenized_d)

    output_extraction_file = os.path.join(args.output_dir, args.extracted_file_name)
    save_jsonl(tokenized_desc_data, output_extraction_file)
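`convert_ids_to_tokens` and `decode` give two views of the same ids: the first keeps the subword pieces, the second reassembles the string. A small illustration with a HuggingFace tokenizer (roberta-base here is only an example; the actual tokenizer comes from `args`):

from transformers import RobertaTokenizer

toker = RobertaTokenizer.from_pretrained("roberta-base")
ids = toker.encode("Sheldon knocks on the door.")
# token-level view: BPE pieces, with 'Ġ' marking word boundaries
print(toker.convert_ids_to_tokens(ids, skip_special_tokens=True))
# string-level view: the reassembled sentence, <s>/</s> removed
print(toker.decode(ids, skip_special_tokens=True,
                   clean_up_tokenization_spaces=False))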
Example #3
def eval_language_metrics(checkpoint,
                          eval_data_loader,
                          opt,
                          model=None,
                          eval_mode="val"):
    """eval_mode can only be set to `val` here, as setting to `test` is cheating
    0, run inference
    1, Get METEOR, BLEU1-4, CIDEr scores
    2, Get vocab size, sentence length
    """
    translator = Translator(opt, checkpoint, model=model)
    json_res = run_translate(eval_data_loader, translator, opt=opt)
    res_filepath = os.path.abspath(
        opt.save_model + "_tmp_greedy_pred_{}.json".format(eval_mode))
    save_jsonl(json_res, res_filepath)

    # COCO language evaluation
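    # paths must be absolute because the evaluation subprocess below runs
    # with cwd="standalone_eval"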
    reference_path = os.path.abspath(opt.reference_path)
    metrics_path = res_filepath.replace(".json", "_lang_metrics.json")
    eval_cmd = [
        "python", "evaluate.py", "-s", res_filepath, "-o", metrics_path, "-r",
        reference_path
    ]
    subprocess.call(eval_cmd, cwd="standalone_eval")
    metrics = load_json(metrics_path)
    return metrics, [res_filepath, metrics_path]
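A typical call site during training (`val_loader` and `opt` follow the function's signature; the unpacking mirrors the return value):

metrics, (pred_path, metrics_path) = eval_language_metrics(
    checkpoint, val_loader, opt, eval_mode="val")
print(metrics)       # METEOR / BLEU1-4 / CIDEr scores as written by evaluate.py
print(metrics_path)  # *_lang_metrics.json saved next to the predictions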
Example #4
def main(opts):
    hvd.init()
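    # download the tokenizer on rank 0 first; the paired all_gather_list(None)
    # calls act as a barrier preventing a multi-process download collision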
    if hvd.rank() == 0:
        toker = RobertaTokenizer.from_pretrained('roberta-base')
        all_gather_list(None)
    else:
        all_gather_list(None)
        toker = RobertaTokenizer.from_pretrained('roberta-base')

    model_opts = Struct(json.load(open(f"{opts.model_dir}/log/hps.json")))
    model_config = f"{opts.model_dir}/log/model_config.json"

    video_db = load_video_sub_dataset(model_opts.vfeat_db,
                                      model_opts.sub_txt_db,
                                      model_opts.vfeat_interval,
                                      model_opts)
    dset = TvcEvalDataset(video_db, opts.target_clip)
    loader = build_dataloader(dset, opts.batch_size,
                              TvcEvalDataset.collate, False, opts)

    checkpoint = torch.load(f"{opts.model_dir}/ckpt/"
                            f"model_step_{opts.ckpt_step}.pt")

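    # recover the max video-frame sequence length from the checkpoint's learned
    # position-embedding table when present, else fall back to the constant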
    img_pos_embed_weight_key = "v_encoder.f_encoder.img_embeddings" +\
        ".position_embeddings.weight"
    if img_pos_embed_weight_key in checkpoint:
        max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])
    else:
        max_frm_seq_len = MAX_FRM_SEQ_LEN

    model = HeroForTvc.from_pretrained(model_config,
                                       state_dict=checkpoint,
                                       vfeat_dim=VFEAT_DIM,
                                       max_frm_seq_len=max_frm_seq_len,
                                       lsr=model_opts.lsr)
    model.cuda()
    model = amp.initialize(model, enabled=opts.fp16, opt_level='O2')

    bos = toker.convert_tokens_to_ids(['<s>'])[0]
    eos = toker.convert_tokens_to_ids(['</s>'])[0]
    model.eval()
    generator = TvcGenerator(model, opts.max_gen_step, bos, eos, opts.fp16)
    results = decode(loader, generator, toker)
    save_jsonl(results, opts.output)

    # evaluate score if possible
    if (hvd.rank() == 0
            and 'descs' in json.loads(next(iter(open(opts.target_clip))))):
        evaluator = TVCEval(opts.target_clip)
        score = evaluator(results)
        print(score)
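The gating condition above peeks at the first line of the target-clip jsonl to see whether ground-truth captions are present; an equivalent, more explicit form (behavior assumed identical):

with open(opts.target_clip) as f:
    first_example = json.loads(f.readline())
has_ground_truth = 'descs' in first_example  # only then can TVCEval score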
Example #5
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    else:
        raise ValueError('Found existing DB. Please explicitly remove it '
                         'for re-processing')
    meta = vars(opts)
    meta['tokenizer'] = opts.toker
    toker = RobertaTokenizer.from_pretrained(opts.toker)
    tokenizer = roberta_tokenize(toker)
    meta['BOS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['EOS'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['SEP'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['CLS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['PAD'] = toker.convert_tokens_to_ids(['<pad>'])[0]
    meta['MASK'] = toker.convert_tokens_to_ids(['<mask>'])[0]
    meta['UNK'] = toker.convert_tokens_to_ids(['<unk>'])[0]
    meta['v_range'] = (toker.convert_tokens_to_ids(['.'])[0],
                       toker.convert_tokens_to_ids(['<|endoftext|>'])[0]+1)
    save_json(meta, f'{opts.output}/meta.json', save_pretty=True)  # meta is vars(opts)

    open_db = curry(open_lmdb, opts.output, readonly=False)
    with open_db() as db:
        with open(opts.annotation, "r") as ann:
            if opts.task == "tvr":
                id2lens, query2video, query_data = process_tvr(
                    ann, db, tokenizer)
            elif opts.task == "tvqa":
                id2lens, query2video, query_data = process_tvqa(
                    ann, db, tokenizer)
            elif opts.task == "violin":
                id2lens, query2video, query_data = process_violin(
                    ann, db, tokenizer)
            else:
                raise NotImplementedError(
                    f"prepro for {opts.task} not implemented")

    save_json(id2lens, f'{opts.output}/id2len.json')
    save_json(query2video, f'{opts.output}/query2video.json')
    save_jsonl(query_data, f'{opts.output}/query_data.jsonl')
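RobertaTokenizer also exposes these special-token ids as attributes; an equivalent construction (same ids for the roberta vocabulary):

meta['BOS'] = toker.bos_token_id    # <s>
meta['EOS'] = toker.eos_token_id    # </s>
meta['SEP'] = toker.sep_token_id    # </s> (roberta reuses </s> as SEP)
meta['CLS'] = toker.cls_token_id    # <s>  (and <s> as CLS)
meta['PAD'] = toker.pad_token_id    # <pad>
meta['MASK'] = toker.mask_token_id  # <mask>
meta['UNK'] = toker.unk_token_id    # <unk>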
Example #6
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))

    if hvd.rank() != 0:
        LOGGER.disabled = True

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)
    opts.task = 'tvc'

    LOGGER.info(f"Loading the whole video dataset {opts.sub_txt_db}, "
                f"{opts.vfeat_db}")
    video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                      opts.vfeat_interval, opts)

    # data loaders
    # train
    LOGGER.info(f"Loading train dataset {opts.train_db}")
    train_cap = CaptionTokLmdb(opts.train_db, opts.max_txt_len)
    train_dset = TvcTrainDataset(video_db, train_cap, opts.max_cap_per_vid)
    LOGGER.info(f"{sum(all_gather_list(len(train_dset)))} samples loaded")
    train_loader = build_dataloader(train_dset, opts.train_batch_size,
                                    TvcTrainDataset.collate, True, opts)

    # val
    LOGGER.info(f"Loading val dataset {opts.val_db}")
    val_cap = CaptionTokLmdb(opts.val_db, -1)
    val_dset = TvcValDataset(video_db, val_cap, -1)
    val_loader = build_dataloader(val_dset, opts.val_batch_size,
                                  TvcValDataset.collate, False, opts)
    if hvd.rank() == 0:
        evaluator = TVCEval(opts.val_ref)
    else:
        evaluator = NoOp()

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}

    img_pos_embed_weight_key = "v_encoder.f_encoder.img_embeddings" +\
        ".position_embeddings.weight"
    if img_pos_embed_weight_key in checkpoint:
        max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])
    else:
        max_frm_seq_len = MAX_FRM_SEQ_LEN

    model = HeroForTvc.from_pretrained(opts.model_config,
                                       state_dict=checkpoint,
                                       vfeat_dim=VFEAT_DIM,
                                       max_frm_seq_len=max_frm_seq_len,
                                       lsr=opts.lsr)

    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      enabled=opts.fp16,
                                      opt_level='O2')

    # assumes roberta tokenizer only
    if hvd.local_rank() == 0:
        # quick hack to prevent multi-process download collision
        toker = RobertaTokenizer.from_pretrained('roberta-base')
        all_gather_list(None)
    else:
        all_gather_list(None)
        toker = RobertaTokenizer.from_pretrained('roberta-base')
    bos = toker.convert_tokens_to_ids(['<s>'])[0]
    eos = toker.convert_tokens_to_ids(['</s>'])[0]
    generator = TvcGenerator(model, opts.max_gen_step, bos, eos, opts.fp16)

    global_step = 0
    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        os.makedirs(join(opts.output_dir, 'results'))  # store val predictions
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    train_loss = RunningMeter('loss')
    n_vid = 0
    n_cap = 0
    n_epoch = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    model.train()
    while True:
        for step, batch in enumerate(train_loader):
            n_vid += opts.train_batch_size
            n_cap += batch['cap_input_ids'].size(0)

            loss = model(batch, compute_loss=True)
            loss = loss.mean()
            train_loss(loss.item())

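            # unscale & all-reduce only on real update steps; on accumulation
            # steps the scaled gradients simply accumulate locally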
            delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
            with amp.scale_loss(loss, optimizer,
                                delay_unscale=delay_unscale) as scaled_loss:
                scaled_loss.backward()
                if not delay_unscale:
                    # gather gradients from every process
                    # do this before unscaling to make sure every process uses
                    # the same gradient scale
                    grads = [
                        p.grad.data for p in model.parameters()
                        if p.requires_grad and p.grad is not None
                    ]
                    all_reduce_and_rescale_tensors(grads, float(1))

            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1

                # learning rate scheduling
                lr_this_step = get_lr_sched(global_step, opts)
                for i, param_group in enumerate(optimizer.param_groups):
                    if i == 0 or i == 1:
                        param_group['lr'] = lr_this_step * opts.lr_mul
                    elif i == 2 or i == 3:
                        param_group['lr'] = lr_this_step
                    else:
                        raise ValueError()
                TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

                # log loss
                TB_LOGGER.add_scalar(train_loss.name, train_loss.val,
                                     global_step)
                TB_LOGGER.step()

                # update model params
                if opts.grad_norm != -1:
                    grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
                optimizer.step()
                optimizer.zero_grad()
                pbar.update(1)

                if global_step % 100 == 0:
                    # monitor training throughput
                    LOGGER.info('-------------------------------------------')
                    LOGGER.info(f'Step {global_step}:')
                    tot_vid = sum(all_gather_list(n_vid))
                    vid_per_sec = int(tot_vid / (time() - start))
                    LOGGER.info(f'{tot_vid} videos trained at '
                                f'{vid_per_sec} vid/s')
                    tot_cap = sum(all_gather_list(n_cap))
                    cap_per_sec = int(tot_cap / (time() - start))
                    TB_LOGGER.add_scalar('perf/vid_per_s', vid_per_sec,
                                         global_step)
                    TB_LOGGER.add_scalar('perf/cap_per_s', cap_per_sec,
                                         global_step)

                if global_step % opts.valid_steps == 0:
                    LOGGER.info('===========================================')
                    LOGGER.info(f"Step {global_step}: start validation")
                    val_log, results = validate(val_loader, generator, toker,
                                                evaluator)
                    if hvd.rank() == 0:
                        save_jsonl(
                            results, f"{opts.output_dir}/results/"
                            f"results_{global_step}.jsonl")
                    TB_LOGGER.log_scaler_dict(val_log)
                    LOGGER.info('===========================================')
                    model_saver.save(model, global_step)
            if global_step >= opts.num_train_steps:
                break
        n_epoch += 1
        LOGGER.info(f"finished {n_epoch} epochs")
        if global_step >= opts.num_train_steps:
            break

    LOGGER.info('===========================================')
    if global_step % opts.valid_steps != 0:
        val_log, results = validate(val_loader, generator, toker, evaluator)
        if hvd.rank() == 0:
            save_jsonl(
                results, f"{opts.output_dir}/results/"
                f"results_{global_step}.jsonl")
        TB_LOGGER.log_scaler_dict(val_log)
        model_saver.save(model, global_step)
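Two small utilities used above are defined elsewhere in the project. `NoOp` swallows every call on non-root ranks, and `get_lr_sched` is commonly linear warmup followed by linear decay; sketches under those assumptions (`opts.warmup_steps` is an assumed field):

class NoOp:
    """Absorbs any method call, so rank != 0 code paths stay branch-free."""
    def __getattr__(self, name):
        return lambda *args, **kwargs: None

def get_lr_sched(global_step, opts):
    # linear warmup to opts.learning_rate, then linear decay to 0
    if global_step < opts.warmup_steps:
        return opts.learning_rate * global_step / opts.warmup_steps
    remaining = (opts.num_train_steps - global_step) \
        / max(1, opts.num_train_steps - opts.warmup_steps)
    return max(0.0, opts.learning_rate * remaining)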
Example #7
def main():
    parser = argparse.ArgumentParser(description="translate.py")

    parser.add_argument("-eval_split_name", choices=["val", "test_public"])
    parser.add_argument("-eval_path", type=str, help="Path to eval data")
    parser.add_argument("-reference_path",
                        type=str,
                        default=None,
                        help="Path to reference")
    parser.add_argument("-res_dir",
                        required=True,
                        help="path to dir containing model .pt file")
    parser.add_argument("-batch_size",
                        type=int,
                        default=100,
                        help="batch size")

    # beam search configs
    parser.add_argument("-use_beam",
                        action="store_true",
                        help="use beam search, otherwise greedy search")
    parser.add_argument("-beam_size", type=int, default=2, help="beam size")
    parser.add_argument("-n_best",
                        type=int,
                        default=1,
                        help="stop searching when get n_best from beam search")
    parser.add_argument("-min_sen_len",
                        type=int,
                        default=8,
                        help="minimum length of the decoded sentences")
    parser.add_argument("-max_sen_len",
                        type=int,
                        default=25,
                        help="maximum length of the decoded sentences")
    parser.add_argument("-block_ngram_repeat",
                        type=int,
                        default=0,
                        help="block repetition of ngrams during decoding.")
    parser.add_argument("-length_penalty_name",
                        default="none",
                        choices=["none", "wu", "avg"],
                        help="length penalty to use.")
    parser.add_argument(
        "-length_penalty_alpha",
        type=float,
        default=0.,
        help="Google NMT length penalty parameter (higher = longer generation)"
    )

    parser.add_argument("-no_cuda", action="store_true")
    parser.add_argument("-seed", default=2019, type=int)
    parser.add_argument("-debug", action="store_true")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # random seed
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    checkpoint = torch.load(os.path.join(opt.res_dir, "model.chkpt"))

    decoding_strategy = "beam{}_lp_{}_la_{}".format(
        opt.beam_size, opt.length_penalty_name,
        opt.length_penalty_alpha) if opt.use_beam else "greedy"
    save_json(vars(opt),
              os.path.join(opt.res_dir,
                           "{}_eval_cfg.json".format(decoding_strategy)),
              save_pretty=True)

    # add some of the train configs
    train_opt = checkpoint["opt"]
    for k in train_opt.__dict__:
        if k not in opt.__dict__:
            setattr(opt, k, getattr(train_opt, k))

    if "ctx_mode" not in opt:
        opt.ctx_mode = "video_sub"  # temp hack, since the first experiment does not have such a setting

    eval_data_loader = get_data_loader(opt)

    # setup model
    translator = Translator(opt, checkpoint)

    pred_file = os.path.join(
        opt.res_dir, "{}_pred_{}.jsonl".format(decoding_strategy,
                                               opt.eval_split_name))
    pred_file = os.path.abspath(pred_file)
    if not os.path.exists(pred_file):
        json_res = run_translate(eval_data_loader, translator, opt=opt)
        save_jsonl(json_res, pred_file)
    else:
        print("Using existing prediction file at {}".format(pred_file))

    if opt.reference_path:
        # COCO language evaluation
        reference_path = os.path.abspath(opt.reference_path)
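        # note: pred_file ends in ".jsonl", so this substring replace produces
        # "*_lang_metrics.jsonl" (not a ".json" file)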
        metrics_path = pred_file.replace(".json", "_lang_metrics.json")
        eval_cmd = [
            "python", "evaluate.py", "-s", pred_file, "-o", metrics_path, "-r",
            reference_path
        ]
        subprocess.call(eval_cmd, cwd="standalone_eval")

    print("[Info] Finished {}.".format(opt.eval_split_name))