Example #1
    def __init__(self, config_path, model_path, model_type):
        print(config_path)
        print(model_path)
        print(model_type)
        self.model_type = model_type
        configs = prepare_configs(config_path)

        data_configs = configs['data_configs']
        model_configs = configs['model_configs']

        vocab_src = Vocabulary.build_from_file(
            **data_configs['vocabularies'][0])
        vocab_tgt = Vocabulary.build_from_file(
            **data_configs['vocabularies'][1])

        nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                                n_tgt_vocab=vocab_tgt.max_n_words,
                                padding_idx=vocab_src.pad,
                                **model_configs)

        params = load_model_parameters(model_path, map_location="cpu")
        nmt_model.load_state_dict(params)
        nmt_model.cuda()
        nmt_model.eval()

        self.model = nmt_model
        self.data_configs = data_configs
        self.model_configs = model_configs
        self.vocab_src = vocab_src
        self.vocab_tgt = vocab_tgt
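
A minimal, self-contained sketch of the same load-then-eval pattern, using a toy torch.nn module in place of the model returned by build_model (the module, path, and names below are illustrative assumptions, not taken from the project):

import torch
import torch.nn as nn

# Toy stand-in for the NMT model that build_model would return.
toy_model = nn.Sequential(nn.Embedding(100, 16), nn.Linear(16, 100))

# Save a checkpoint, then reload it the way the wrapper does:
# load parameters onto CPU first, then move to GPU (if available) and switch to eval mode.
torch.save(toy_model.state_dict(), "/tmp/toy_params.pt")
params = torch.load("/tmp/toy_params.pt", map_location="cpu")
toy_model.load_state_dict(params)
if torch.cuda.is_available():
    toy_model.cuda()
toy_model.eval()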
Example #2
def get_avg_UNK_dist(config_path, model_path, batch_size=50, reload=True):
    """
    get average UNK distance
    :param config_path: the configuration to victim model for embedding construction
    :param model_path: the model parameter path
    :return:
    """
    # load configs
    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)
    data_configs = configs["data_configs"]
    model_configs = configs["model_configs"]
    # load vocabulary file
    src_vocab = Vocabulary(**data_configs["vocabularies"][0])

    # load embedding from model
    emb = nn.Embedding(num_embeddings=src_vocab.max_n_words,
                       embedding_dim=model_configs["d_word_vec"],
                       padding_idx=PAD)
    model_params = torch.load(model_path, map_location="cpu")
    emb.load_state_dict(
        {
            "weight":
            model_params["model"]["encoder.embeddings.embeddings.weight"]
        },
        strict=True)
    # len_mat = torch.sum(emb.weight**2, dim=1)**0.5  # length of the embeddings

    # get max range of UNK to any of the embeddings.
    subtraction = emb.weight - emb.weight[UNK]
    # len_mat = torch.sum(subtraction**2, dim=1)**0.5

    return torch.abs(subtraction).max()
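
As a self-contained sketch of the computation above, the snippet below builds a random embedding table and contrasts the value the function returns (largest absolute coordinate-wise difference to the UNK row) with the commented-out per-row Euclidean distance. The PAD and UNK indices are assumed here; in the project they come from the vocabulary constants:

import torch
import torch.nn as nn

PAD, UNK = 0, 1  # assumed special-token indices

emb = nn.Embedding(num_embeddings=1000, embedding_dim=512, padding_idx=PAD)

# Difference between every embedding row and the UNK row.
subtraction = emb.weight - emb.weight[UNK]

# What get_avg_UNK_dist returns: the largest absolute coordinate-wise difference.
max_coord_dist = torch.abs(subtraction).max()

# The commented-out alternative: Euclidean distance from each row to the UNK row.
euclid_dist = torch.sum(subtraction ** 2, dim=1) ** 0.5

print(max_coord_dist.item(), euclid_dist.max().item())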
Example #3
    def __init__(self, vocab_file, corpus_dir, video_path, phase, DEBUG=False):
        """
        :param phase:  'train', 'dev', 'test'
        """
        self.vocab_file = vocab_file
        self.image_type = 'png'
        self.max_video_len = 300
        self.corpus_dir = corpus_dir
        self.video_path = video_path
        self.phase = phase
        self.sample = True
        self.input_shape = 112

        self.alignment = {}
        self.vocab = Vocabulary(self.vocab_file)

        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

        self.transform = transforms.Compose([
            transforms.Resize((128, 128)),
            transforms.RandomCrop(self.input_shape),
            transforms.ToTensor(),
            normalize,
        ])
        self.test_transform = transforms.Compose([
            transforms.Resize((128, 128)),
            transforms.CenterCrop(self.input_shape),
            transforms.ToTensor(),
            normalize,
        ])

        self.phoenix_dataset = self.load_video_list()
        self.data_dict = self.phoenix_dataset[phase]
        if DEBUG:
            self.data_dict = self.data_dict[:101]

        logging.info('[DATASET: {:s}]: total {:d} samples.'.format(
            phase, len(self.data_dict)))
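
A small sketch of what the two transform pipelines do to a single frame; the dummy PIL image stands in for a frame read from video_path (its size is an arbitrary assumption):

import numpy as np
from PIL import Image
from torchvision import transforms

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
train_transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.RandomCrop(112),   # random 112x112 crop for training
    transforms.ToTensor(),
    normalize,
])
test_transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.CenterCrop(112),   # deterministic center crop for evaluation
    transforms.ToTensor(),
    normalize,
])

frame = Image.fromarray(np.zeros((160, 210, 3), dtype=np.uint8))  # dummy RGB frame
print(train_transform(frame).shape, test_transform(frame).shape)  # both: torch.Size([3, 112, 112])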
Example #4
def train(config_path, model_path, model_type, src_filename, trg_filename):
    """
    flags:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """

    # ================================================================================== #
    # Initialization for training on different devices
    # - CPU/GPU
    # - Single/Distributed
    Constants.USE_GPU = True
    print(config_path)
    print(model_path)
    print(model_type)

    world_size = 1
    rank = 0
    local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # ================================================================================== #
    # Parsing configuration files
    # - Load default settings
    # - Load pre-defined settings
    # - Load user-defined settings

    configs = prepare_configs(config_path)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    training_configs = configs['training_configs']

    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']
    set_seed(Constants.SEED)
    timer = Timer()

    # ================================================================================== #
    # Load Data

    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])
    vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    valid_bitext_dataset = ZipDataset(
        TextLineDataset(
            data_path=src_filename,
            vocabulary=vocab_src,
            max_len=100,
            is_train_dataset=False,
        ),
        TextLineDataset(
            data_path=trg_filename,
            vocabulary=vocab_tgt,
            is_train_dataset=False,
            max_len=100,
        ))

    valid_iterator = DataIterator(dataset=valid_bitext_dataset,
                                  batch_size=20,
                                  use_bucket=training_configs['use_bucket'],
                                  buffer_size=training_configs['buffer_size'],
                                  numbering=True,
                                  world_size=world_size,
                                  rank=rank)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We will do the steps below one after another
    #     1. build models & criterion
    #     2. move models & criterion to gpu if needed
    #     3. load pre-trained model if needed
    #     4. build optimizer
    #     5. build learning rate scheduler if needed
    #     6. load checkpoints if needed

    # 0. Initial

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            **model_configs)
    INFO(nmt_model)

    # 2. Move to GPU
    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model,
                          model_path,
                          device=Constants.CURRENT_DEVICE)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================================================================== #
    # Prepare training

    sent_per_sec_meter = TimeMeter()
    tok_per_sec_meter = TimeMeter()

    grad_denom = 0
    train_loss = 0.0
    cum_n_words = 0
    valid_loss = best_valid_loss = float('inf')

    sent_per_sec_meter.start()
    tok_per_sec_meter.start()

    INFO('Begin training...')
    eidx = 0
    uidx = 0
    score_result = dict()

    # Build iterator and progress bar
    training_iter = valid_iterator.build_generator()

    training_progress_bar = tqdm(desc=' - (Epc {}, Upd {}) '.format(
        eidx, uidx),
                                 total=len(valid_iterator),
                                 unit="sents")

    for batch in training_iter:
        seqs_numbers, seqs_x, seqs_y = batch

        batch_size = len(seqs_x)
        cum_n_words += sum(len(s) for s in seqs_y)

        try:
            # Prepare data
            x, y = prepare_data(seqs_x, seqs_y, cuda=Constants.USE_GPU)

            y_inp = y[:, :-1].contiguous()
            y_label = y[:, 1:].contiguous()  # [batch_size, seq_len]
            log_probs = nmt_model(
                x, y_inp, log_probs=True)  # [batch_size, seq_len, vocab_size]

            _, seq_len = y_label.shape
            log_probs = log_probs.view(-1, vocab_tgt.max_n_words)
            y_label = y_label.view(-1)
            loss = F.nll_loss(log_probs,
                              y_label,
                              reduction='none',
                              ignore_index=vocab_tgt.pad)
            loss = loss.view(batch_size, seq_len)
            loss = loss.sum(-1)

            y_label = y_label.view(batch_size, seq_len)
            valid_token = (y_label != vocab_tgt.pad).sum(-1)
            loss = loss.double().div(valid_token.double())
            for seq_num, l in zip(seqs_numbers, loss):
                assert seq_num not in score_result
                score_result.update({seq_num: l.item()})

            uidx += 1
            grad_denom += batch_size

        except RuntimeError as e:
            if 'out of memory' in str(e):
                print('| WARNING: ran out of memory, skipping batch')
            else:
                raise e

        if training_progress_bar is not None:
            training_progress_bar.update(batch_size)
            training_progress_bar.set_description(
                ' - (Epc {}, Upd {}) '.format(eidx, uidx))

            postfix_str = 'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f}), '.format(
                train_loss, valid_loss, best_valid_loss)
            training_progress_bar.set_postfix_str(postfix_str)

    training_progress_bar.close()
    return score_result
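
The core of the scoring loop is the per-sentence, length-normalized negative log-likelihood. A self-contained sketch with toy shapes (all values below are made up):

import torch
import torch.nn.functional as F

batch_size, seq_len, vocab_size, pad_idx = 2, 5, 11, 0

log_probs = F.log_softmax(torch.randn(batch_size, seq_len, vocab_size), dim=-1)
y_label = torch.randint(1, vocab_size, (batch_size, seq_len))
y_label[1, 3:] = pad_idx  # pretend the second sentence is shorter

# Token-level NLL with padding ignored, summed per sentence and
# normalized by the number of real (non-pad) tokens.
loss = F.nll_loss(log_probs.view(-1, vocab_size), y_label.view(-1),
                  reduction='none', ignore_index=pad_idx)
loss = loss.view(batch_size, seq_len).sum(-1)
valid_tokens = (y_label != pad_idx).sum(-1)
per_sentence_nll = loss / valid_tokens.float()
print(per_sentence_nll)  # one score per sentence, as stored in score_result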
Example #5
def test_data(flags):
    Constants.USE_GPU = flags.use_gpu

    world_size = 1
    rank = 0
    local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # ================================================================================== #
    # Parsing configuration files
    # - Load default settings
    # - Load pre-defined settings
    # - Load user-defined settings

    configs = prepare_configs(flags.config_path, flags.predefined_config)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    training_configs = configs['training_configs']
    bt_configs = configs['bt_configs'] if 'bt_configs' in configs else None
    if bt_configs is not None:
        print("btconfigs ", bt_configs)
        if 'bt_attribute_data' not in bt_configs:
            Constants.USE_BT = False
            bt_configs = None
        else:
            Constants.USE_BT = True
            Constants.USE_BTTAG = bt_configs['use_bttag']
            Constants.USE_CONFIDENCE = bt_configs['use_confidence']
    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']

    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])
    vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    valid_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['valid_data'][0],
                        vocabulary=vocab_src,
                        is_train_dataset=False,
                        ),
        TextLineDataset(data_path=data_configs['valid_data'][1],
                        vocabulary=vocab_tgt,
                        is_train_dataset=False
                        )
    )

    valid_iterator = DataIterator(dataset=valid_bitext_dataset,
                                  batch_size=training_configs['valid_batch_size'],
                                  use_bucket=True, buffer_size=100000, numbering=True,
                                  world_size=world_size, rank=rank, shuffle=False)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We will do the steps below one after another
    #     1. build models & criterion
    #     2. move models & criterion to gpu if needed
    #     3. load pre-trained model if needed
    #     4. build optimizer
    #     5. build learning rate scheduler if needed
    #     6. load checkpoints if needed

    # 0. Initial
    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words, padding_idx=vocab_src.pad, vocab_src=vocab_src,
                            vocab_tgt=vocab_tgt,
                            **model_configs)
    INFO(nmt_model)

    # 2. Move to GPU
    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model, flags.pretrain_path, exclude_prefix=flags.pretrain_exclude_prefix,
                          device=Constants.CURRENT_DEVICE)
    nmt_model = nmt_model.encoder
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Begin training...')
    # compute a mean-pooled representation of every sentence in the training set
    nmt_model.eval()

    # compute a mean-pooled representation of every sentence in the test set
    valid_iter = valid_iterator.build_generator()
    all_seq_numbers = []

    all_mean_encoder_hidden = None
    for batch in valid_iter:
        bt_attrib = None
        seq_numbers, seqs_x, seqs_y = batch
        x = prepare_data(seqs_x, seqs_y=None, cuda=Constants.USE_GPU, bt_attrib=bt_attrib)
        try:
            with torch.no_grad():
                encoder_hidden, mask = nmt_model(x)
        except RuntimeError as e:
            if 'out of memory' in str(e):
                print('| WARNING: ran out of memory, skipping batch')
                continue  # skip this batch so hidden states stay aligned with sequence numbers
            else:
                raise e
        all_seq_numbers.extend(seq_numbers)

        valid_hidden = (mask == False).float().to(encoder_hidden.device)
        sum_encoder_hidden = (encoder_hidden * valid_hidden.unsqueeze(-1)).sum(dim=1)
        valid_tokens = (mask == False).sum(-1)
        mean_encoder_hidden = sum_encoder_hidden.float().div(valid_tokens.unsqueeze(1))

        if all_mean_encoder_hidden is None:
            all_mean_encoder_hidden = mean_encoder_hidden
        else:
            all_mean_encoder_hidden = torch.cat((all_mean_encoder_hidden, mean_encoder_hidden), dim=0)
    return all_mean_encoder_hidden, all_seq_numbers
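
The pooling step above is a masked mean over encoder states. A self-contained sketch with toy tensors (the shapes are assumptions):

import torch

batch_size, seq_len, hidden = 2, 6, 8
encoder_hidden = torch.randn(batch_size, seq_len, hidden)
# True marks padded positions, as in the mask returned by the encoder.
mask = torch.tensor([[False] * 6,
                     [False] * 4 + [True] * 2])

valid = (~mask).float()                                     # 1.0 for real tokens, 0.0 for padding
summed = (encoder_hidden * valid.unsqueeze(-1)).sum(dim=1)  # zero out padded states, sum over time
mean_encoder_hidden = summed / valid.sum(-1, keepdim=True)  # divide by the number of real tokens
print(mean_encoder_hidden.shape)  # torch.Size([2, 8])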
Example #6
def train2(flags):
    """
    flags:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """

    # ================================================================================== #
    # Initialization for training on different devices
    # - CPU/GPU
    # - Single/Distributed
    Constants.USE_GPU = flags.use_gpu

    world_size = 1
    rank = 0
    local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # ================================================================================== #
    # Parsing configuration files
    # - Load default settings
    # - Load pre-defined settings
    # - Load user-defined settings

    configs = prepare_configs(flags.config_path, flags.predefined_config)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    training_configs = configs['training_configs']
    bt_configs = configs['bt_configs'] if 'bt_configs' in configs else None
    if bt_configs is not None:
        print("btconfigs ", bt_configs)
        if 'bt_attribute_data' not in bt_configs:
            Constants.USE_BT = False
            bt_configs = None
        else:
            Constants.USE_BT = True
            Constants.USE_BTTAG = bt_configs['use_bttag']
            Constants.USE_CONFIDENCE = bt_configs['use_confidence']
    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']

    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])
    vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    # bt tag dataset
    if Constants.USE_BT:
        if Constants.USE_BTTAG:
            Constants.BTTAG = vocab_src.bttag
        train_bitext_dataset = ZipDataset(
            TextLineDataset(data_path=data_configs['train_data'][0],
                            vocabulary=vocab_src,
                            max_len=data_configs['max_len'][0],
                            is_train_dataset=True
                            ),
            TextLineDataset(data_path=data_configs['train_data'][1],
                            vocabulary=vocab_tgt,
                            max_len=data_configs['max_len'][1],
                            is_train_dataset=True
                            ),
            AttributeDataset(data_path=bt_configs['bt_attribute_data'], is_train_dataset=True)
        )
    else:
        train_bitext_dataset = ZipDataset(
            TextLineDataset(data_path=data_configs['train_data'][0],
                            vocabulary=vocab_src,
                            max_len=data_configs['max_len'][0],
                            is_train_dataset=True
                            ),
            TextLineDataset(data_path=data_configs['train_data'][1],
                            vocabulary=vocab_tgt,
                            max_len=data_configs['max_len'][1],
                            is_train_dataset=True
                            )
        )

    training_iterator = DataIterator(dataset=train_bitext_dataset,
                                     batch_size=training_configs["batch_size"],
                                     use_bucket=training_configs['use_bucket'],
                                     buffer_size=training_configs['buffer_size'],
                                     batching_func=training_configs['batching_key'],
                                     world_size=world_size, numbering=True,
                                     rank=rank)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We will do the steps below one after another
    #     1. build models & criterion
    #     2. move models & criterion to gpu if needed
    #     3. load pre-trained model if needed
    #     4. build optimizer
    #     5. build learning rate scheduler if needed
    #     6. load checkpoints if needed

    # 0. Initial
    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words, padding_idx=vocab_src.pad, vocab_src=vocab_src,
                            vocab_tgt=vocab_tgt,
                            **model_configs)
    INFO(nmt_model)

    # 2. Move to GPU
    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model, flags.pretrain_path, exclude_prefix=flags.pretrain_exclude_prefix,
                          device=Constants.CURRENT_DEVICE)
    nmt_model = nmt_model.encoder
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Begin training...')

    # compute a mean-pooled representation of every sentence in the training set
    training_iter = training_iterator.build_generator()
    nmt_model.eval()

    all_seq_numbers = []
    encoder_filename = "/home/wangdq/encoder.mean.output"
    seq_numbers_filename = '/home/wangdq/seq_numbers.output'

    processed = 0

    with open(encoder_filename, 'w') as f_encoder, open(seq_numbers_filename, 'w') as f_seq_numbers:

        for batch in training_iter:
            bt_attrib = None
            # bt attrib data
            if Constants.USE_BT:
                seq_numbers, seqs_x, seqs_y, bt_attrib = batch  # seq_numbers are numbered starting from 0
            else:
                seq_numbers, seqs_x, seqs_y = batch

            x = prepare_data(seqs_x, seqs_y=None, cuda=Constants.USE_GPU, bt_attrib=bt_attrib)

            try:
                with torch.no_grad():
                    encoder_hidden, mask = nmt_model(x)
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    continue  # skip this batch instead of reusing stale encoder outputs
                else:
                    raise e

            valid_hidden = (mask == False).float().to(encoder_hidden.device)
            sum_encoder_hidden = (encoder_hidden * valid_hidden.unsqueeze(-1)).sum(dim=1)
            valid_tokens = (mask == False).sum(-1)
            mean_encoder_hidden = sum_encoder_hidden.float().div(valid_tokens.unsqueeze(1))

            all_seq_numbers.extend(seq_numbers)
            # if all_mean_encoder_hidden is None:
            #     all_mean_encoder_hidden = mean_encoder_hidden.cpu()
            # else:
            #     all_mean_encoder_hidden = torch.cat((all_mean_encoder_hidden, mean_encoder_hidden.cpu()), dim=0)

            mean_encoder_list = mean_encoder_hidden.cpu().numpy().tolist()
            content = [[str(i) for i in mean] for mean in mean_encoder_list]
            content = [' '.join(mean) + '\n' for mean in content]
            f_encoder.writelines(content)

            processed += len(seq_numbers)
            print(processed)

        content = [str(i) for i in all_seq_numbers]
        content = ' '.join(content)
        f_seq_numbers.write(content)
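
A short sketch of the text serialization used for the pooled vectors, writing each vector as one whitespace-separated line and reading it back (the file path is hypothetical):

import torch

vectors = torch.randn(3, 4)          # toy mean-pooled sentence representations
path = "/tmp/encoder.mean.output"

with open(path, "w") as f_encoder:
    rows = vectors.cpu().numpy().tolist()
    f_encoder.writelines(" ".join(str(v) for v in row) + "\n" for row in rows)

with open(path) as f:
    restored = torch.tensor([[float(v) for v in line.split()] for line in f])

print(torch.allclose(vectors, restored))  # True: the float-to-string round trip is exact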
Example #7
    def __init__(self,
                 n_src_vocab,
                 n_tgt_vocab,
                 n_layers=6,
                 n_head=8,
                 d_word_vec=512,
                 d_model=512,
                 d_inner_hid=1024,
                 dim_per_head=None,
                 dropout=0.1,
                 tie_input_output_embedding=True,
                 tie_source_target_embedding=False,
                 padding_idx=PAD,
                 layer_norm_first=True,
                 positional_embedding="sin",
                 generator_bias=False,
                 ffn_activation="relu",
                 vocab_src=None,
                 **kwargs):

        super(Transformer_Char, self).__init__()

        self.char_vocab = Vocabulary.build_from_file(**kwargs['char_vocab'])

        self.encoder = Encoder(n_src_vocab,
                               char_src_vocab=self.char_vocab.max_n_words,
                               n_layers=n_layers,
                               n_head=n_head,
                               d_word_vec=d_word_vec,
                               d_model=d_model,
                               d_inner_hid=d_inner_hid,
                               dropout=dropout,
                               dim_per_head=dim_per_head,
                               padding_idx=padding_idx,
                               layer_norm_first=layer_norm_first,
                               positional_embedding=positional_embedding,
                               ffn_activation=ffn_activation)

        self.decoder = Decoder(
            n_tgt_vocab,
            n_layers=n_layers,
            n_head=n_head,
            d_word_vec=d_word_vec,
            d_model=d_model,
            d_inner_hid=d_inner_hid,
            dropout=dropout,
            dim_per_head=dim_per_head,
            padding_idx=padding_idx,
            layer_norm_first=layer_norm_first,
            positional_embedding=positional_embedding,
            ffn_activation=ffn_activation,
        )

        self.dropout = nn.Dropout(dropout)

        assert d_model == d_word_vec, \
            'To facilitate the residual connections, \
             the dimensions of all module output shall be the same.'

        if tie_source_target_embedding:
            assert n_src_vocab == n_tgt_vocab, \
                "source and target vocabulary should have equal size when tying source&target embedding"
            self.encoder.embeddings.embeddings.weight = self.decoder.embeddings.embeddings.weight

        if tie_input_output_embedding:
            self.generator = Generator(
                n_words=n_tgt_vocab,
                hidden_size=d_word_vec,
                shared_weight=self.decoder.embeddings.embeddings.weight,
                padding_idx=PAD,
                add_bias=generator_bias)

        else:
            self.generator = Generator(n_words=n_tgt_vocab,
                                       hidden_size=d_word_vec,
                                       padding_idx=PAD,
                                       add_bias=generator_bias)

        self.bpe_vocab = vocab_src
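
A minimal sketch of the input/output embedding tying that the Generator performs when shared_weight is passed: the output projection reuses the embedding matrix as its weight, so both modules update the same parameters (the module names below are generic, not the project's):

import torch
import torch.nn as nn

vocab_size, d_model = 1000, 512

embeddings = nn.Embedding(vocab_size, d_model)
output_proj = nn.Linear(d_model, vocab_size, bias=False)
output_proj.weight = embeddings.weight   # both modules now share one parameter tensor

hidden = torch.randn(2, 7, d_model)
logits = output_proj(hidden)             # [2, 7, vocab_size]
print(logits.shape, output_proj.weight is embeddings.weight)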
Example #8
def train(flags):
    """
    flags:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """

    # ================================================================================== #
    # Initialization for training on different devices
    # - CPU/GPU
    # - Single/Distributed
    Constants.USE_GPU = flags.use_gpu

    if flags.multi_gpu:
        dist.distributed_init(flags.shared_dir)
        world_size = dist.get_world_size()
        rank = dist.get_rank()
        local_rank = dist.get_local_rank()
    else:
        world_size = 1
        rank = 0
        local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # If not root_rank, close logging
    # else write log of training to file.
    if rank == 0:
        write_log_to_file(
            os.path.join(flags.log_path,
                         "%s.log" % time.strftime("%Y%m%d-%H%M%S")))
    else:
        close_logging()

    # ================================================================================== #
    # Parsing configuration files
    # - Load default settings
    # - Load pre-defined settings
    # - Load user-defined settings

    configs = prepare_configs(flags.config_path, flags.predefined_config)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    INFO(pretty_configs(configs))

    # use odc
    if training_configs['use_odc'] is True:
        ave_best_k = check_odc_config(training_configs)
    else:
        ave_best_k = 0

    Constants.SEED = training_configs['seed']

    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data

    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])
    vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    train_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['train_data'][0],
                        vocabulary=vocab_src,
                        max_len=data_configs['max_len'][0],
                        is_train_dataset=True),
        TextLineDataset(data_path=data_configs['train_data'][1],
                        vocabulary=vocab_tgt,
                        max_len=data_configs['max_len'][1],
                        is_train_dataset=True))

    valid_bitext_dataset = ZipDataset(
        TextLineDataset(
            data_path=data_configs['valid_data'][0],
            vocabulary=vocab_src,
            is_train_dataset=False,
        ),
        TextLineDataset(data_path=data_configs['valid_data'][1],
                        vocabulary=vocab_tgt,
                        is_train_dataset=False))

    training_iterator = DataIterator(
        dataset=train_bitext_dataset,
        batch_size=training_configs["batch_size"],
        use_bucket=training_configs['use_bucket'],
        buffer_size=training_configs['buffer_size'],
        batching_func=training_configs['batching_key'],
        world_size=world_size,
        rank=rank)

    valid_iterator = DataIterator(
        dataset=valid_bitext_dataset,
        batch_size=training_configs['valid_batch_size'],
        use_bucket=True,
        buffer_size=100000,
        numbering=True,
        world_size=world_size,
        rank=rank)

    bleu_scorer = SacreBLEUScorer(
        reference_path=data_configs["bleu_valid_reference"],
        num_refs=data_configs["num_refs"],
        lang_pair=data_configs["lang_pair"],
        sacrebleu_args=training_configs["bleu_valid_configs"]
        ['sacrebleu_args'],
        postprocess=training_configs["bleu_valid_configs"]['postprocess'])

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We will do the steps below one after another
    #     1. build models & criterion
    #     2. move models & criterion to gpu if needed
    #     3. load pre-trained model if needed
    #     4. build optimizer
    #     5. build learning rate scheduler if needed
    #     6. load checkpoints if needed

    # 0. Initial

    lrate = optimizer_configs['learning_rate']
    model_collections = Collections()

    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(
            os.path.join(flags.saveto, flags.model_name)),
        num_max_keeping=training_configs['num_kept_checkpoints'])

    best_model_prefix = os.path.join(
        flags.saveto, flags.model_name + Constants.MY_BEST_MODEL_SUFFIX)

    best_k_saver = BestKSaver(
        save_prefix="{0}.best_k_ckpt".format(
            os.path.join(flags.saveto, flags.model_name)),
        num_max_keeping=training_configs['num_kept_best_k_checkpoints'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            **model_configs)
    INFO(nmt_model)

    # build teacher model
    teacher_model, teacher_model_path = get_teacher_model(
        training_configs, model_configs, vocab_src, vocab_tgt, flags)

    # build critic
    critic = CombinationCriterion(model_configs['loss_configs'],
                                  padding_idx=vocab_tgt.pad,
                                  teacher=teacher_model)
    # INFO(critic)
    critic.INFO()

    # 2. Move to GPU
    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model,
                          flags.pretrain_path,
                          exclude_prefix=None,
                          device=Constants.CURRENT_DEVICE)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 4. Build optimizer
    INFO('Building Optimizer...')

    if not flags.multi_gpu:
        optim = Optimizer(name=optimizer_configs['optimizer'],
                          model=nmt_model,
                          lr=lrate,
                          grad_clip=optimizer_configs['grad_clip'],
                          optim_args=optimizer_configs['optimizer_params'],
                          update_cycle=training_configs['update_cycle'])
    else:
        optim = dist.DistributedOptimizer(
            name=optimizer_configs['optimizer'],
            model=nmt_model,
            lr=lrate,
            grad_clip=optimizer_configs['grad_clip'],
            optim_args=optimizer_configs['optimizer_params'],
            device_id=local_rank)

    # 5. Build scheduler for optimizer if needed
    scheduler = build_scheduler(
        schedule_method=optimizer_configs['schedule_method'],
        optimizer=optim,
        scheduler_configs=optimizer_configs['scheduler_configs'])

    # 6. build moving average
    ma = build_ma(training_configs, nmt_model.named_parameters())

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # Reload from latest checkpoint
    if flags.reload:
        checkpoint_saver.load_latest(model=nmt_model,
                                     optim=optim,
                                     lr_scheduler=scheduler,
                                     collections=model_collections,
                                     ma=ma,
                                     device=Constants.CURRENT_DEVICE)

    # broadcast parameters and optimizer states
    if world_size > 1:
        INFO("Broadcasting model parameters...")
        dist.broadcast_parameters(params=nmt_model.state_dict())
        INFO("Broadcasting optimizer states...")
        dist.broadcast_optimizer_state(optimizer=optim.optim)
        INFO('Done.')

    # ================================================================================== #
    # Prepare training

    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [1])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]
    is_early_stop = model_collections.get_collection("is_early_stop", [
        False,
    ])[-1]
    teacher_patience = model_collections.get_collection(
        "teacher_patience", [training_configs['teacher_patience']])[-1]

    train_loss_meter = AverageMeter()
    train_loss_dict_meter = AverageMeterDict(critic.get_critic_name())
    sent_per_sec_meter = TimeMeter()
    tok_per_sec_meter = TimeMeter()

    update_cycle = training_configs['update_cycle']
    grad_denom = 0
    train_loss = 0.0
    cum_n_words = 0
    train_loss_dict = dict()
    valid_loss = best_valid_loss = float('inf')

    if rank == 0:
        summary_writer = SummaryWriter(log_dir=flags.log_path)
    else:
        summary_writer = None

    sent_per_sec_meter.start()
    tok_per_sec_meter.start()

    INFO('Begin training...')

    while True:

        if summary_writer is not None:
            summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()

        if rank == 0:
            training_progress_bar = tqdm(desc=' - (Epc {}, Upd {}) '.format(
                eidx, uidx),
                                         total=len(training_iterator),
                                         unit="sents")
        else:
            training_progress_bar = None

        for batch in training_iter:

            seqs_x, seqs_y = batch

            batch_size = len(seqs_x)
            cum_n_words += sum(len(s) for s in seqs_y)

            try:
                # Prepare data
                x, y = prepare_data(seqs_x, seqs_y, cuda=Constants.USE_GPU)

                loss, loss_dict = compute_forward(
                    model=nmt_model,
                    critic=critic,
                    seqs_x=x,
                    seqs_y=y,
                    eval=False,
                    normalization=1.0,
                    norm_by_words=training_configs["norm_by_words"])

                update_cycle -= 1
                grad_denom += batch_size
                train_loss += loss
                train_loss_dict = add_dict_value(train_loss_dict, loss_dict)

            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                else:
                    raise e

            # When update_cycle reaches 0, one gradient-accumulation cycle (one parameter update) is complete. Several things will be done:
            # - update parameters
            # - reset update_cycle and grad_denom, update uidx
            # - learning rate scheduling
            # - update moving average

            if update_cycle == 0:

                # 0. reduce variables
                if world_size > 1:
                    grad_denom = dist.all_reduce_py(grad_denom)
                    train_loss = dist.all_reduce_py(train_loss)
                    train_loss_dict = dist.all_reduce_py(train_loss_dict)
                    cum_n_words = dist.all_reduce_py(cum_n_words)

                # 1. update parameters
                optim.step(denom=grad_denom)
                optim.zero_grad()

                if training_progress_bar is not None:
                    training_progress_bar.update(grad_denom)
                    training_progress_bar.set_description(
                        ' - (Epc {}, Upd {}) '.format(eidx, uidx))

                    postfix_str = 'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f}), '.format(
                        train_loss, valid_loss, best_valid_loss)
                    for critic_name, loss_value in train_loss_dict.items():
                        postfix_str += (critic_name +
                                        ': {:.2f}, ').format(loss_value)
                    training_progress_bar.set_postfix_str(postfix_str)

                # 2. learning rate scheduling
                if scheduler is not None and optimizer_configs[
                        "schedule_method"] != "loss":
                    scheduler.step(global_step=uidx)

                # 3. update moving average
                if ma is not None and eidx >= training_configs[
                        'moving_average_start_epoch']:
                    ma.step()

                # 4. update meters
                train_loss_meter.update(train_loss, grad_denom)
                train_loss_dict_meter.update(train_loss_dict, grad_denom)
                sent_per_sec_meter.update(grad_denom)
                tok_per_sec_meter.update(cum_n_words)

                # 5. reset accumulated variables, update uidx
                update_cycle = training_configs['update_cycle']
                grad_denom = 0
                uidx += 1
                cum_n_words = 0.0
                train_loss = 0.0
                train_loss_dict = dict()

            else:
                continue

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(
                    uidx, eidx, every_n_step=training_configs['disp_freq']):

                lrate = list(optim.get_lrate())[0]

                if summary_writer is not None:
                    summary_writer.add_scalar(
                        "Speed(sents/sec)",
                        scalar_value=sent_per_sec_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar(
                        "Speed(words/sec)",
                        scalar_value=tok_per_sec_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar(
                        "train_loss",
                        scalar_value=train_loss_meter.ave,
                        global_step=uidx)
                    # add loss for every critic
                    if flags.display_loss_detail:
                        combination_loss = train_loss_dict_meter.value
                        for key, value in combination_loss.items():
                            summary_writer.add_scalar(key,
                                                      scalar_value=value,
                                                      global_step=uidx)
                    summary_writer.add_scalar("lrate",
                                              scalar_value=lrate,
                                              global_step=uidx)
                    summary_writer.add_scalar("oom_count",
                                              scalar_value=oom_count,
                                              global_step=uidx)

                # Reset Meters
                sent_per_sec_meter.reset()
                tok_per_sec_meter.reset()
                train_loss_meter.reset()
                train_loss_dict_meter.reset()

            # ================================================================================== #
            # Loss Validation & Learning rate annealing
            if should_trigger_by_steps(
                    global_step=uidx,
                    n_epoch=eidx,
                    every_n_step=training_configs['loss_valid_freq'],
                    debug=flags.debug):
                with cache_parameters(nmt_model):

                    valid_loss, valid_loss_dict = loss_evaluation(
                        model=nmt_model,
                        critic=critic,
                        valid_iterator=valid_iterator,
                        rank=rank,
                        world_size=world_size)

                if scheduler is not None and optimizer_configs[
                        "schedule_method"] == "loss":
                    scheduler.step(metric=valid_loss)

                model_collections.add_to_collection("history_losses",
                                                    valid_loss)

                min_history_loss = np.array(
                    model_collections.get_collection("history_losses")).min()
                best_valid_loss = min_history_loss

                if summary_writer is not None:
                    summary_writer.add_scalar("loss",
                                              valid_loss,
                                              global_step=uidx)
                    summary_writer.add_scalar("best_loss",
                                              min_history_loss,
                                              global_step=uidx)

            # ================================================================================== #
            # BLEU Validation & Early Stop
            if should_trigger_by_steps(
                    global_step=uidx,
                    n_epoch=eidx,
                    every_n_step=training_configs['bleu_valid_freq'],
                    min_step=training_configs['bleu_valid_warmup'],
                    debug=flags.debug):

                with cache_parameters(nmt_model):

                    valid_bleu = bleu_evaluation(
                        uidx=uidx,
                        valid_iterator=valid_iterator,
                        batch_size=training_configs["bleu_valid_batch_size"],
                        model=nmt_model,
                        bleu_scorer=bleu_scorer,
                        vocab_src=vocab_src,
                        vocab_tgt=vocab_tgt,
                        valid_dir=flags.valid_path,
                        max_steps=training_configs["bleu_valid_configs"]
                        ["max_steps"],
                        beam_size=training_configs["bleu_valid_configs"]
                        ["beam_size"],
                        alpha=training_configs["bleu_valid_configs"]["alpha"],
                        world_size=world_size,
                        rank=rank,
                    )

                model_collections.add_to_collection(key="history_bleus",
                                                    value=valid_bleu)

                best_valid_bleu = float(
                    np.array(model_collections.get_collection(
                        "history_bleus")).max())

                if summary_writer is not None:
                    summary_writer.add_scalar("bleu", valid_bleu, uidx)
                    summary_writer.add_scalar("best_bleu", best_valid_bleu,
                                              uidx)

                # If model get new best valid bleu score
                if valid_bleu >= best_valid_bleu:
                    bad_count = 0

                    if is_early_stop is False:
                        if rank == 0:
                            # 1. save the best model
                            torch.save(nmt_model.state_dict(),
                                       best_model_prefix + ".final")

                else:
                    bad_count += 1

                    # At least one epoch should be traversed
                    if bad_count >= training_configs[
                            'early_stop_patience'] and eidx > 0:
                        is_early_stop = True
                        WARN("Early Stop!")
                        exit(0)

                if rank == 0:
                    best_k_saver.save(global_step=uidx,
                                      metric=valid_bleu,
                                      model=nmt_model,
                                      optim=optim,
                                      lr_scheduler=scheduler,
                                      collections=model_collections,
                                      ma=ma)

                # ODC
                if training_configs['use_odc'] is True:
                    if valid_bleu >= best_valid_bleu:
                        pass

                        # choose method to generate teachers from checkpoints
                        # - best
                        # - ave_k_best
                        # - ma

                        if training_configs['teacher_choice'] == 'ma':
                            teacher_params = ma.export_ma_params()
                        elif training_configs['teacher_choice'] == 'best':
                            teacher_params = nmt_model.state_dict()
                        elif "ave_best" in training_configs['teacher_choice']:
                            if best_k_saver.num_saved >= ave_best_k:
                                teacher_params = average_checkpoints(
                                    best_k_saver.get_all_ckpt_path()
                                    [-ave_best_k:])
                            else:
                                teacher_params = nmt_model.state_dict()
                        else:
                            raise ValueError(
                                "can not support teacher choice %s" %
                                training_configs['teacher_choice'])
                        torch.save(teacher_params, teacher_model_path)
                        del teacher_params
                        teacher_patience = 0
                        critic.set_use_KD(False)
                    else:
                        teacher_patience += 1
                        if teacher_patience >= training_configs[
                                'teacher_refresh_warmup']:
                            teacher_params = torch.load(
                                teacher_model_path,
                                map_location=Constants.CURRENT_DEVICE)
                            teacher_model.load_state_dict(teacher_params,
                                                          strict=False)
                            del teacher_params
                            critic.reset_teacher(teacher_model)
                            critic.set_use_KD(True)

                if summary_writer is not None:
                    summary_writer.add_scalar("bad_count", bad_count, uidx)

                info_str = "{0} Loss: {1:.2f} BLEU: {2:.2f} lrate: {3:6f} patience: {4} ".format(
                    uidx, valid_loss, valid_bleu, lrate, bad_count)
                for key, value in valid_loss_dict.items():
                    info_str += (key + ': {0:.2f} '.format(value))
                INFO(info_str)

            # ================================================================================== #
            # Saving checkpoints
            if should_trigger_by_steps(
                    uidx,
                    eidx,
                    every_n_step=training_configs['save_freq'],
                    debug=flags.debug):
                model_collections.add_to_collection("uidx", uidx)
                model_collections.add_to_collection("eidx", eidx)
                model_collections.add_to_collection("bad_count", bad_count)
                model_collections.add_to_collection("teacher_patience",
                                                    teacher_patience)
                if not is_early_stop:
                    if rank == 0:
                        checkpoint_saver.save(global_step=uidx,
                                              model=nmt_model,
                                              optim=optim,
                                              lr_scheduler=scheduler,
                                              collections=model_collections,
                                              ma=ma)

        if training_progress_bar is not None:
            training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
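
The update_cycle bookkeeping in the loop above implements gradient accumulation: several batches contribute gradients before a single optimizer step normalized by the accumulated batch size. A self-contained sketch of just that mechanism, with a toy model and random data:

import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

accumulation_steps = 4
update_cycle = accumulation_steps
grad_denom = 0

for step in range(16):
    x, y = torch.randn(8, 10), torch.randn(8, 1)
    loss = F.mse_loss(model(x), y, reduction='sum')
    loss.backward()                      # gradients accumulate across batches
    update_cycle -= 1
    grad_denom += x.size(0)

    if update_cycle == 0:
        for p in model.parameters():     # normalize by the total number of examples
            if p.grad is not None:
                p.grad.div_(grad_denom)
        optimizer.step()
        optimizer.zero_grad()
        update_cycle, grad_denom = accumulation_steps, 0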
Example #9
def main():
    opts = parse_args()
    init_logging(
        os.path.join(opts.log_dir,
                     '{:s}_win0_win4_log_test.txt'.format(opts.task)))

    if torch.cuda.is_available():
        torch.cuda.set_device(opts.gpu)
        logging.info("Using GPU!")
        device = "cuda"
    else:
        logging.info("Using CPU!")
        device = "cpu"

    logging.info(opts)

    test_datasets = PhoenixVideo(opts.vocab_file,
                                 opts.corpus_dir,
                                 opts.video_path,
                                 phase=opts.task,
                                 DEBUG=opts.DEBUG)
    vocab_size = test_datasets.vocab.num_words
    blank_id = test_datasets.vocab.word2index['<BLANK>']
    vocabulary = Vocabulary(opts.vocab_file)
    #     model = DilatedSLRNet(opts, device, vocab_size, vocabulary,
    #                           dilated_channels=512, num_blocks=5, dilations=[1, 2, 4], dropout=0.0)
    model = MainStream(vocab_size)
    criterion = CtcLoss(opts, blank_id, device, reduction="none")
    trainer = Trainer(opts, model, criterion, vocabulary, vocab_size, blank_id)

    # ctcdecode beam-search decoder
    ctc_decoder_vocab = [chr(x) for x in range(20000, 20000 + vocab_size)]
    ctc_decoder = ctcdecode.CTCBeamDecoder(ctc_decoder_vocab,
                                           beam_width=opts.beam_width,
                                           blank_id=blank_id,
                                           num_processes=10)

    if os.path.exists(opts.check_point):
        logging.info("Loading checkpoint file from {}".format(
            opts.check_point))
        epoch, num_updates, loss = trainer.load_checkpoint(opts.check_point)
    else:
        logging.info("No checkpoint file in found in {}".format(
            opts.check_point))
        epoch, num_updates, loss = 0, 0, 0.0

    test_iter = trainer.get_batch_iterator(test_datasets,
                                           batch_size=opts.batch_size,
                                           shuffle=False)
    decoded_dict = {}
    val_err, val_correct, val_count = np.zeros([4]), 0, 0

    with open("Data/output/hypo_ctc.txt",
              "w") as f, open("Data/output/ref_ctc.txt", "w") as f2:
        with torch.no_grad():
            model.eval()
            criterion.eval()
            for samples in tqdm(test_iter):
                samples = trainer._prepare_sample(samples)
                video = samples["data"]
                len_video = samples["len_data"]
                label = samples["label"]
                len_label = samples["len_label"]
                video_id = samples['id']

                logits, _ = model(video, len_video)
                len_video /= 4
                logits = F.softmax(logits, dim=-1)
                pred_seq, _, _, out_seq_len = ctc_decoder.decode(
                    logits, len_video)
                start = 0
                for i, length in enumerate(len_label):
                    end = start + length
                    ref = label[start:end].tolist()
                    hyp = [
                        x[0] for x in groupby(pred_seq[i][0]
                                              [:out_seq_len[i][0]].tolist())
                    ]
                    ref_sent = " ".join(
                        [vocabulary.index2word[r] for r in ref])
                    hyp_sent = " ".join(
                        [vocabulary.index2word[r] for r in hyp])
                    f.write(hyp_sent + "\n")
                    f2.write(ref_sent + "\n")

                    decoded_dict[video_id[i]] = hyp
                    val_correct += int(ref == hyp)
                    err = get_wer_delsubins(ref, hyp)
                    val_err += np.array(err)
                    val_count += 1
                    start = end
                assert end == label.size(0)
            logging.info('-' * 50)
            logging.info('Epoch: {:d}, DEV ACC: {:.5f}, {:d}/{:d}'.format(
                epoch, val_correct / val_count, val_correct, val_count))
            logging.info(
                'Epoch: {:d}, DEV WER: {:.5f}, SUB: {:.5f}, INS: {:.5f}, DEL: {:.5f}'
                .format(epoch, val_err[0] / val_count, val_err[1] / val_count,
                        val_err[2] / val_count, val_err[3] / val_count))

            list_str_for_test = []
            for k, v in decoded_dict.items():
                start_time = 0
                for wi in v:
                    tl = np.random.random() * 0.1
                    list_str_for_test.append('{} 1 {:.3f} {:.3f} {}\n'.format(
                        k, start_time, start_time + tl,
                        test_datasets.vocab.index2word[wi]))
                    start_time += tl
            tmp_prefix = str(uuid.uuid1())
            txt_file = '{:s}.txt'.format(tmp_prefix)
            result_file = os.path.join('evaluation_relaxation', txt_file)
            with open(result_file, 'w') as fid:
                fid.writelines(list_str_for_test)
            phoenix_eval_err = get_phoenix_wer(txt_file, opts.task, tmp_prefix)
            logging.info(
                '[Relaxation Evaluation] Epoch: {:d}, DEV WER: {:.5f}, SUB: {:.5f}, INS: {:.5f}, DEL: {:.5f}'
                .format(epoch, phoenix_eval_err[0], phoenix_eval_err[1],
                        phoenix_eval_err[2], phoenix_eval_err[3]))
            return phoenix_eval_err
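
The hypothesis post-processing above collapses repeated symbols in the beam-search output with itertools.groupby. For comparison, a greedy CTC decode of a raw per-frame argmax path uses the same collapse and additionally drops the blank symbol; a tiny self-contained sketch (the blank id and path below are made up):

from itertools import groupby

blank_id = 0
argmax_path = [0, 3, 3, 0, 0, 5, 5, 5, 0, 2]  # per-frame argmax labels

# Merge runs of identical labels, then remove blanks.
hyp = [k for k, _ in groupby(argmax_path) if k != blank_id]
print(hyp)  # [3, 5, 2]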
Example #10
                                                early_exit=self.early_exit[0],
                                                layers=self.layers_del,
                                                **unused)
        decoder_out = F.linear(features, self.embed_word_del.weight)
        if normalize:
            return F.log_softmax(decoder_out, -1), extra['attn']
        return decoder_out, extra['attn']


def Embedding(num_embeddings, embedding_dim, padding_idx):
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5)
    nn.init.constant_(m.weight[padding_idx], 0)
    return m


if __name__ == "__main__":
    from config import options
    from src.data.vocabulary import Vocabulary

    opts = options.parse_args()
    vocabulary = Vocabulary(opts.vocab_file)

    decoder = LevenshteinTransformerDecoder(opts, vocabulary)

    encoder_out = torch.randn(2, 10, 512)

    out = decoder()

    print(decoder)
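
A quick, self-contained check of what the Embedding helper's initialization produces: normal(0, d**-0.5) weights with a zeroed padding row. The helper is restated here so the sketch runs on its own; padding index 0 is an assumption:

import torch
import torch.nn as nn

def Embedding(num_embeddings, embedding_dim, padding_idx):
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
    nn.init.constant_(m.weight[padding_idx], 0)
    return m

emb = Embedding(100, 512, padding_idx=0)
print(emb.weight.std().item())               # roughly 512 ** -0.5 ≈ 0.044
print(bool(torch.all(emb.weight[0] == 0)))   # True: the padding row is zeroed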
Example #11
def initial_random_perturb(config_path,
                           inputs,
                           w2p, w2vocab,
                           mode="len_based",
                           key_type="token",
                           show_bleu=False):
    """
    batched random perturb, perturb is based on random probability from the collected candidates
    meant to test initial attack rate.
    :param config_path: victim configs
    :param inputs: raw batched input (list) sequences in [batch_size, seq_len]
    :param w2p: indicates how likely a word is perturbed
    :param w2vocab: near candidates
    :param mode: how to distribute probability mass among the candidates in w2vocab
        ("uniform" or "len_based"); a standalone illustration of the len_based weighting
        follows this function
    :param key_type: inputs are given by raw sequences of tokens or tokenized labels
    :param show_bleu: whether to show bleu of perturbed seqs (compare to original seqs)
    :return: list of perturbed inputs and list of perturbed flags
    """
    np.random.seed(int(time.time()))
    assert mode in ["uniform", "len_based"], "Mode must be in uniform or multinomial."
    assert key_type in ["token", "label"], "inputs key type must be token or label."
    # load configs
    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)
    data_configs = configs["data_configs"]

    # load vocabulary file and tokenize
    src_vocab = Vocabulary(**data_configs["vocabularies"][0])
    perturbed_results = []
    flags = []
    for sent in inputs:
        if np.random.uniform() < 0.5:  # perturb the sentence
            perturbed_sent = []
            if key_type == "token":
                tokenized_sent = src_vocab.tokenizer.tokenize(sent)
                for word in tokenized_sent:
                    if np.random.uniform() < w2p[word]:
                        # need to perturb on lexical level
                        if mode == "uniform":
                            # uniform choose from candidates:
                            perturbed_sent += [w2vocab[word][np.random.choice(len(w2vocab[word]),
                                                                              1)[0]]]
                        elif mode == "len_based":
                            # weighted choose from candidates:
                            weights = [1./(1+abs(len(word)-len(c))) for c in w2vocab[word]]
                            norm_weights = [c/sum(weights) for c in weights]
                            perturbed_sent += [w2vocab[word][np.random.choice(len(w2vocab[word]),
                                                                              1,
                                                                              p=norm_weights
                                                                              )[0]]]
                    else:
                        perturbed_sent += [word]
                # print(perturbed_sent)  # yield same form of sequences of tokens
                perturbed_sent = src_vocab.tokenizer.detokenize(perturbed_sent)
            elif key_type == "label":  # tokenized labels
                for word_index in sent:
                    word = src_vocab.id2token(word_index)
                    if np.random.uniform() < w2p[word]:
                        if mode == "uniform":
                            # uniform choose from candidates:
                            perturbed_label = src_vocab.token2id(w2vocab[word][np.random.choice(
                                len(w2vocab[word]), 1
                            )[0]])
                            perturbed_sent += [perturbed_label]
                        elif mode == "len_based":
                            # weighted choose from candidates:
                            weights = [1. / (1 + abs(len(word) - len(c))) for c in w2vocab[word]]
                            norm_weights = [c / sum(weights) for c in weights]
                            perturbed_label = src_vocab.token2id(w2vocab[word][np.random.choice(len(w2vocab[word]),
                                                                                                1,
                                                                                                p=norm_weights
                                                                                                )[0]])
                            perturbed_sent += [perturbed_label]
                    else:
                        perturbed_sent += [word_index]
            perturbed_results += [perturbed_sent]
            flags += [1]
            # out.write(perturbed_sent + "\n")
        else:
            perturbed_results += [sent]
            flags += [0]
    return perturbed_results, flags
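
# Standalone illustration of the "len_based" candidate choice used above: a candidate c
# for word w is drawn with probability proportional to 1 / (1 + |len(w) - len(c)|), so
# candidates of similar length are preferred. The toy words below are only examples.
import numpy as np

def pick_len_based(word, candidates):
    weights = [1.0 / (1 + abs(len(word) - len(c))) for c in candidates]
    probs = [w / sum(weights) for w in weights]
    return candidates[np.random.choice(len(candidates), 1, p=probs)[0]]

# pick_len_based("cat", ["cats", "ca", "caterpillar"]) mostly returns "cats" or "ca".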
Exemplo n.º 12
0
def ensemble_translate(FLAGS):
    GlobalNames.USE_GPU = FLAGS.use_gpu

    config_path = os.path.abspath(FLAGS.config_path)

    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']

    timer = Timer()
    # ================================================================================== #
    # Load Data

    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][1])

    valid_dataset = TextLineDataset(data_path=FLAGS.source_path,
                                    vocabulary=vocab_src)

    valid_iterator = DataIterator(dataset=valid_dataset,
                                  batch_size=FLAGS.batch_size,
                                  use_bucket=True,
                                  buffer_size=100000,
                                  numbering=True)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================================================================== #
    # Build Model & Sampler & Validation
    INFO('Building model...')
    timer.tic()

    nmt_models = []

    model_path = FLAGS.model_path

    for ii in range(len(model_path)):

        nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                                n_tgt_vocab=vocab_tgt.max_n_words,
                                **model_configs)
        nmt_model.eval()
        INFO('Done. Elapsed time {0}'.format(timer.toc()))

        INFO('Reloading model parameters...')
        timer.tic()

        params = load_model_parameters(model_path[ii], map_location="cpu")

        nmt_model.load_state_dict(params)

        if GlobalNames.USE_GPU:
            nmt_model.cuda()

        nmt_models.append(nmt_model)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Begin...')
    result_numbers = []
    result = []
    n_words = 0

    timer.tic()

    infer_progress_bar = tqdm(total=len(valid_iterator),
                              desc=' - (Infer)  ',
                              unit="sents")

    valid_iter = valid_iterator.build_generator()
    for batch in valid_iter:

        numbers, seqs_x = batch

        batch_size_t = len(seqs_x)

        x = prepare_data(seqs_x=seqs_x, cuda=GlobalNames.USE_GPU)

        with torch.no_grad():
            word_ids = ensemble_beam_search(nmt_models=nmt_models,
                                            beam_size=FLAGS.beam_size,
                                            max_steps=FLAGS.max_steps,
                                            src_seqs=x,
                                            alpha=FLAGS.alpha)

        word_ids = word_ids.cpu().numpy().tolist()

        # Append result
        for sent_t in word_ids:
            sent_t = [[wid for wid in line if wid != PAD] for line in sent_t]
            result.append(sent_t)

            n_words += len(sent_t[0])

        result_numbers += numbers

        infer_progress_bar.update(batch_size_t)

    infer_progress_bar.close()

    INFO('Done. Speed: {0:.2f} words/sec'.format(
        n_words / (timer.toc(return_seconds=True))))

    translation = []
    for sent in result:
        samples = []
        for trans in sent:
            sample = []
            for w in trans:
                if w == vocab_tgt.EOS:
                    break
                sample.append(vocab_tgt.id2token(w))
            samples.append(vocab_tgt.tokenizer.detokenize(sample))
        translation.append(samples)

    # resume the ordering
    origin_order = np.argsort(result_numbers).tolist()
    translation = [translation[ii] for ii in origin_order]

    keep_n = FLAGS.beam_size if FLAGS.keep_n <= 0 else min(
        FLAGS.beam_size, FLAGS.keep_n)
    outputs = ['%s.%d' % (FLAGS.saveto, i) for i in range(keep_n)]

    with batch_open(outputs, 'w') as handles:
        for trans in translation:
            for i in range(keep_n):
                if i < len(trans):
                    handles[i].write('%s\n' % trans[i])
                else:
                    handles[i].write('%s\n' % 'eos')
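
# batch_open above is one of the project's I/O helpers; it is assumed to be a context
# manager that opens a list of paths at once and yields the file handles (one per kept
# beam). A minimal sketch of that assumed behavior, for reference only:
import contextlib

@contextlib.contextmanager
def batch_open_sketch(paths, mode='w'):
    handles = [open(p, mode) for p in paths]
    try:
        yield handles
    finally:
        for h in handles:
            h.close()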
Exemplo n.º 13
0
def tune(flags):
    """
    flags:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """

    # ================================================================================== #
    # Initialization for training on different devices
    # - CPU/GPU
    # - Single/Distributed
    Constants.USE_GPU = flags.use_gpu

    if flags.multi_gpu:
        dist.distributed_init(flags.shared_dir)
        world_size = dist.get_world_size()
        rank = dist.get_rank()
        local_rank = dist.get_local_rank()
    else:
        world_size = 1
        rank = 0
        local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # If not root_rank, close logging
    # else write log of training to file.
    if rank == 0:
        write_log_to_file(
            os.path.join(flags.log_path,
                         "%s.log" % time.strftime("%Y%m%d-%H%M%S")))
    else:
        close_logging()

    # ================================================================================== #
    # Parsing configuration files
    # - Load default settings
    # - Load pre-defined settings
    # - Load user-defined settings

    configs = prepare_configs(flags.config_path, flags.predefined_config)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']

    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])
    vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos
    # bt tag dataset
    train_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['train_data'][0],
                        vocabulary=vocab_src,
                        max_len=data_configs['max_len'][0],
                        is_train_dataset=True),
        TextLineDataset(data_path=data_configs['train_data'][1],
                        vocabulary=vocab_tgt,
                        max_len=data_configs['max_len'][1],
                        is_train_dataset=True))

    training_iterator = DataIterator(
        dataset=train_bitext_dataset,
        batch_size=training_configs["batch_size"],
        use_bucket=training_configs['use_bucket'],
        buffer_size=training_configs['buffer_size'],
        batching_func=training_configs['batching_key'],
        world_size=world_size,
        rank=rank)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We will do the steps below one after another
    #     1. build models & criterion
    #     2. move models & criterion to gpu if needed
    #     3. load pre-trained model if needed
    #     4. build optimizer
    #     5. build learning rate scheduler if needed
    #     6. load checkpoints if needed

    # 0. Initial

    lrate = optimizer_configs['learning_rate']
    model_collections = Collections()

    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(
            os.path.join(flags.saveto, flags.model_name)),
        num_max_keeping=training_configs['num_kept_checkpoints'])
    best_model_prefix = os.path.join(
        flags.saveto, flags.model_name + Constants.MY_BEST_MODEL_SUFFIX)
    best_model_saver = Saver(
        save_prefix=best_model_prefix,
        num_max_keeping=training_configs['num_kept_best_model'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            vocab_tgt=vocab_tgt,
                            **model_configs)
    INFO(nmt_model)

    critic = NMTCriterion(label_smoothing=model_configs['label_smoothing'],
                          padding_idx=vocab_tgt.pad)

    INFO(critic)

    # 2. Move to GPU
    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model,
                          flags.pretrain_path,
                          exclude_prefix=flags.pretrain_exclude_prefix,
                          device=Constants.CURRENT_DEVICE)
    # froze_parameters
    froze_params(nmt_model, flags.froze_config)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 4. Build optimizer
    INFO('Building Optimizer...')

    if not flags.multi_gpu:
        optim = Optimizer(name=optimizer_configs['optimizer'],
                          model=nmt_model,
                          lr=lrate,
                          grad_clip=optimizer_configs['grad_clip'],
                          optim_args=optimizer_configs['optimizer_params'],
                          update_cycle=training_configs['update_cycle'])
    else:
        optim = dist.DistributedOptimizer(
            name=optimizer_configs['optimizer'],
            model=nmt_model,
            lr=lrate,
            grad_clip=optimizer_configs['grad_clip'],
            optim_args=optimizer_configs['optimizer_params'],
            device_id=local_rank)

    # 5. Build scheduler for optimizer if needed
    scheduler = build_scheduler(
        schedule_method=optimizer_configs['schedule_method'],
        optimizer=optim,
        scheduler_configs=optimizer_configs['scheduler_configs'])

    # 6. build moving average
    if training_configs['moving_average_method'] is not None:
        ma = MovingAverage(
            moving_average_method=training_configs['moving_average_method'],
            named_params=nmt_model.named_parameters(),
            alpha=training_configs['moving_average_alpha'])
    else:
        ma = None

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # Reload from latest checkpoint
    if flags.reload:
        checkpoint_saver.load_latest(model=nmt_model,
                                     optim=optim,
                                     lr_scheduler=scheduler,
                                     collections=model_collections,
                                     ma=ma,
                                     device=Constants.CURRENT_DEVICE)

    # broadcast parameters and optimizer states
    if world_size > 1:
        INFO("Broadcasting model parameters...")
        dist.broadcast_parameters(params=nmt_model.state_dict())
        INFO("Broadcasting optimizer states...")
        dist.broadcast_optimizer_state(optimizer=optim.optim)
        INFO('Done.')

    # ================================================================================== #
    # Prepare training

    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [1])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]
    is_early_stop = model_collections.get_collection("is_early_stop", [
        False,
    ])[-1]

    train_loss_meter = AverageMeter()
    sent_per_sec_meter = TimeMeter()
    tok_per_sec_meter = TimeMeter()

    update_cycle = training_configs['update_cycle']
    grad_denom = 0
    train_loss = 0.0
    cum_n_words = 0
    valid_loss = best_valid_loss = float('inf')

    if rank == 0:
        summary_writer = SummaryWriter(log_dir=flags.log_path)
    else:
        summary_writer = None

    sent_per_sec_meter.start()
    tok_per_sec_meter.start()

    INFO('Begin training...')

    while True:

        if summary_writer is not None:
            summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()

        if rank == 0:
            training_progress_bar = tqdm(desc=' - (Epc {}, Upd {}) '.format(
                eidx, uidx),
                                         total=len(training_iterator),
                                         unit="sents")
        else:
            training_progress_bar = None
        # INFO(Constants.USE_BT)
        for batch in training_iter:
            # bt attrib data
            seqs_x, seqs_y = batch

            batch_size = len(seqs_x)
            cum_n_words += sum(len(s) for s in seqs_y)

            try:
                # Prepare data
                x, y = prepare_data(seqs_x, seqs_y, cuda=Constants.USE_GPU)

                loss = compute_forward(
                    model=nmt_model,
                    critic=critic,
                    seqs_x=x,
                    seqs_y=y,
                    eval=False,
                    normalization=1.0,
                    norm_by_words=training_configs["norm_by_words"])

                update_cycle -= 1
                grad_denom += batch_size
                train_loss += loss

            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                else:
                    raise e

            # When update_cycle reaches 0, one full (accumulated) batch has been processed.
            # Several things are done then (a standalone sketch of this accumulation
            # pattern follows after this function):
            # - update parameters
            # - reset update_cycle and grad_denom, update uidx
            # - learning rate scheduling
            # - update moving average

            if update_cycle == 0:

                # 0. reduce variables
                if world_size > 1:
                    grad_denom = dist.all_reduce_py(grad_denom)
                    train_loss = dist.all_reduce_py(train_loss)
                    cum_n_words = dist.all_reduce_py(cum_n_words)

                # 1. update parameters
                optim.step(denom=grad_denom)
                optim.zero_grad()

                if training_progress_bar is not None:
                    training_progress_bar.update(grad_denom)
                    training_progress_bar.set_description(
                        ' - (Epc {}, Upd {}) '.format(eidx, uidx))

                    postfix_str = 'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f}), '.format(
                        train_loss, valid_loss, best_valid_loss)
                    training_progress_bar.set_postfix_str(postfix_str)

                # 2. learning rate scheduling
                if scheduler is not None and optimizer_configs[
                        "schedule_method"] != "loss":
                    scheduler.step(global_step=uidx)

                # 3. update moving average
                if ma is not None and eidx >= training_configs[
                        'moving_average_start_epoch']:
                    ma.step()

                # 4. update meters
                train_loss_meter.update(train_loss, grad_denom)
                sent_per_sec_meter.update(grad_denom)
                tok_per_sec_meter.update(cum_n_words)

                # 5. reset accumulated variables, update uidx
                update_cycle = training_configs['update_cycle']
                grad_denom = 0
                uidx += 1
                cum_n_words = 0.0
                train_loss = 0.0

            else:
                continue

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(
                    uidx, eidx, every_n_step=training_configs['disp_freq']):

                lrate = list(optim.get_lrate())[0]

                if summary_writer is not None:
                    summary_writer.add_scalar(
                        "Speed(sents/sec)",
                        scalar_value=sent_per_sec_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar(
                        "Speed(words/sec)",
                        scalar_value=tok_per_sec_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar(
                        "train_loss",
                        scalar_value=train_loss_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar("lrate",
                                              scalar_value=lrate,
                                              global_step=uidx)
                    summary_writer.add_scalar("oom_count",
                                              scalar_value=oom_count,
                                              global_step=uidx)

                # Reset Meters
                sent_per_sec_meter.reset()
                tok_per_sec_meter.reset()
                train_loss_meter.reset()

            # ================================================================================== #
            # Saving checkpoints
            # if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['save_freq'], debug=flags.debug):
            #     model_collections.add_to_collection("uidx", uidx)
            #     model_collections.add_to_collection("eidx", eidx)
            #     model_collections.add_to_collection("bad_count", bad_count)
            #
            #     if not is_early_stop:
            #         if rank == 0:
            #             checkpoint_saver.save(global_step=uidx,
            #                                   model=nmt_model,
            #                                   optim=optim,
            #                                   lr_scheduler=scheduler,
            #                                   collections=model_collections,
            #                                   ma=ma)

        torch.save(nmt_model.state_dict(), best_model_prefix + ".final")

        if training_progress_bar is not None:
            training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
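
# Standalone sketch of the update_cycle / grad_denom gradient-accumulation pattern used
# in the loop above, in plain PyTorch (the project's Optimizer wrapper is assumed to
# divide gradients by the denom passed to step()). Illustration only.
import torch
import torch.nn as nn

def accumulate_and_step_sketch(model, batches, update_cycle=4, lr=1e-3):
    opt = torch.optim.SGD(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss(reduction='sum')
    cycle, denom = update_cycle, 0
    for x, y in batches:
        loss_fn(model(x), y).backward()  # gradients from micro-batches accumulate in .grad
        cycle -= 1
        denom += x.size(0)
        if cycle == 0:
            for p in model.parameters():
                if p.grad is not None:
                    p.grad.div_(denom)   # normalize, as optim.step(denom=grad_denom) is assumed to do
            opt.step()
            opt.zero_grad()
            cycle, denom = update_cycle, 0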
Exemplo n.º 14
0
class PhoenixVideo(Dataset):
    def __init__(self, vocab_file, corpus_dir, video_path, phase, DEBUG=False):
        """
        :param phase:  'train', 'dev', 'test'
        """
        self.vocab_file = vocab_file
        self.image_type = 'png'
        self.max_video_len = 300
        self.corpus_dir = corpus_dir
        self.video_path = video_path
        self.phase = phase
        self.sample = True
        self.input_shape = 112

        self.alignment = {}
        self.vocab = Vocabulary(self.vocab_file)

        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

        self.transform = transforms.Compose([
            transforms.Resize((128, 128)),
            transforms.RandomCrop(self.input_shape),
            transforms.ToTensor(),
            normalize,
        ])
        self.test_transform = transforms.Compose([
            transforms.Resize((128, 128)),
            transforms.CenterCrop(self.input_shape),
            transforms.ToTensor(),
            normalize,
        ])

        self.phoenix_dataset = self.load_video_list()
        self.data_dict = self.phoenix_dataset[phase]
        if DEBUG:
            self.data_dict = self.data_dict[:101]

        logging.info('[DATASET: {:s}]: total {:d} samples.'.format(
            phase, len(self.data_dict)))

    def __len__(self):
        return len(self.data_dict)

    def __getitem__(self, idx):
        cur_vid_info = self.data_dict[idx]
        id = cur_vid_info['id']
        frames_list = self.get_images(cur_vid_info['path'])
        label = cur_vid_info['label']
        data_len = len(frames_list)  # frame number
        sample = {
            'id': id,
            'data': frames_list,
            'label': label,
            "data_len": data_len
        }
        return sample

    def load_video_list(self):
        phoenix_dataset = {}
        outliers = ['13April_2011_Wednesday_tagesschau_default-14']  # '05July_2010_Monday_heute_default-8'
        for task in ['train', 'dev', 'test']:
            if task != self.phase:
                continue
            dataset_path = os.path.join(self.video_path, task)
            corpus = pd.read_csv(os.path.join(self.corpus_dir,
                                              '{:s}.corpus.csv'.format(task)),
                                 sep='|')
            videonames = corpus['folder'].values
            annotation = corpus['annotation'].values
            ids = corpus['id'].values
            num_sample = len(ids)
            video_infos = []
            for i in range(num_sample):
                if ids[i] in outliers:
                    continue
                tmp_info = {
                    'id': ids[i],
                    'path': os.path.join(self.video_path, task,
                                         videonames[i].replace('*.png', '')),
                    'label_text': annotation[i],
                    'label': self.sentence2index(annotation[i])
                }
                video_infos.append(tmp_info)
            phoenix_dataset[task] = video_infos
        return phoenix_dataset

    def sentence2index(self, sent):
        sent = sent.split(' ')
        s = []
        for word in sent:
            if word in self.vocab.word2index:
                s.append(self.vocab.word2index[word])
            else:
                s.append(self.vocab.word2index['<UNK>'])
        return s

    def load_video(self, video_name):
        feat = caffeFeatureLoader.loadVideoC3DFeature(video_name, 'pool5')
        feat = torch.tensor(feat)
        return feat

    def get_images(self, video_name):
        frames_list = glob.glob(
            os.path.join(video_name, '*.{:s}'.format(self.image_type)))
        frames_list.sort()
        num_frame = len(frames_list)
        if self.phase == 'train' and self.sample and num_frame > self.max_video_len:
            # first, Randomly repeat 20%. Second, Randomly delete 20%
            ids = list(range(num_frame))
            add_idx = random.sample(ids, int(0.2 * len(ids)))
            ids.extend(add_idx)
            ids.sort()
            ids = random.sample(ids, int(0.8 * len(ids)))
            ids.sort()
            if len(ids) > self.max_video_len:
                ids = random.sample(ids, self.max_video_len)
                ids.sort()
            frames_list = [frames_list[i] for i in ids]
        return frames_list

    def load_video_from_images(self, frames_list):
        frames_tensor_list = [
            self.load_image(frame_file, self.phase)
            for frame_file in frames_list
        ]
        video_tensor = torch.stack(frames_tensor_list, dim=0)
        return video_tensor

    def load_image(self, img_name, phase, reduce_mean=True):
        image = Image.open(img_name)
        if phase == "train":
            image = self.transform(image)
        elif phase == "test" or phase == "dev":
            image = self.test_transform(image)
        return image

    def collate_fn_video(self, batch, padding=6):
        # batch.sort(key=lambda x: x['data'].shape[0], reverse=True)
        len_video = [x["data_len"] for x in batch]
        len_label = [len(x['label']) for x in batch]
        batch_video = torch.zeros(len(len_video), max(len_video), 3,
                                  self.input_shape,
                                  self.input_shape)  # padding with zeros
        batch_decoder_label = torch.zeros(
            len(len_video), max(len_label) + 2).long()  # [batch, max_len_label + 2]
        batch_label = []
        IDs = []
        len_decoder_label = []
        for i, bat in enumerate(batch):
            data = self.load_video_from_images(bat['data'])
            label = bat['label']
            len_decoder_label.append(len_label[i] + 2)
            batch_label.extend(label)
            batch_decoder_label[i, 1:1 + len(label)] = torch.LongTensor(label)
            batch_decoder_label[i, 0] = self.vocab.bos()  # bos
            batch_decoder_label[i, 1 + len(label)] = self.vocab.eos()  # eos
            batch_video[i, :len_video[i], :] = torch.FloatTensor(data)
            IDs.append(bat['id'])
        batch_label = torch.LongTensor(batch_label)
        batch_decoder_label = torch.LongTensor(batch_decoder_label)
        len_video = torch.LongTensor(len_video)
        len_label = torch.LongTensor(len_label)
        len_decoder_label = torch.LongTensor(len_decoder_label)

        # batch_video = batch_video.permute(0, 2, 1)

        return {
            'data': batch_video,
            'label': batch_label,
            'decoder_label': batch_decoder_label,
            'len_data': len_video,
            'len_label': len_label,
            'len_decoder_label': len_decoder_label,
            'id': IDs
        }
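
# Typical usage of the dataset above (the paths below are placeholders): the custom
# collate function zero-pads videos to the longest clip in the batch and wraps each
# label sequence with BOS/EOS for the decoder.
#
#   dataset = PhoenixVideo(vocab_file='...', corpus_dir='...', video_path='...', phase='train')
#   loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True,
#                                        collate_fn=dataset.collate_fn_video)
#   batch = next(iter(loader))
#   batch['data'].shape        # [batch, max_len_video, 3, 112, 112]
#   batch['decoder_label']     # [batch, max_len_label + 2]: BOS ... EOS, zero-padded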
Exemplo n.º 15
0
def ensemble_inference(valid_iterator,
                       models,
                       vocab_tgt: Vocabulary,
                       batch_size,
                       max_steps,
                       beam_size=5,
                       alpha=-1.0,
                       rank=0,
                       world_size=1,
                       using_numbering_iterator=True):
    for model in models:
        model.eval()

    trans_in_all_beams = [[] for _ in range(beam_size)]

    # assert keep_n_beams <= beam_size

    if using_numbering_iterator:
        numbers = []

    if rank == 0:
        infer_progress_bar = tqdm(total=len(valid_iterator),
                                  desc=' - (Infer)  ',
                                  unit="sents")
    else:
        infer_progress_bar = None

    valid_iter = valid_iterator.build_generator(batch_size=batch_size)

    for batch in valid_iter:

        seq_numbers = batch[0]

        if using_numbering_iterator:
            numbers += seq_numbers

        seqs_x = batch[1]

        if infer_progress_bar is not None:
            infer_progress_bar.update(len(seqs_x) * world_size)

        x = prepare_data(seqs_x, seqs_y=None, cuda=Constants.USE_GPU)

        with torch.no_grad():
            word_ids = ensemble_beam_search(nmt_models=models,
                                            beam_size=beam_size,
                                            max_steps=max_steps,
                                            src_seqs=x,
                                            alpha=alpha)

        word_ids = word_ids.cpu().numpy().tolist()

        # Append result
        for sent_t in word_ids:
            for ii, sent_ in enumerate(sent_t):
                sent_ = vocab_tgt.ids2sent(sent_)
                if sent_ == "":
                    sent_ = '%s' % vocab_tgt.id2token(vocab_tgt.eos)
                trans_in_all_beams[ii].append(sent_)

    if infer_progress_bar is not None:
        infer_progress_bar.close()

    if world_size > 1:
        if using_numbering_iterator:
            numbers = dist.all_gather_py_with_shared_fs(numbers)

        trans_in_all_beams = [
            combine_from_all_shards(trans) for trans in trans_in_all_beams
        ]

    if using_numbering_iterator:
        origin_order = np.argsort(numbers).tolist()
        trans_in_all_beams = [[trans[ii] for ii in origin_order]
                              for trans in trans_in_all_beams]

    return trans_in_all_beams
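
# The "resume the ordering" step above works because the numbering iterator attaches the
# original sentence index to every example: shards and buckets may change the order, and
# an argsort over the collected indices restores it. Tiny illustration:
import numpy as np

def restore_order_sketch(numbers, outputs):
    # numbers[i] is the original position of outputs[i]
    origin_order = np.argsort(numbers).tolist()
    return [outputs[ii] for ii in origin_order]

# restore_order_sketch([2, 0, 1], ["c", "a", "b"]) -> ["a", "b", "c"]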
Exemplo n.º 16
0
def translate(FLAGS):
    GlobalNames.USE_GPU = FLAGS.use_gpu

    if FLAGS.multi_gpu:

        if hvd is None or distributed is None:
            ERROR("Distributed training is disable. Please check the installation of Horovod.")

        hvd.init()
        world_size = hvd.size()
        rank = hvd.rank()

        if GlobalNames.USE_GPU:
            torch.cuda.set_device(hvd.local_rank())
    else:
        world_size = 1
        rank = 0

    if rank != 0:
        close_logging()

    config_path = os.path.abspath(FLAGS.config_path)

    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']

    timer = Timer()
    # ================================================================================== #
    # Load Data

    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][1])

    valid_dataset = TextLineDataset(data_path=FLAGS.source_path,
                                    vocabulary=vocab_src)

    valid_iterator = DataIterator(dataset=valid_dataset,
                                  batch_size=FLAGS.batch_size,
                                  use_bucket=True,
                                  buffer_size=100000,
                                  numbering=True,
                                  world_size=world_size,
                                  rank=rank
                                  )

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================================================================== #
    # Build Model & Sampler & Validation
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words, **model_configs)
    nmt_model.eval()
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Reloading model parameters...')
    timer.tic()

    params = load_model_parameters(FLAGS.model_path, map_location="cpu")

    nmt_model.load_state_dict(params, strict=False)

    if GlobalNames.USE_GPU:
        nmt_model.cuda()

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Begin...')

    result_numbers = []
    result = []
    n_words = 0

    timer.tic()

    if rank == 0:
        infer_progress_bar = tqdm(total=len(valid_iterator),
                                  desc=' - (Infer)  ',
                                  unit="sents")
    else:
        infer_progress_bar = None

    valid_iter = valid_iterator.build_generator()

    for batch in valid_iter:

        numbers, seqs_x = batch

        batch_size_t = len(seqs_x)

        x = prepare_data(seqs_x=seqs_x, cuda=GlobalNames.USE_GPU)

        with torch.no_grad():
            word_ids = beam_search(nmt_model=nmt_model, beam_size=FLAGS.beam_size, max_steps=FLAGS.max_steps,
                                   src_seqs=x, alpha=FLAGS.alpha)

        word_ids = word_ids.cpu().numpy().tolist()

        # Append result
        for sent_t in word_ids:
            sent_t = [[wid for wid in line if wid != PAD] for line in sent_t]
            result.append(sent_t)

            n_words += len(sent_t[0])

        result_numbers += numbers

        if rank == 0:
            infer_progress_bar.update(batch_size_t * world_size)

    if rank == 0:
        infer_progress_bar.close()

    if FLAGS.multi_gpu:
        n_words = sum(distributed.all_gather(n_words))

    INFO('Done. Speed: {0:.2f} words/sec'.format(n_words / (timer.toc(return_seconds=True))))

    if FLAGS.multi_gpu:

        result_gathered = distributed.all_gather_with_shared_fs(result)

        result = []

        for lines in itertools.zip_longest(*result_gathered, fillvalue=None):
            for line in lines:
                if line is not None:
                    result.append(line)

        result_numbers_gathered = distributed.all_gather_with_shared_fs(result_numbers)

        result_numbers = []

        for numbers in itertools.zip_longest(*result_numbers_gathered, fillvalue=None):
            for num in numbers:
                if num is not None:
                    result_numbers.append(num)

    if rank == 0:
        translation = []
        for sent in result:
            samples = []
            for trans in sent:
                sample = []
                for w in trans:
                    if w == vocab_tgt.EOS:
                        break
                    sample.append(vocab_tgt.id2token(w))
                samples.append(vocab_tgt.tokenizer.detokenize(sample))
            translation.append(samples)

        # resume the ordering
        origin_order = np.argsort(result_numbers).tolist()
        translation = [translation[ii] for ii in origin_order]

        keep_n = FLAGS.beam_size if FLAGS.keep_n <= 0 else min(FLAGS.beam_size, FLAGS.keep_n)
        outputs = ['%s.%d' % (FLAGS.saveto, i) for i in range(keep_n)]

        with batch_open(outputs, 'w') as handles:
            for trans in translation:
                for i in range(keep_n):
                    if i < len(trans):
                        handles[i].write('%s\n' % trans[i])
                    else:
                        handles[i].write('%s\n' % 'eos')
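
# In the multi-GPU branch above, each rank is assumed to process every world_size-th
# sentence of the stream; all_gather_with_shared_fs returns one list per rank, and
# interleaving them with itertools.zip_longest (dropping the None fill values of the
# shorter shards) rebuilds a single flat list before the numbering-based reordering.
# Minimal illustration of that merge:
import itertools

def merge_shards_sketch(shards):
    merged = []
    for group in itertools.zip_longest(*shards, fillvalue=None):
        merged.extend(item for item in group if item is not None)
    return merged

# merge_shards_sketch([[0, 2, 4], [1, 3]]) -> [0, 1, 2, 3, 4]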
Exemplo n.º 17
0
def train(FLAGS):
    """
    FLAGS:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """

    # ================================================================================== #
    # Initialization for training on different devices
    # - CPU/GPU
    # - Single/Distributed
    GlobalNames.USE_GPU = FLAGS.use_gpu

    if FLAGS.multi_gpu:

        if hvd is None or distributed is None:
            ERROR("Distributed training is disable. Please check the installation of Horovod.")

        hvd.init()
        world_size = hvd.size()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
    else:
        world_size = 1
        rank = 0
        local_rank = 0

    if GlobalNames.USE_GPU:
        torch.cuda.set_device(local_rank)
        CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        CURRENT_DEVICE = "cpu"

    # If not root_rank, close logging
    if rank != 0:
        close_logging()

    # write log of training to file.
    if rank == 0:
        write_log_to_file(os.path.join(FLAGS.log_path, "%s.log" % time.strftime("%Y%m%d-%H%M%S")))

    # ================================================================================== #
    # Parsing configuration files

    config_path = os.path.abspath(FLAGS.config_path)
    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)

    INFO(pretty_configs(configs))

    # Add default configs
    configs = default_baseline_configs(configs)
    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    GlobalNames.SEED = training_configs['seed']

    set_seed(GlobalNames.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data

    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][1])

    actual_buffer_size = training_configs["buffer_size"] * max(1, training_configs["update_cycle"])

    train_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['train_data'][0],
                        vocabulary=vocab_src,
                        max_len=data_configs['max_len'][0],
                        ),
        TextLineDataset(data_path=data_configs['train_data'][1],
                        vocabulary=vocab_tgt,
                        max_len=data_configs['max_len'][1],
                        )
    )

    valid_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['valid_data'][0],
                        vocabulary=vocab_src,
                        ),
        TextLineDataset(data_path=data_configs['valid_data'][1],
                        vocabulary=vocab_tgt,
                        )
    )

    training_iterator = DataIterator(dataset=train_bitext_dataset,
                                     batch_size=training_configs["batch_size"],
                                     use_bucket=training_configs['use_bucket'],
                                     buffer_size=actual_buffer_size,
                                     batching_func=training_configs['batching_key'],
                                     world_size=world_size,
                                     rank=rank)

    valid_iterator = DataIterator(dataset=valid_bitext_dataset,
                                  batch_size=training_configs['valid_batch_size'],
                                  use_bucket=True, buffer_size=100000, numbering=True,
                                  world_size=world_size, rank=rank)

    bleu_scorer = SacreBLEUScorer(reference_path=data_configs["bleu_valid_reference"],
                                  num_refs=data_configs["num_refs"],
                                  lang_pair=data_configs["lang_pair"],
                                  sacrebleu_args=training_configs["bleu_valid_configs"]['sacrebleu_args'],
                                  postprocess=training_configs["bleu_valid_configs"]['postprocess']
                                  )

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    lrate = optimizer_configs['learning_rate']
    is_early_stop = False

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We will do the steps below one after another
    #     1. build models & criterion
    #     2. move models & criterion to gpu if needed
    #     3. load pre-trained model if needed
    #     4. build optimizer
    #     5. build learning rate scheduler if needed
    #     6. load checkpoints if needed

    # 0. Initial
    model_collections = Collections()
    best_model_prefix = os.path.join(FLAGS.saveto, FLAGS.model_name + GlobalNames.MY_BEST_MODEL_SUFFIX)

    checkpoint_saver = Saver(save_prefix="{0}.ckpt".format(os.path.join(FLAGS.saveto, FLAGS.model_name)),
                             num_max_keeping=training_configs['num_kept_checkpoints']
                             )
    best_model_saver = Saver(save_prefix=best_model_prefix, num_max_keeping=training_configs['num_kept_best_model'])

    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words, **model_configs)
    INFO(nmt_model)

    critic = NMTCriterion(label_smoothing=model_configs['label_smoothing'])

    INFO(critic)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 2. Move to GPU
    if GlobalNames.USE_GPU:
        nmt_model = nmt_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model, FLAGS.pretrain_path, exclude_prefix=None, device=CURRENT_DEVICE)

    # 4. Build optimizer
    INFO('Building Optimizer...')
    optim = Optimizer(name=optimizer_configs['optimizer'],
                      model=nmt_model,
                      lr=lrate,
                      grad_clip=optimizer_configs['grad_clip'],
                      optim_args=optimizer_configs['optimizer_params'],
                      distributed=True if world_size > 1 else False,
                      update_cycle=training_configs['update_cycle']
                      )
    # 5. Build scheduler for optimizer if needed
    if optimizer_configs['schedule_method'] is not None:

        if optimizer_configs['schedule_method'] == "loss":

            scheduler = ReduceOnPlateauScheduler(optimizer=optim,
                                                 **optimizer_configs["scheduler_configs"]
                                                 )

        elif optimizer_configs['schedule_method'] == "noam":
            scheduler = NoamScheduler(optimizer=optim, **optimizer_configs['scheduler_configs'])
        else:
            WARN("Unknown scheduler name {0}. Do not use lr_scheduling.".format(optimizer_configs['schedule_method']))
            scheduler = None
    else:
        scheduler = None

    # 6. build moving average

    if training_configs['moving_average_method'] is not None:
        ma = MovingAverage(moving_average_method=training_configs['moving_average_method'],
                           named_params=nmt_model.named_parameters(),
                           alpha=training_configs['moving_average_alpha'])
    else:
        ma = None

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # Reload from latest checkpoint
    if FLAGS.reload:
        checkpoint_saver.load_latest(model=nmt_model, optim=optim, lr_scheduler=scheduler,
                                     collections=model_collections, ma=ma)

    # broadcast parameters and optimizer states
    if world_size > 1:
        hvd.broadcast_parameters(params=nmt_model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer=optim.optim, root_rank=0)

    # ================================================================================== #
    # Prepare training

    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [1])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]
    cum_n_samples = 0
    cum_n_words = 0
    best_valid_loss = 1.0 * 1e10  # effectively +inf
    update_cycle = training_configs['update_cycle']
    grad_denom = 0

    if rank == 0:
        summary_writer = SummaryWriter(log_dir=FLAGS.log_path)
    else:
        summary_writer = None

    # Timer for computing speed
    timer_for_speed = Timer()
    timer_for_speed.tic()

    INFO('Begin training...')

    while True:

        if summary_writer is not None:
            summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()

        if rank == 0:
            training_progress_bar = tqdm(desc='  - (Epoch %d)   ' % eidx,
                                         total=len(training_iterator),
                                         unit="sents"
                                         )
        else:
            training_progress_bar = None

        for batch in training_iter:

            seqs_x, seqs_y = batch

            batch_size = len(seqs_x)

            cum_n_samples += batch_size
            cum_n_words += sum(len(s) for s in seqs_y)

            try:
                # Prepare data
                x, y = prepare_data(seqs_x, seqs_y, cuda=GlobalNames.USE_GPU)

                loss = compute_forward(model=nmt_model,
                                       critic=critic,
                                       seqs_x=x,
                                       seqs_y=y,
                                       eval=False,
                                       normalization=1.0,
                                       norm_by_words=training_configs["norm_by_words"])

                update_cycle -= 1
                grad_denom += batch_size

            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                else:
                    raise e

            # When update_cycle becomes 0, it means end of one batch. Several things will be done:
            # - update parameters
            # - reset update_cycle and grad_denom
            # - update uidx
            # - update moving average

            if update_cycle == 0:
                if world_size > 1:
                    grad_denom = distributed.all_reduce(grad_denom)

                optim.step(denom=grad_denom)
                optim.zero_grad()

                if training_progress_bar is not None:
                    training_progress_bar.update(grad_denom)

                update_cycle = training_configs['update_cycle']
                grad_denom = 0

                uidx += 1

                if scheduler is None:
                    pass
                elif optimizer_configs["schedule_method"] == "loss":
                    scheduler.step(metric=best_valid_loss)
                else:
                    scheduler.step(global_step=uidx)

                if ma is not None and eidx >= training_configs['moving_average_start_epoch']:
                    ma.step()
            else:
                continue

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['disp_freq']):

                if world_size > 1:
                    cum_n_words = sum(distributed.all_gather(cum_n_words))
                    cum_n_samples = sum(distributed.all_gather(cum_n_samples))

                # words per second and sents per second
                words_per_sec = cum_n_words / (timer.toc(return_seconds=True))
                sents_per_sec = cum_n_samples / (timer.toc(return_seconds=True))
                lrate = list(optim.get_lrate())[0]

                if summary_writer is not None:
                    summary_writer.add_scalar("Speed(words/sec)", scalar_value=words_per_sec, global_step=uidx)
                    summary_writer.add_scalar("Speed(sents/sen)", scalar_value=sents_per_sec, global_step=uidx)
                    summary_writer.add_scalar("lrate", scalar_value=lrate, global_step=uidx)
                    summary_writer.add_scalar("oom_count", scalar_value=oom_count, global_step=uidx)

                # Reset timer
                timer.tic()
                cum_n_words = 0
                cum_n_samples = 0

            # ================================================================================== #
            # Loss Validation & Learning rate annealing
            if should_trigger_by_steps(global_step=uidx, n_epoch=eidx, every_n_step=training_configs['loss_valid_freq'],
                                       debug=FLAGS.debug):

                valid_loss = loss_validation(model=nmt_model,
                                             critic=critic,
                                             valid_iterator=valid_iterator,
                                             rank=rank,
                                             world_size=world_size
                                             )

                model_collections.add_to_collection("history_losses", valid_loss)

                min_history_loss = np.array(model_collections.get_collection("history_losses")).min()

                best_valid_loss = min_history_loss

                if summary_writer is not None:
                    summary_writer.add_scalar("loss", valid_loss, global_step=uidx)
                    summary_writer.add_scalar("best_loss", min_history_loss, global_step=uidx)

            # ================================================================================== #
            # BLEU Validation & Early Stop

            if should_trigger_by_steps(global_step=uidx, n_epoch=eidx,
                                       every_n_step=training_configs['bleu_valid_freq'],
                                       min_step=training_configs['bleu_valid_warmup'],
                                       debug=FLAGS.debug):

                valid_bleu = bleu_validation(uidx=uidx,
                                             valid_iterator=valid_iterator,
                                             batch_size=training_configs["bleu_valid_batch_size"],
                                             model=nmt_model,
                                             bleu_scorer=bleu_scorer,
                                             vocab_tgt=vocab_tgt,
                                             valid_dir=FLAGS.valid_path,
                                             max_steps=training_configs["bleu_valid_configs"]["max_steps"],
                                             beam_size=training_configs["bleu_valid_configs"]["beam_size"],
                                             alpha=training_configs["bleu_valid_configs"]["alpha"],
                                             world_size=world_size,
                                             rank=rank,
                                             )

                model_collections.add_to_collection(key="history_bleus", value=valid_bleu)

                best_valid_bleu = float(np.array(model_collections.get_collection("history_bleus")).max())

                if summary_writer is not None:
                    summary_writer.add_scalar("bleu", valid_bleu, uidx)
                    summary_writer.add_scalar("best_bleu", best_valid_bleu, uidx)

                # If model get new best valid bleu score
                if valid_bleu >= best_valid_bleu:
                    bad_count = 0

                    if is_early_stop is False:
                        if rank == 0:
                            # 1. save the best model
                            torch.save(nmt_model.state_dict(), best_model_prefix + ".final")

                            # 2. record all several best models
                            best_model_saver.save(global_step=uidx, model=nmt_model, ma=ma)
                else:
                    bad_count += 1

                    # At least one epoch should be traversed
                    if bad_count >= training_configs['early_stop_patience'] and eidx > 0:
                        is_early_stop = True
                        WARN("Early Stop!")

                if summary_writer is not None:
                    summary_writer.add_scalar("bad_count", bad_count, uidx)

                INFO("{0} Loss: {1:.2f} BLEU: {2:.2f} lrate: {3:6f} patience: {4}".format(
                    uidx, valid_loss, valid_bleu, lrate, bad_count
                ))

            # ================================================================================== #
            # Saving checkpoints
            if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['save_freq'], debug=FLAGS.debug):
                model_collections.add_to_collection("uidx", uidx)
                model_collections.add_to_collection("eidx", eidx)
                model_collections.add_to_collection("bad_count", bad_count)

                if not is_early_stop:
                    if rank == 0:
                        checkpoint_saver.save(global_step=uidx,
                                              model=nmt_model,
                                              optim=optim,
                                              lr_scheduler=scheduler,
                                              collections=model_collections,
                                              ma=ma)

        if training_progress_bar is not None:
            training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
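
# Stripped-down version of the BLEU-based early-stop bookkeeping above: keep a history
# of validation scores and a counter of consecutive non-improving checks; stop once the
# counter reaches the configured patience.

def early_stop_sketch(history_bleus, new_bleu, bad_count, patience):
    history_bleus = history_bleus + [new_bleu]
    best = max(history_bleus)
    if new_bleu >= best:
        bad_count = 0               # new best model: reset patience
    else:
        bad_count += 1
    return history_bleus, bad_count, bad_count >= patience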
Exemplo n.º 18
0
def train(FLAGS):
    """
    FLAGS:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """

    # write log of training to file.
    write_log_to_file(os.path.join(FLAGS.log_path, "%s.log" % time.strftime("%Y%m%d-%H%M%S")))

    GlobalNames.USE_GPU = FLAGS.use_gpu

    if GlobalNames.USE_GPU:
        CURRENT_DEVICE = "cuda:0"
    else:
        CURRENT_DEVICE = "cpu"

    config_path = os.path.abspath(FLAGS.config_path)
    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)

    INFO(pretty_configs(configs))

    # Add default configs
    configs = default_configs(configs)
    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    GlobalNames.SEED = training_configs['seed']

    set_seed(GlobalNames.SEED)

    best_model_prefix = os.path.join(FLAGS.saveto, FLAGS.model_name + GlobalNames.MY_BEST_MODEL_SUFFIX)

    timer = Timer()

    # ================================================================================== #
    # Load Data

    INFO('Loading data...')
    timer.tic()

    # Build source and target vocabularies
    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][1])

    train_batch_size = training_configs["batch_size"] * max(1, training_configs["update_cycle"])
    train_buffer_size = training_configs["buffer_size"] * max(1, training_configs["update_cycle"])

    train_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['train_data'][0],
                        vocabulary=vocab_src,
                        max_len=data_configs['max_len'][0],
                        ),
        TextLineDataset(data_path=data_configs['train_data'][1],
                        vocabulary=vocab_tgt,
                        max_len=data_configs['max_len'][1],
                        ),
        shuffle=training_configs['shuffle']
    )

    valid_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['valid_data'][0],
                        vocabulary=vocab_src,
                        ),
        TextLineDataset(data_path=data_configs['valid_data'][1],
                        vocabulary=vocab_tgt,
                        )
    )

    training_iterator = DataIterator(dataset=train_bitext_dataset,
                                     batch_size=train_batch_size,
                                     use_bucket=training_configs['use_bucket'],
                                     buffer_size=train_buffer_size,
                                     batching_func=training_configs['batching_key'])

    valid_iterator = DataIterator(dataset=valid_bitext_dataset,
                                  batch_size=training_configs['valid_batch_size'],
                                  use_bucket=True, buffer_size=100000, numbering=True)

    bleu_scorer = SacreBLEUScorer(reference_path=data_configs["bleu_valid_reference"],
                                  num_refs=data_configs["num_refs"],
                                  lang_pair=data_configs["lang_pair"],
                                  sacrebleu_args=training_configs["bleu_valid_configs"]['sacrebleu_args'],
                                  postprocess=training_configs["bleu_valid_configs"]['postprocess']
                                  )

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    lrate = optimizer_configs['learning_rate']
    is_early_stop = False

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We would do steps below one after another
    #     1. build models & criterion
    #     2. move models & criterion to gpu if needed
    #     3. load pre-trained model if needed
    #     4. build optimizer
    #     5. build learning rate scheduler if needed
    #     6. load checkpoints if needed

    # 0. Initial
    model_collections = Collections()
    checkpoint_saver = Saver(save_prefix="{0}.ckpt".format(os.path.join(FLAGS.saveto, FLAGS.model_name)),
                             num_max_keeping=training_configs['num_kept_checkpoints']
                             )
    best_model_saver = Saver(save_prefix=best_model_prefix, num_max_keeping=training_configs['num_kept_best_model'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words, **model_configs)
    INFO(nmt_model)

    critic = NMTCriterion(label_smoothing=model_configs['label_smoothing'])

    INFO(critic)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 2. Move to GPU
    if GlobalNames.USE_GPU:
        nmt_model = nmt_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model, FLAGS.pretrain_path, exclude_prefix=None, device=CURRENT_DEVICE)

    # 4. Build optimizer
    INFO('Building Optimizer...')
    optim = Optimizer(name=optimizer_configs['optimizer'],
                      model=nmt_model,
                      lr=lrate,
                      grad_clip=optimizer_configs['grad_clip'],
                      optim_args=optimizer_configs['optimizer_params']
                      )
    # 5. Build scheduler for optimizer if needed
    if optimizer_configs['schedule_method'] is not None:

        if optimizer_configs['schedule_method'] == "loss":

            scheduler = ReduceOnPlateauScheduler(optimizer=optim,
                                                 **optimizer_configs["scheduler_configs"]
                                                 )

        elif optimizer_configs['schedule_method'] == "noam":
            scheduler = NoamScheduler(optimizer=optim, **optimizer_configs['scheduler_configs'])
        else:
            WARN("Unknown scheduler name {0}. Do not use lr_scheduling.".format(optimizer_configs['schedule_method']))
            scheduler = None
    else:
        scheduler = None

    # 6. build EMA
    if training_configs['ema_decay'] > 0.0:
        ema = ExponentialMovingAverage(named_params=nmt_model.named_parameters(), decay=training_configs['ema_decay'])
    else:
        ema = None

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # Reload from latest checkpoint
    if FLAGS.reload:
        checkpoint_saver.load_latest(model=nmt_model, optim=optim, lr_scheduler=scheduler,
                                     collections=model_collections)

    # ================================================================================== #
    # Prepare training

    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [0])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]

    summary_writer = SummaryWriter(log_dir=FLAGS.log_path)

    cum_samples = 0
    cum_words = 0
    best_valid_loss = 1.0 * 1e10  # Max Float
    saving_files = []

    # Timer for computing speed
    timer_for_speed = Timer()
    timer_for_speed.tic()

    INFO('Begin training...')

    while True:

        summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()
        training_progress_bar = tqdm(desc='  - (Epoch %d)   ' % eidx,
                                     total=len(training_iterator),
                                     unit="sents"
                                     )
        for batch in training_iter:

            uidx += 1

            if scheduler is None:
                pass
            elif optimizer_configs["schedule_method"] == "loss":
                scheduler.step(metric=best_valid_loss)
            else:
                scheduler.step(global_step=uidx)

            seqs_x, seqs_y = batch

            n_samples_t = len(seqs_x)
            n_words_t = sum(len(s) for s in seqs_y)

            cum_samples += n_samples_t
            cum_words += n_words_t

            training_progress_bar.update(n_samples_t)

            optim.zero_grad()

            # Prepare data
            for seqs_x_t, seqs_y_t in split_shard(seqs_x, seqs_y, split_size=training_configs['update_cycle']):
                x, y = prepare_data(seqs_x_t, seqs_y_t, cuda=GlobalNames.USE_GPU)

                loss = compute_forward(model=nmt_model,
                                       critic=critic,
                                       seqs_x=x,
                                       seqs_y=y,
                                       eval=False,
                                       normalization=n_samples_t,
                                       norm_by_words=training_configs["norm_by_words"])
            optim.step()

            if ema is not None:
                ema.step()

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['disp_freq']):
                # words per second and sents per second
                words_per_sec = cum_words / (timer.toc(return_seconds=True))
                sents_per_sec = cum_samples / (timer.toc(return_seconds=True))
                lrate = list(optim.get_lrate())[0]

                summary_writer.add_scalar("Speed(words/sec)", scalar_value=words_per_sec, global_step=uidx)
                summary_writer.add_scalar("Speed(sents/sen)", scalar_value=sents_per_sec, global_step=uidx)
                summary_writer.add_scalar("lrate", scalar_value=lrate, global_step=uidx)

                # Reset timer
                timer.tic()
                cum_words = 0
                cum_samples = 0

            # ================================================================================== #
            # Saving checkpoints
            if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['save_freq'], debug=FLAGS.debug):
                model_collections.add_to_collection("uidx", uidx)
                model_collections.add_to_collection("eidx", eidx)
                model_collections.add_to_collection("bad_count", bad_count)

                if not is_early_stop:

                    checkpoint_saver.save(global_step=uidx, model=nmt_model, optim=optim, lr_scheduler=scheduler,
                                          collections=model_collections, ema=ema)

            # ================================================================================== #
            # Loss Validation & Learning rate annealing
            if should_trigger_by_steps(global_step=uidx, n_epoch=eidx, every_n_step=training_configs['loss_valid_freq'],
                                       debug=FLAGS.debug):

                if ema is not None:
                    origin_state_dict = deepcopy(nmt_model.state_dict())
                    nmt_model.load_state_dict(ema.state_dict(), strict=False)

                valid_loss = loss_validation(model=nmt_model,
                                             critic=critic,
                                             valid_iterator=valid_iterator,
                                             )

                model_collections.add_to_collection("history_losses", valid_loss)

                min_history_loss = np.array(model_collections.get_collection("history_losses")).min()

                summary_writer.add_scalar("loss", valid_loss, global_step=uidx)
                summary_writer.add_scalar("best_loss", min_history_loss, global_step=uidx)

                best_valid_loss = min_history_loss

                if ema is not None:
                    nmt_model.load_state_dict(origin_state_dict)
                    del origin_state_dict

            # ================================================================================== #
            # BLEU Validation & Early Stop

            if should_trigger_by_steps(global_step=uidx, n_epoch=eidx,
                                       every_n_step=training_configs['bleu_valid_freq'],
                                       min_step=training_configs['bleu_valid_warmup'],
                                       debug=FLAGS.debug):

                if ema is not None:
                    origin_state_dict = deepcopy(nmt_model.state_dict())
                    nmt_model.load_state_dict(ema.state_dict(), strict=False)

                valid_bleu = bleu_validation(uidx=uidx,
                                             valid_iterator=valid_iterator,
                                             batch_size=training_configs["bleu_valid_batch_size"],
                                             model=nmt_model,
                                             bleu_scorer=bleu_scorer,
                                             vocab_tgt=vocab_tgt,
                                             valid_dir=FLAGS.valid_path,
                                             max_steps=training_configs["bleu_valid_configs"]["max_steps"],
                                             beam_size=training_configs["bleu_valid_configs"]["beam_size"],
                                             alpha=training_configs["bleu_valid_configs"]["alpha"]
                                             )

                model_collections.add_to_collection(key="history_bleus", value=valid_bleu)

                best_valid_bleu = float(np.array(model_collections.get_collection("history_bleus")).max())

                summary_writer.add_scalar("bleu", valid_bleu, uidx)
                summary_writer.add_scalar("best_bleu", best_valid_bleu, uidx)

                # If the model gets a new best valid BLEU score
                if valid_bleu >= best_valid_bleu:
                    bad_count = 0

                    if is_early_stop is False:
                        # 1. save the best model
                        torch.save(nmt_model.state_dict(), best_model_prefix + ".final")

                        # 2. record the several best models
                        best_model_saver.save(global_step=uidx, model=nmt_model)
                else:
                    bad_count += 1

                    # At least one epoch should be traversed
                    if bad_count >= training_configs['early_stop_patience'] and eidx > 0:
                        is_early_stop = True
                        WARN("Early Stop!")

                summary_writer.add_scalar("bad_count", bad_count, uidx)

                if ema is not None:
                    nmt_model.load_state_dict(origin_state_dict)
                    del origin_state_dict

                INFO("{0} Loss: {1:.2f} BLEU: {2:.2f} lrate: {3:6f} patience: {4}".format(
                    uidx, valid_loss, valid_bleu, lrate, bad_count
                ))

        training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
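split_shard is not shown in this snippet; judging by how it is paired with update_cycle, it splits each batch into shards so that gradients accumulate over several backward passes before a single optim.step(). A rough sketch of that gradient-accumulation pattern in plain PyTorch, with a made-up toy model, data and update_cycle value:

import torch
import torch.nn as nn

model = nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()

update_cycle = 4                     # number of shards per optimizer step
x = torch.randn(32, 8)
y = torch.randint(0, 2, (32,))

optimizer.zero_grad()
for x_shard, y_shard in zip(x.chunk(update_cycle), y.chunk(update_cycle)):
    loss = criterion(model(x_shard), y_shard) / update_cycle  # normalize across shards
    loss.backward()                  # gradients accumulate in .grad
optimizer.step()                     # one parameter update per full batch
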
Exemplo n.º 19
0
def main():
    opts = parse_args()
    init_logging(os.path.join(opts.log_dir, '{:s}_log.txt'.format(opts.task)))

    if torch.cuda.is_available():
        torch.cuda.set_device(opts.gpu)
        logging.info("Using GPU!")
        device = "cuda"
    else:
        logging.info("Using CPU!")
        device = "cpu"

    logging.info(opts)

    train_datasets = PhoenixVideo(opts.vocab_file,
                                  opts.corpus_dir,
                                  opts.video_path,
                                  phase="train",
                                  DEBUG=opts.DEBUG)
    valid_datasets = PhoenixVideo(opts.vocab_file,
                                  opts.corpus_dir,
                                  opts.video_path,
                                  phase="dev",
                                  DEBUG=opts.DEBUG)
    vocab_size = valid_datasets.vocab.num_words
    blank_id = valid_datasets.vocab.word2index['<BLANK>']
    vocabulary = Vocabulary(opts.vocab_file)
    #model = DilatedSLRNet(opts, device, vocab_size, vocabulary,
    #                      dilated_channels=512, num_blocks=5, dilations=[1, 2, 4], dropout=0.0)
    model = MainStream(vocab_size)
    criterion = CtcLoss(opts, blank_id, device, reduction="none")

    # print(model)

    # Build trainer
    trainer = Trainer(opts, model, criterion, vocabulary, vocab_size, blank_id)

    if os.path.exists(opts.check_point):
        logging.info("Loading checkpoint file from {}".format(
            opts.check_point))
        epoch, num_updates, loss = trainer.load_checkpoint(opts.check_point)
    else:
        logging.info("No checkpoint file in found in {}".format(
            opts.check_point))
        epoch, num_updates, loss = 0, 0, 0.0

    trainer.set_num_updates(num_updates)
    model_manager = ModelManager(max_num_models=5)
    while epoch < opts.max_epoch and trainer.get_num_updates(
    ) < opts.max_updates:
        epoch += 1
        trainer.adjust_learning_rate(epoch)
        #trainer.dynamic_freeze_layers(epoch)
        loss = train(opts, train_datasets, valid_datasets, trainer, epoch,
                     num_updates, loss)

        #if num_updates % opts.save_interval_updates == 0:
        if epoch <= opts.stage_epoch * 2:
            phoenix_eval_err = eval(opts, valid_datasets, trainer, epoch)
            phoenix_eval_err = eval_tf(opts, valid_datasets, trainer, epoch)
        else:
            phoenix_eval_err = eval(opts, valid_datasets, trainer, epoch)
            phoenix_eval_err = eval_dec(opts, valid_datasets, trainer, epoch)

        save_ckpt = os.path.join(
            opts.log_dir, 'ep{:d}_{:.4f}.pkl'.format(epoch,
                                                     phoenix_eval_err[0]))
        trainer.save_checkpoint(save_ckpt, epoch, num_updates, loss)
        model_manager.update(save_ckpt, phoenix_eval_err, epoch)
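ModelManager(max_num_models=5) is not defined in this snippet; presumably it keeps only the best few checkpoints ranked by their evaluation error and removes the rest from disk. A hypothetical sketch of such a manager, where the class name, ranking key and file handling are assumptions rather than the project's actual implementation:

import os

class TopKCheckpointManager:
    """Keep at most max_num_models checkpoints, ranked by WER (lower is better)."""

    def __init__(self, max_num_models=5):
        self.max_num_models = max_num_models
        self.kept = []  # list of (wer, path)

    def update(self, ckpt_path, wer):
        self.kept.append((wer, ckpt_path))
        self.kept.sort(key=lambda item: item[0])
        while len(self.kept) > self.max_num_models:
            _, worst_path = self.kept.pop()   # drop the worst-ranked checkpoint
            if os.path.exists(worst_path):
                os.remove(worst_path)

# illustrative usage:
# manager = TopKCheckpointManager(max_num_models=5)
# manager.update("ep3_0.3125.pkl", wer=0.3125)
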
Exemplo n.º 20
0
def main():
    opts = parse_args()
    setup_seed(opts.seed)
    init_logging(os.path.join(opts.log_dir, '{:s}_seed{}_log.txt'.format(opts.task, opts.seed)))

    if torch.cuda.is_available():
        torch.cuda.set_device(opts.gpu)
        logging.info("Using GPU!")
        device = "cuda"
    else:
        logging.info("Using CPU!")
        device = "cpu"
        
    logging.info(opts)

    train_datasets = PhoenixVideo(opts.vocab_file, opts.corpus_dir, opts.video_path, phase="train", DEBUG=opts.DEBUG)
    valid_datasets = PhoenixVideo(opts.vocab_file, opts.corpus_dir, opts.video_path, phase="dev", DEBUG=opts.DEBUG)
    vocab_size = valid_datasets.vocab.num_words
    blank_id = valid_datasets.vocab.word2index['<BLANK>']
    vocabulary = Vocabulary(opts.vocab_file)
    model = MainStream(vocab_size, opts.bn_momentum)
    criterion = CtcLoss(opts, blank_id, device, reduction="none")
    ema = EMA(model, decay=0.999)
    # initialization: register EMA shadow parameters
    ema.register()

    # print(model)
    # Build trainer
    trainer = Trainer(opts, model, criterion, vocabulary, vocab_size, blank_id)

    if os.path.exists(opts.check_point):
        logging.info("Loading checkpoint file from {}".format(opts.check_point))
        epoch, num_updates, loss = trainer.load_checkpoint(opts.check_point)
    elif os.path.exists(opts.pretrain):
        logging.info("Loading checkpoint file from {}".format(opts.pretrain))
        trainer.pretrain(opts)
        epoch, num_updates, loss = 0, 0, 0.0
    else:
        logging.info("No checkpoint file in found in {}".format(opts.check_point))
        epoch, num_updates, loss = 0, 0, 0.0

    logging.info('| num. module params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    trainer.set_num_updates(num_updates)
    model_manager = ModelManager(max_num_models=25)
    while epoch < opts.max_epoch and trainer.get_num_updates() < opts.max_updates:
        epoch += 1
        trainer.adjust_learning_rate(epoch)
        loss = train(opts, train_datasets, valid_datasets, trainer, epoch, num_updates, loss, ema)

        if epoch <= opts.stage_epoch:
            eval_train(opts, train_datasets, trainer, epoch)
            # phoenix_eval_err = eval_tf(opts, valid_datasets, trainer, epoch)
            phoenix_eval_err = eval(opts, valid_datasets, trainer, epoch, ema)
        else:
            # eval_train(opts, train_datasets, trainer, epoch)
            phoenix_eval_err = eval(opts, valid_datasets, trainer, epoch, ema)

        save_ckpt = os.path.join(opts.log_dir, 'ep{:d}_{:.4f}.pkl'.format(epoch, phoenix_eval_err[0]))
        trainer.save_checkpoint(save_ckpt, epoch, num_updates, loss)
        model_manager.update(save_ckpt, phoenix_eval_err, epoch)
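The EMA class used above (ema.register(), decay 0.999) is not included in this snippet. A common register/update/apply-shadow interface looks roughly like the sketch below; treat it as an assumption about the pattern, not the project's exact code (model is any torch.nn.Module):

class EMA:
    """Exponential moving average of model parameters."""

    def __init__(self, model, decay=0.999):
        self.model = model
        self.decay = decay
        self.shadow = {}
        self.backup = {}

    def register(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def update(self):
        # shadow = decay * shadow + (1 - decay) * param
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name].mul_(self.decay).add_(param.data, alpha=1 - self.decay)

    def apply_shadow(self):
        # swap in the averaged weights, e.g. before evaluation
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.backup[name] = param.data.clone()
                param.data.copy_(self.shadow[name])

    def restore(self):
        # restore the raw training weights afterwards
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                param.data.copy_(self.backup[name])
        self.backup = {}
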
Exemplo n.º 21
0
def main():
    opts = parse_args()
    init_logging(os.path.join(opts.log_dir, '{:s}_log.txt'.format(opts.task)))

    if torch.cuda.is_available():
        torch.cuda.set_device(opts.gpu)
        logging.info("Using GPU!")
        device = "cuda"
    else:
        logging.info("Using CPU!")
        device = "cpu"

    logging.info(opts)

    test_datasets = PhoenixVideo(opts.vocab_file,
                                 opts.corpus_dir,
                                 opts.video_path,
                                 phase="train",
                                 DEBUG=opts.DEBUG)
    vocab_size = test_datasets.vocab.num_words
    blank_id = test_datasets.vocab.word2index['<BLANK>']
    vocabulary = Vocabulary(opts.vocab_file)
    #     model = DilatedSLRNet(opts, device, vocab_size, vocabulary,
    #                           dilated_channels=512, num_blocks=5, dilations=[1, 2, 4], dropout=0.0)
    model = MainStream(vocab_size)
    criterion = CtcLoss(opts, blank_id, device, reduction="none")
    trainer = Trainer(opts, model, criterion, vocabulary, vocab_size, blank_id)

    # ctcdecode
    ctc_decoder_vocab = [chr(x) for x in range(20000, 20000 + vocab_size)]
    ctc_decoder = ctcdecode.CTCBeamDecoder(ctc_decoder_vocab,
                                           beam_width=opts.beam_width,
                                           blank_id=blank_id,
                                           num_processes=10)

    if os.path.exists(opts.check_point):
        logging.info("Loading checkpoint file from {}".format(
            opts.check_point))
        epoch, num_updates, loss = trainer.load_checkpoint(opts.check_point)
    else:
        logging.info("No checkpoint file in found in {}".format(
            opts.check_point))
        epoch, num_updates, loss = 0, 0, 0.0

    test_iter = trainer.get_batch_iterator(test_datasets,
                                           batch_size=opts.batch_size,
                                           shuffle=False)

    video_sim = {}

    with torch.no_grad():
        model.eval()
        criterion.eval()
        for i, samples in tqdm(enumerate(test_iter)):
            if i > 50:
                break
            samples = trainer._prepare_sample(samples)
            video = samples["data"]
            len_video = samples["len_data"]
            label = samples["label"]
            len_label = samples["len_label"]
            video_id = samples['id']

            logits, _, scores1, scores2 = model(video, len_video)
            print(scores1)
            ids = scores1.topk(k=16, dim=-1)[1].sort(-1)[0]  # [bs, t, t]
            bs, t, _ = scores1.size()
            for i in range(bs):
                for j in range(t):
                    select_id = ids[i, j, :].cpu().numpy().tolist()
                    for k in range(t):
                        if k not in select_id:
                            scores1[i, j, k] = 1e-9
            print("scores1: ", scores1)
            scores1 = scores1.softmax(-1)

            mask = scores1 > 0.02
            print(scores1, mask)
            scores1 *= mask.float()
            # sim_matrix = scores1.softmax(-1)
            # print(scores1[0, 0, :20])
            # exit()
            for i in range(len(video_id)):
                video_sim[video_id[i]] = scores1[i].cpu().numpy()
    # print(video_sim)
    with open("Data/output/sim_matrix.pkl", "wb") as f:
        pickle.dump(video_sim, f)
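The triple loop above that keeps only the top-k entries of scores1 can be expressed as one vectorized masking step. A rough equivalent is sketched below; the fill value and k are illustrative, and whether a large negative fill is appropriate before the softmax depends on the actual score range:

import torch

def topk_mask(scores, k, fill=float("-inf")):
    """Keep the top-k entries along the last dim; replace the rest with fill."""
    topk_idx = scores.topk(k, dim=-1).indices
    keep = torch.zeros_like(scores, dtype=torch.bool)
    keep.scatter_(-1, topk_idx, torch.ones_like(topk_idx, dtype=torch.bool))
    return scores.masked_fill(~keep, fill)

scores = torch.randn(2, 5, 5)        # [batch, t, t] similarity scores
masked = topk_mask(scores, k=3)      # entries outside the top-3 become -inf
probs = masked.softmax(dim=-1)       # softmax then ignores the masked entries
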
Exemplo n.º 22
0
def load_or_extract_near_vocab(config_path,
                               model_path,
                               save_to,
                               save_to_full,
                               init_perturb_rate=0,
                               batch_size=50,
                               top_reserve=12,
                               all_with_UNK=False,
                               reload=True,
                               emit_as_id=False):
    """based on the embedding parameter from Encoder, extract near vocabulary for all words
    return: dictionary of vocabulary of near vocabs; and a the saved file
    :param config_path: (string) victim configs (for training data and vocabulary)
    :param model_path: (string) victim model path for trained embeddings
    :param save_to: (string) directory to store distilled near-vocab
    :param save_to_full: (string) directory to store full near-vocab
    :param init_perturb_rate: (float) the weight-adjustment for perturb
    :param batch_size: (integer) extract near vocab by batched cosine/Euclidean-similarity
    :param top_reserve: (integer) at most reserve top-k near candidates
    :param all_with_UNK: during generation, add UNK to all tokens as a candidate
    :param reload: reload from the save_to_path if previous record exists
    :param emit_as_id: (boolean) the key in return will be token ids instead of token
    """
    # load configs
    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)
    data_configs = configs["data_configs"]
    model_configs = configs["model_configs"]
    # load vocabulary file
    src_vocab = Vocabulary(**data_configs["vocabularies"][0])

    # load embedding from model
    emb = nn.Embedding(num_embeddings=src_vocab.max_n_words,
                       embedding_dim=model_configs["d_word_vec"],
                       padding_idx=PAD
                       )
    model_params = torch.load(model_path, map_location="cpu")
    emb.load_state_dict({"weight": model_params["model"]["encoder.embeddings.embeddings.weight"]},
                        strict=True)
    len_mat = torch.sum(emb.weight**2, dim=1)**0.5  # length of the embeddings

    if os.path.exists(save_to) and reload:
        print("load from %s:" % save_to)
        return load_perturb_weight(save_to, src_vocab, emit_as_id)
    else:
        print("collect near candidates for vocabulary")
        avg_dist = 0
        avg_std = []
        counter = 0
        word2p = OrderedDict()
        word2near_vocab = OrderedDict()
        # emit similar-vocabulary files (batched)
        with open(save_to, "w") as similar_vocab, open(save_to_full, "w") as full_similar_vocab:
            # for every vocabulary batch, collect the average Euclidean distance
            for i in range((src_vocab.max_n_words//batch_size)+1):
                if i*batch_size == src_vocab.max_n_words:
                    break

                index = torch.tensor(range(i*batch_size,
                                           min(src_vocab.max_n_words, (i+1)*batch_size),
                                           1))
                # extract embedding data
                slice_emb = emb(index)
                collect_len = torch.mm(len_mat.narrow(0, i * batch_size, min(src_vocab.max_n_words, (i+1)*batch_size)-i*batch_size).unsqueeze(1),
                                       len_mat.unsqueeze(0))
                # rank the top-k nearest vocab by cosine similarity, later filtered by Euclidean distance within a certain range
                similarity = torch.mm(slice_emb,
                                      emb.weight.t()).div(collect_len)
                # get value and index
                topk_index = similarity.topk(top_reserve, dim=1)[1]

                sliceemb = slice_emb.unsqueeze(dim=1).repeat(1, top_reserve, 1)  # [batch_size, top_reserve, dim]
                E_dist = ((emb(topk_index)-sliceemb)**2).sum(dim=-1)**0.5
                # print("avg Euclidean distance:", E_dist)
                avg_dist += E_dist.mean()
                avg_std += [E_dist.std(dim=1).mean()]
                counter += 1
            avg_dist = avg_dist.item() / counter
            # print(avg_dist)  # tensor object
            # print(avg_std)

            # output near candidates to file and return dictionary
            for i in range((src_vocab.max_n_words//batch_size)+1):
                if i*batch_size == src_vocab.max_n_words:
                    break
                index = torch.tensor(range(i*batch_size,
                                           min(src_vocab.max_n_words, (i+1)*batch_size),
                                           1))
                # extract embedding data
                slice_emb = emb(index)
                collect_len = torch.mm(len_mat.narrow(0, i * batch_size, min(src_vocab.max_n_words, (i+1)*batch_size)-i*batch_size).unsqueeze(1),
                                       len_mat.unsqueeze(0))
                # filter top k nearest vocab with cosine-similarity
                similarity = torch.mm(slice_emb,
                                      emb.weight.t()).div(collect_len)
                topk_val, topk_indices = similarity.topk(top_reserve, dim=1)
                # calculate E-dist
                sliceemb = slice_emb.unsqueeze(dim=1).repeat(1, top_reserve, 1)  # [batch_size, 1*topk, dim]
                E_dist = ((emb(topk_indices)-sliceemb)**2).sum(dim=-1)**0.5

                topk_val = E_dist.cpu().detach().numpy()
                topk_indices = topk_indices.cpu().detach().numpy()
                for j in range(topk_val.shape[0]):
                    bingo = 0.
                    src_word_id = j + i*batch_size

                    src_word = src_vocab.id2token(src_word_id)
                    near_vocab = []

                    similar_vocab.write(src_word + "\t")
                    full_similar_vocab.write(src_word + "\t")

                    # there is no candidates for reserved tokens
                    if src_word_id in [PAD, EOS, BOS, UNK]:
                        near_cand_id = src_word_id
                        near_cand = src_vocab.id2token(near_cand_id)

                        full_similar_vocab.write(near_cand + "\t")
                        similar_vocab.write(near_cand + "\t")
                        bingo = 1
                        if emit_as_id:
                            near_vocab += [near_cand_id]
                        else:
                            near_vocab += [near_cand]
                    else:
                        # extract near candidates according to cos-dist within averaged E-dist
                        for k in range(1, topk_val.shape[1]):
                            near_cand_id = topk_indices[j][k]
                            near_cand = src_vocab.id2token(near_cand_id)
                            full_similar_vocab.write(near_cand + "\t")
                            if topk_val[j][k] < avg_dist and (near_cand_id not in [PAD, EOS, BOS]):
                                bingo += 1
                                similar_vocab.write(near_cand + "\t")
                                if emit_as_id:
                                    near_vocab += [near_cand_id]
                                else:
                                    near_vocab += [near_cand]
                        # additionally add UNK as candidates
                        if bingo == 0 or all_with_UNK:
                            last_cand_ids = [UNK]
                            for final_reserve_id in last_cand_ids:
                                last_cand = src_vocab.id2token(final_reserve_id)
                                similar_vocab.write(last_cand + "\t")
                                if emit_as_id:
                                    near_vocab += [final_reserve_id]
                                else:
                                    near_vocab += [last_cand]

                    probability = bingo/(len(src_word)*top_reserve)
                    if init_perturb_rate != 0:
                        probability *= init_perturb_rate
                    similar_vocab.write("\t"+str(probability)+"\n")
                    full_similar_vocab.write("\t"+str(probability)+"\n")
                    if emit_as_id:
                        word2near_vocab[src_word_id] = near_vocab
                        word2p[src_word_id] = probability
                    else:
                        word2near_vocab[src_word] = near_vocab
                        word2p[src_word] = probability
        return word2p, word2near_vocab
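The candidate extraction above amounts to batched cosine-similarity nearest neighbours over the embedding matrix, followed by a Euclidean-distance filter. A compact sketch of just the nearest-neighbour part, using a stand-in embedding table and an illustrative k:

import torch
import torch.nn.functional as F

def nearest_neighbors(emb_weight, k=12, batch_size=50):
    """Return [vocab, k] indices of the k most cosine-similar rows for each row."""
    normed = F.normalize(emb_weight, dim=1)       # unit-length rows
    all_idx = []
    for start in range(0, normed.size(0), batch_size):
        chunk = normed[start:start + batch_size]
        sim = chunk @ normed.t()                  # cosine similarity against all rows
        all_idx.append(sim.topk(k, dim=1).indices)
    return torch.cat(all_idx, dim=0)

emb = torch.randn(1000, 64)                       # stand-in embedding table
neighbors = nearest_neighbors(emb, k=12)          # each row's first hit is usually the word itself
print(neighbors.shape)                            # torch.Size([1000, 12])
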
Exemplo n.º 23
0
def main_2():
    opts = parse_args()
    init_logging(os.path.join(opts.log_dir, '{:s}_log.txt'.format(opts.task)))

    if torch.cuda.is_available():
        torch.cuda.set_device(opts.gpu)
        logging.info("Using GPU!")
        device = "cuda"
    else:
        logging.info("Using CPU!")
        device = "cpu"

    logging.info(opts)

    test_datasets = PhoenixVideo(opts.vocab_file,
                                 opts.corpus_dir,
                                 opts.video_path,
                                 phase=opts.task,
                                 DEBUG=opts.DEBUG)
    vocab_size = test_datasets.vocab.num_words
    blank_id = test_datasets.vocab.word2index['<BLANK>']
    vocabulary = Vocabulary(opts.vocab_file)
    model = DilatedSLRNet(opts,
                          device,
                          vocab_size,
                          vocabulary,
                          dilated_channels=512,
                          num_blocks=5,
                          dilations=[1, 2, 4],
                          dropout=0.0)
    criterion = CtcLoss(opts, blank_id, device, reduction="none")
    trainer = Trainer(opts, model, criterion, vocabulary, vocab_size, blank_id)

    # iterative decoder
    dec_generator = IterativeGenerate(vocabulary, model)

    if os.path.exists(opts.check_point):
        logging.info("Loading checkpoint file from {}".format(
            opts.check_point))
        epoch, num_updates, loss = trainer.load_checkpoint(opts.check_point)
    else:
        logging.info("No checkpoint file in found in {}".format(
            opts.check_point))
        epoch, num_updates, loss = 0, 0, 0.0

    test_iter = trainer.get_batch_iterator(test_datasets,
                                           batch_size=opts.batch_size,
                                           shuffle=False)
    decoded_dict = {}
    with torch.no_grad():
        model.eval()
        criterion.eval()
        val_err, val_correct, val_count = np.zeros([4]), 0, 0
        for samples in tqdm(test_iter):
            samples = trainer._prepare_sample(samples)
            video = samples["data"]
            len_video = samples["len_data"]
            label = samples["label"]
            len_label = samples["len_label"]
            video_id = samples['id']

            hypos = dec_generator.generate_ctcdecode(video, len_video)

            start = 0
            for i, length in enumerate(len_label):
                end = start + length
                ref = label[start:end].tolist()
                # hyp = [x for x in pred_seq[i] if x != 0]
                # hyp = [x[0] for x in groupby(pred_seq[i][0][:out_seq_len[i][0]].tolist())]
                hyp = trainer.post_process_prediction(hypos[i][0]["tokens"])
                # if i == 0:
                #     if len(hyp) == 0:
                #         logging.info("Here hyp is None!!!!")
                #     logging.info("video id: {}".format(video_id[i]))
                #     logging.info("ref: {}".format(" ".join(str(i) for i in ref)))
                #     logging.info("hyp: {}".format(" ".join(str(i) for i in hyp)))
                #
                #     logging.info("\n")
                decoded_dict[video_id[i]] = hyp
                val_correct += int(ref == hyp)
                err = get_wer_delsubins(ref, hyp)
                val_err += np.array(err)
                val_count += 1
                start = end
            assert end == label.size(0)
        logging.info('-' * 50)
        logging.info('Epoch: {:d}, DEV ACC: {:.5f}, {:d}/{:d}'.format(
            epoch, val_correct / val_count, val_correct, val_count))
        logging.info(
            'Epoch: {:d}, DEV WER: {:.5f}, SUB: {:.5f}, INS: {:.5f}, DEL: {:.5f}'
            .format(epoch, val_err[0] / val_count, val_err[1] / val_count,
                    val_err[2] / val_count, val_err[3] / val_count))

        list_str_for_test = []
        for k, v in decoded_dict.items():
            start_time = 0
            for wi in v:
                tl = np.random.random() * 0.1
                list_str_for_test.append('{} 1 {:.3f} {:.3f} {}\n'.format(
                    k, start_time, start_time + tl,
                    test_datasets.vocab.index2word[wi]))
                start_time += tl
        tmp_prefix = str(uuid.uuid1())
        txt_file = '{:s}.txt'.format(tmp_prefix)
        result_file = os.path.join('evaluation_relaxation', txt_file)
        with open(result_file, 'w') as fid:
            fid.writelines(list_str_for_test)
        phoenix_eval_err = get_phoenix_wer(txt_file, opts.task, tmp_prefix)
        logging.info(
            '[Relaxation Evaluation] Epoch: {:d}, DEV WER: {:.5f}, SUB: {:.5f}, INS: {:.5f}, DEL: {:.5f}'
            .format(epoch, phoenix_eval_err[0], phoenix_eval_err[1],
                    phoenix_eval_err[2], phoenix_eval_err[3]))
        return phoenix_eval_err
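get_wer_delsubins is called above but not defined in this snippet; a standard dynamic-programming edit distance that also tracks substitution/insertion/deletion counts would look roughly like the sketch below, where the function name and the return order are assumptions:

def wer_del_sub_ins(ref, hyp):
    """Return (wer, n_sub, n_ins, n_del) between two token sequences."""
    # dp[i][j] = (edit_distance, subs, inss, dels) for ref[:i] vs hyp[:j]
    dp = [[(0, 0, 0, 0)] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(1, len(ref) + 1):
        dp[i][0] = (i, 0, 0, i)                   # delete all reference tokens
    for j in range(1, len(hyp) + 1):
        dp[0][j] = (j, 0, j, 0)                   # insert all hypothesis tokens
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            if ref[i - 1] == hyp[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
                continue
            sub, ins, dele = dp[i - 1][j - 1], dp[i][j - 1], dp[i - 1][j]
            best = min(sub, ins, dele, key=lambda t: t[0])
            if best is sub:
                dp[i][j] = (best[0] + 1, best[1] + 1, best[2], best[3])
            elif best is ins:
                dp[i][j] = (best[0] + 1, best[1], best[2] + 1, best[3])
            else:
                dp[i][j] = (best[0] + 1, best[1], best[2], best[3] + 1)
    dist, subs, inss, dels = dp[len(ref)][len(hyp)]
    return dist / max(len(ref), 1), subs, inss, dels

print(wer_del_sub_ins(["a", "b", "c"], ["a", "x", "c", "d"]))  # one sub + one ins -> WER 2/3
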
Exemplo n.º 24
0
def train(flags):
    """
    flags:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """

    # ================================================================================== #
    # Initialization for training on different devices
    # - CPU/GPU
    # - Single/Distributed
    Constants.USE_GPU = flags.use_gpu

    world_size = 1
    rank = 0
    local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # If not root_rank, close logging
    # else write log of training to file.
    if rank == 0:
        write_log_to_file(
            os.path.join(flags.log_path,
                         "%s.log" % time.strftime("%Y%m%d-%H%M%S")))
    else:
        close_logging()

    # ================================================================================== #
    # Parsing configuration files
    # - Load default settings
    # - Load pre-defined settings
    # - Load user-defined settings

    configs = prepare_configs(flags.config_path, flags.predefined_config)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']

    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data

    INFO('Loading data...')
    timer.tic()

    # Build the source vocabulary
    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    train_bitext_dataset = TextLineDataset(
        data_path=data_configs['train_data'][0],
        vocabulary=vocab_src,
        max_len=data_configs['max_len'][0],
        is_train_dataset=True)

    valid_bitext_dataset = TextLineDataset(
        data_path=data_configs['valid_data'][0],
        vocabulary=vocab_src,
        is_train_dataset=False)

    training_iterator = DataIterator(
        dataset=train_bitext_dataset,
        batch_size=training_configs["batch_size"],
        use_bucket=training_configs['use_bucket'],
        buffer_size=training_configs['buffer_size'],
        batching_func=training_configs['batching_key'],
        world_size=world_size,
        rank=rank)
    valid_iterator = DataIterator(
        dataset=valid_bitext_dataset,
        batch_size=training_configs['valid_batch_size'],
        use_bucket=True,
        buffer_size=100000,
        numbering=True,
        shuffle=False,
        world_size=world_size,
        rank=rank)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We would do steps below one after another
    #     1. build models & criterion
    #     2. move models & criterion to gpu if needed
    #     3. load pre-trained model if needed
    #     4. build optimizer
    #     5. build learning rate scheduler if needed
    #     6. load checkpoints if needed

    # 0. Initial

    model_collections = Collections()

    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(
            os.path.join(flags.saveto, flags.model_name)),
        num_max_keeping=training_configs['num_kept_checkpoints'])
    best_model_prefix = os.path.join(
        flags.saveto, flags.model_name + Constants.MY_BEST_MODEL_SUFFIX)
    best_model_saver = Saver(
        save_prefix=best_model_prefix,
        num_max_keeping=training_configs['num_kept_best_model'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(vocab_size=vocab_src.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            **model_configs)
    INFO(nmt_model)
    # loss function (criterion)
    critic = torch.nn.CrossEntropyLoss(ignore_index=Constants.PAD)
    INFO(critic)

    # 2. Move to GPU
    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model,
                          flags.pretrain_path,
                          exclude_prefix=flags.pretrain_exclude_prefix,
                          device=Constants.CURRENT_DEVICE)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 4. Build optimizer
    INFO('Building Optimizer...')
    optimizer = torch.optim.Adam(nmt_model.parameters(),
                                 lr=optimizer_configs['learning_rate'])

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================================================================== #
    # Prepare training

    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [1])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]
    is_early_stop = model_collections.get_collection("is_early_stop", [
        False,
    ])[-1]

    train_loss_meter = AverageMeter()
    sent_per_sec_meter = TimeMeter()
    tok_per_sec_meter = TimeMeter()

    grad_denom = 0
    train_loss = 0.0
    cum_n_words = 0
    valid_loss = best_valid_loss = float('inf')

    if rank == 0:
        summary_writer = SummaryWriter(log_dir=flags.log_path)
    else:
        summary_writer = None

    sent_per_sec_meter.start()
    tok_per_sec_meter.start()

    INFO('Begin training...')

    while True:

        if summary_writer is not None:
            summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()

        if rank == 0:
            training_progress_bar = tqdm(desc=' - (Epc {}, Upd {}) '.format(
                eidx, uidx),
                                         total=len(training_iterator),
                                         unit="sents")
        else:
            training_progress_bar = None

        for batch in training_iter:
            seqs_x = batch

            batch_size = len(seqs_x)
            cum_n_words = 0.0
            train_loss = 0.0

            try:
                # Prepare data
                grad_denom += batch_size
                x = prepare_data(seqs_x, seqs_y=None, cuda=Constants.USE_GPU)
                nmt_model.train()
                critic.train()
                critic.zero_grad()
                with torch.enable_grad():
                    logits = nmt_model(x[:-1])
                    logits = logits.view(-1, vocab_src.max_n_words)
                    trg = x[1:]
                    trg = trg.view(-1)
                    loss = critic(logits, trg)
                    loss.backward()
                    optimizer.step()
                    valid_token = (trg != Constants.PAD).long().sum().item()
                    cum_n_words += valid_token
                    train_loss += loss.item() * valid_token

            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                else:
                    raise e

            # End of one batch. Several things will be done:
            # - update the progress bar and meters
            # - reset grad_denom and the accumulated loss/word counters, update uidx

            if training_progress_bar is not None:
                training_progress_bar.update(grad_denom)
                training_progress_bar.set_description(
                    ' - (Epc {}, Upd {}) '.format(eidx, uidx))

                postfix_str = 'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f}), '.format(
                    train_loss / cum_n_words, valid_loss, best_valid_loss)
                training_progress_bar.set_postfix_str(postfix_str)

            # 4. update meters
            train_loss_meter.update(train_loss, cum_n_words)
            sent_per_sec_meter.update(grad_denom)
            tok_per_sec_meter.update(cum_n_words)

            # 5. reset accumulated variables, update uidx
            grad_denom = 0
            uidx += 1
            cum_n_words = 0.0
            train_loss = 0.0

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(
                    uidx, eidx, every_n_step=training_configs['disp_freq']):

                if summary_writer is not None:
                    summary_writer.add_scalar(
                        "Speed(sents/sec)",
                        scalar_value=sent_per_sec_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar(
                        "Speed(words/sec)",
                        scalar_value=tok_per_sec_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar(
                        "train_loss",
                        scalar_value=train_loss_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar("oom_count",
                                              scalar_value=oom_count,
                                              global_step=uidx)

                # Reset Meters
                sent_per_sec_meter.reset()
                tok_per_sec_meter.reset()
                train_loss_meter.reset()

            # ================================================================================== #
            # Loss Validation & Learning rate annealing
            if should_trigger_by_steps(
                    global_step=uidx,
                    n_epoch=eidx,
                    every_n_step=training_configs['loss_valid_freq'],
                    min_step=training_configs['bleu_valid_warmup'],
                    debug=flags.debug):

                valid_iter = valid_iterator.build_generator()
                valid_loss = 0
                total_tokens = 0
                for batch in valid_iter:
                    seq_number, seqs_x = batch
                    x = prepare_data(seqs_x,
                                     seqs_y=None,
                                     cuda=Constants.USE_GPU)
                    nmt_model.eval()
                    critic.eval()
                    with torch.no_grad():
                        logits = nmt_model(x[:-1])
                        logits = logits.view(-1, vocab_src.max_n_words)
                        trg = x[1:]
                        valid_token = (trg != Constants.PAD).sum(-1)
                        batch_size, seq_len = trg.shape
                        trg = trg.view(-1)
                        import torch.nn.functional as F
                        loss = F.cross_entropy(logits,
                                               trg,
                                               reduction="none",
                                               ignore_index=vocab_src.pad)
                        loss = loss.view(batch_size, seq_len)
                        # per-sentence loss, summed over valid tokens
                        loss = loss.sum(-1)
                        total_tokens += valid_token.sum().item()
                        valid_loss += loss.sum().item()
                valid_loss = valid_loss / total_tokens
                model_collections.add_to_collection("history_losses",
                                                    valid_loss)

                min_history_loss = np.array(
                    model_collections.get_collection("history_losses")).min()
                best_valid_loss = min_history_loss
                if summary_writer is not None:
                    summary_writer.add_scalar("loss",
                                              valid_loss,
                                              global_step=uidx)
                    summary_writer.add_scalar("best_loss",
                                              min_history_loss,
                                              global_step=uidx)
                # If the model gets a new best valid loss
                if valid_loss <= best_valid_loss:
                    bad_count = 0

                    if is_early_stop is False:
                        if rank == 0:
                            # 1. save the best model
                            torch.save(nmt_model.state_dict(),
                                       best_model_prefix + ".final")

                            # 2. record the several best models
                            best_model_saver.save(
                                global_step=uidx,
                                model=nmt_model,
                                optimizer=optimizer,
                                collections=model_collections)
                else:
                    bad_count += 1

                    # At least one epoch should be traversed
                    if bad_count >= training_configs[
                            'early_stop_patience'] and eidx > 0:
                        is_early_stop = True
                        WARN("Early Stop!")
                        exit(0)

                if summary_writer is not None:
                    summary_writer.add_scalar("bad_count", bad_count, uidx)

                INFO("{0} Loss: {1:.2f}  patience: {2}".format(
                    uidx, valid_loss, bad_count))

            # ================================================================================== #
            # # Saving checkpoints
            # if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['save_freq'], debug=flags.debug):
            #     model_collections.add_to_collection("uidx", uidx)
            #     model_collections.add_to_collection("eidx", eidx)
            #     model_collections.add_to_collection("bad_count", bad_count)
            #
            #     if not is_early_stop:
            #         if rank == 0:
            #             checkpoint_saver.save(global_step=uidx,
            #                                   model=nmt_model,
            #                                   optim=optimizer,
            #                                   collections=model_collections)

        if training_progress_bar is not None:
            training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
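The language-model objective above is plain next-token prediction: feed x[:-1], predict x[1:], and let the cross-entropy ignore padding positions. A tiny self-contained sketch of that target shifting, with a made-up vocabulary size, pad index and toy model:

import torch
import torch.nn as nn
import torch.nn.functional as F

PAD, VOCAB, DIM = 0, 100, 32

class TinyLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(VOCAB, DIM, padding_idx=PAD)
        self.rnn = nn.GRU(DIM, DIM, batch_first=True)
        self.out = nn.Linear(DIM, VOCAB)

    def forward(self, tokens):                  # tokens: [batch, seq_len]
        hidden, _ = self.rnn(self.emb(tokens))
        return self.out(hidden)                 # [batch, seq_len, VOCAB]

model = TinyLM()
x = torch.randint(1, VOCAB, (4, 10))            # toy batch of token ids
inputs, targets = x[:, :-1], x[:, 1:]           # inputs predict the next token
logits = model(inputs)
loss = F.cross_entropy(logits.reshape(-1, VOCAB),
                       targets.reshape(-1),
                       ignore_index=PAD)        # padding contributes nothing
loss.backward()
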
Exemplo n.º 25
0
def main():
    opts = parse_args()
    init_logging(os.path.join(opts.log_dir, '{:s}_log.txt'.format(opts.task)))

    if torch.cuda.is_available():
        torch.cuda.set_device(opts.gpu)
        logging.info("Using GPU!")
        device = "cuda"
    else:
        logging.info("Using CPU!")
        device = "cpu"

    logging.info(opts)

    test_datasets = PhoenixVideo(opts.vocab_file,
                                 opts.corpus_dir,
                                 opts.video_path,
                                 phase="train",
                                 DEBUG=opts.DEBUG,
                                 sample=False)
    vocab_size = test_datasets.vocab.num_words
    blank_id = test_datasets.vocab.word2index['<BLANK>']
    pad_id = test_datasets.vocab.pad()
    vocabulary = Vocabulary(opts.vocab_file)
    # model = DilatedSLRNet(opts, device, vocab_size, vocabulary,
    #                       dilated_channels=512, num_blocks=5, dilations=[1, 2, 4], dropout=0.0)
    model = MainStream(vocab_size)
    criterion = CtcLoss(opts, blank_id, device, reduction="none")
    trainer = Trainer(opts, model, criterion, vocabulary, vocab_size, blank_id)

    # ctcdecode
    ctc_decoder_vocab = [chr(x) for x in range(20000, 20000 + vocab_size)]
    ctc_decoder = ctcdecode.CTCBeamDecoder(ctc_decoder_vocab,
                                           beam_width=opts.beam_width,
                                           blank_id=blank_id,
                                           num_processes=10)

    if os.path.exists(opts.check_point):
        logging.info("Loading checkpoint file from {}".format(
            opts.check_point))
        epoch, num_updates, loss = trainer.load_checkpoint(opts.check_point)
    else:
        logging.info("No checkpoint file in found in {}".format(
            opts.check_point))
        epoch, num_updates, loss = 0, 0, 0.0

    test_iter = trainer.get_batch_iterator(test_datasets,
                                           batch_size=opts.batch_size,
                                           shuffle=False)

    with torch.no_grad():
        model.eval()
        criterion.eval()
        prob_results = {}
        for i, samples in enumerate(test_iter):
            if i > 500:
                break
            samples = trainer._prepare_sample(samples)
            video = samples["data"]
            len_video = samples["len_data"]
            label = samples["label"]
            len_label = samples["len_label"]
            video_id = samples['id']
            dec_label = samples["decoder_label"]
            len_dec_label = samples["len_decoder_label"]

            # print("video: ", video.shape)
            logits, _ = model(video, len_video)
            len_video /= 4
            # print("logits: ", logits.shape)
            # print(len_video)

            params = logits[0, :len_video[0], :].transpose(
                1, 0).detach().cpu().numpy()  # [T, vocab_size]
            seq = dec_label[0, :len_dec_label[0]].cpu().numpy()
            alignment = get_alignment(params,
                                      seq,
                                      blank=blank_id,
                                      is_prob=False)  # [length]
            # print("video_id:", video_id[0])
            # print("gt label:", seq)
            # print("alignment:", alignment)

            probs = logits.softmax(-1)[0]  # [length ,vocab_size]
            align_probs = []
            for i in range(alignment.shape[0]):
                align_probs.append(
                    probs[i, alignment[i]].detach().cpu().numpy().tolist())
            # print(align_probs)
            # exit()
            count = 0
            total_cnt = 0
            for i in range(len(align_probs)):
                total_cnt += 1
                if alignment[i] == blank_id:
                    align_probs[i] = 0
                    count += 1
            print(
                "video_id: {}, and blank count / total count: {}/{} = {:.4f}".
                format(video_id[0], count, total_cnt, count / total_cnt))
            prob_results[video_id[0]] = (align_probs, alignment)
            # print(align_probs)
    return prob_results
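get_alignment above produces a frame-level alignment against the reference label; for a quick sanity check, a greedy CTC best-path decode over the same frame scores just takes the per-frame argmax, collapses repeats and drops blanks. A minimal sketch with illustrative shapes and blank id:

import torch
from itertools import groupby

def ctc_greedy_decode(logits, blank_id=0):
    """logits: [T, vocab_size] -> token ids after collapsing repeats and blanks."""
    best_path = logits.argmax(dim=-1).tolist()              # frame-wise argmax
    collapsed = [token for token, _ in groupby(best_path)]  # merge repeated frames
    return [token for token in collapsed if token != blank_id]

frames = torch.randn(25, 30)                                # 25 frames, 30-symbol vocab
print(ctc_greedy_decode(frames, blank_id=0))
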
Exemplo n.º 26
0
def train(FLAGS):
    """
    FLAGS:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """

    # write log of training to file.
    write_log_to_file(
        os.path.join(FLAGS.log_path,
                     "%s.log" % time.strftime("%Y%m%d-%H%M%S")))

    GlobalNames.USE_GPU = FLAGS.use_gpu

    if GlobalNames.USE_GPU:
        CURRENT_DEVICE = "cuda:0"
    else:
        CURRENT_DEVICE = "cpu"

    config_path = os.path.abspath(FLAGS.config_path)
    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)

    INFO(pretty_configs(configs))

    # Add default configs
    configs = default_configs(configs)
    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    GlobalNames.SEED = training_configs['seed']

    set_seed(GlobalNames.SEED)

    best_model_prefix = os.path.join(
        FLAGS.saveto, FLAGS.model_name + GlobalNames.MY_BEST_MODEL_SUFFIX)

    timer = Timer()

    # ================================================================================== #
    # Load Data

    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][0])

    train_batch_size = training_configs["batch_size"] * max(
        1, training_configs["update_cycle"])
    train_buffer_size = training_configs["buffer_size"] * max(
        1, training_configs["update_cycle"])
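    # With gradient accumulation, one optimizer step effectively covers
    # batch_size * update_cycle sentences; the iterator buffer is scaled
    # the same way.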

    train_bitext_dataset = ZipDataset(TextLineDataset(
        data_path=data_configs['train_data'][0],
        vocabulary=vocab_tgt,
        max_len=data_configs['max_len'][0],
    ),
                                      shuffle=training_configs['shuffle'])

    valid_bitext_dataset = ZipDataset(
        TextLineDataset(
            data_path=data_configs['valid_data'][0],
            vocabulary=vocab_tgt,
        ))

    training_iterator = DataIterator(
        dataset=train_bitext_dataset,
        batch_size=train_batch_size,
        use_bucket=training_configs['use_bucket'],
        buffer_size=train_buffer_size,
        batching_func=training_configs['batching_key'])

    valid_iterator = DataIterator(
        dataset=valid_bitext_dataset,
        batch_size=training_configs['valid_batch_size'],
        use_bucket=True,
        buffer_size=100000,
        numbering=True)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    lrate = optimizer_configs['learning_rate']
    is_early_stop = False

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We do the steps below one after another
    #     1. build models & criterion
    #     2. move models & criterion to gpu if needed
    #     3. load pre-trained model if needed
    #     4. build optimizer
    #     5. build learning rate scheduler if needed
    #     6. load checkpoints if needed

    # 0. Initial
    model_collections = Collections()
    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(
            os.path.join(FLAGS.saveto, FLAGS.model_name)),
        num_max_keeping=training_configs['num_kept_checkpoints'])
    best_model_saver = Saver(
        save_prefix=best_model_prefix,
        num_max_keeping=training_configs['num_kept_best_model'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    lm_model = build_model(n_tgt_vocab=vocab_tgt.max_n_words, **model_configs)
    INFO(lm_model)

    params_total = sum([p.numel() for n, p in lm_model.named_parameters()])
    params_wo_embedding = sum([
        p.numel() for n, p in lm_model.named_parameters()
        if n.find('embedding') == -1
    ])
    INFO('Total parameters: {}'.format(params_total))
    INFO('Total parameters (excluding word embeddings): {}'.format(
        params_wo_embedding))

    critic = NMTCriterion(label_smoothing=model_configs['label_smoothing'])

    INFO(critic)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 2. Move to GPU
    if GlobalNames.USE_GPU:
        lm_model = lm_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    lm_model.init_parameters(FLAGS.pretrain_path, device=CURRENT_DEVICE)

    # 4. Build optimizer
    INFO('Building Optimizer...')
    optim = Optimizer(name=optimizer_configs['optimizer'],
                      model=lm_model,
                      lr=lrate,
                      grad_clip=optimizer_configs['grad_clip'],
                      optim_args=optimizer_configs['optimizer_params'])

    # 5. Build scheduler for optimizer if needed
    if optimizer_configs['schedule_method'] is not None:

        if optimizer_configs['schedule_method'] == "loss":

            scheduler = ReduceOnPlateauScheduler(
                optimizer=optim, **optimizer_configs["scheduler_configs"])

        elif optimizer_configs['schedule_method'] == "noam":
            scheduler = NoamScheduler(optimizer=optim,
                                      **optimizer_configs['scheduler_configs'])
        else:
            WARN(
                "Unknown scheduler name {0}. Do not use lr_scheduling.".format(
                    optimizer_configs['schedule_method']))
            scheduler = None
    else:
        scheduler = None

    # 6. Build moving average

    if training_configs['moving_average_method'] is not None:
        ma = MovingAverage(
            moving_average_method=training_configs['moving_average_method'],
            named_params=lm_model.named_parameters(),
            alpha=training_configs['moving_average_alpha'])
    else:
        ma = None

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # Reload from latest checkpoint
    if FLAGS.reload:
        checkpoint_saver.load_latest(model=lm_model,
                                     optim=optim,
                                     lr_scheduler=scheduler,
                                     collections=model_collections,
                                     ma=ma)

    # ================================================================================== #
    # Prepare training

    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [0])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]

    summary_writer = SummaryWriter(log_dir=FLAGS.log_path)

    cum_samples = 0
    cum_words = 0
    valid_loss = best_valid_loss = float('inf')  # positive infinity
    saving_files = []

    # Timer for computing speed
    timer_for_speed = Timer()
    timer_for_speed.tic()

    INFO('Begin training...')

    while True:
        summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()
        training_progress_bar = tqdm(desc=' - (Epc {}, Upd {}) '.format(
            eidx, uidx),
                                     total=len(training_iterator),
                                     unit="sents")
        for batch in training_iter:

            uidx += 1

            if optimizer_configs[
                    "schedule_method"] is not None and optimizer_configs[
                        "schedule_method"] != "loss":
                scheduler.step(global_step=uidx)

            seqs_y = batch

            n_samples_t = len(seqs_y)
            n_words_t = sum(len(s) for s in seqs_y)

            cum_samples += n_samples_t
            cum_words += n_words_t

            train_loss = 0.
            optim.zero_grad()
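            # Gradient accumulation: split_shard yields update_cycle sub-batches
            # whose gradients are summed before the single optim.step() below.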
            try:
                # Prepare data
                for (seqs_y_t, ) in split_shard(
                        seqs_y, split_size=training_configs['update_cycle']):
                    y = prepare_data(seqs_y_t, cuda=GlobalNames.USE_GPU)

                    loss = compute_forward(
                        model=lm_model,
                        critic=critic,
                        # seqs_x=x,
                        seqs_y=y,
                        eval=False,
                        normalization=n_samples_t,
                        norm_by_words=training_configs["norm_by_words"])
                    train_loss += loss / y.size(
                        1) if not training_configs["norm_by_words"] else loss
                optim.step()

            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                    optim.zero_grad()
                else:
                    raise e

            if ma is not None and eidx >= training_configs[
                    'moving_average_start_epoch']:
                ma.step()

            training_progress_bar.update(n_samples_t)
            training_progress_bar.set_description(
                ' - (Epc {}, Upd {}) '.format(eidx, uidx))
            training_progress_bar.set_postfix_str(
                'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f})'.format(
                    train_loss, valid_loss, best_valid_loss))
            summary_writer.add_scalar("train_loss",
                                      scalar_value=train_loss,
                                      global_step=uidx)

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(
                    uidx, eidx, every_n_step=training_configs['disp_freq']):
                # words per second and sents per second
                words_per_sec = cum_words / (timer.toc(return_seconds=True))
                sents_per_sec = cum_samples / (timer.toc(return_seconds=True))
                lrate = list(optim.get_lrate())[0]

                summary_writer.add_scalar("Speed(words/sec)",
                                          scalar_value=words_per_sec,
                                          global_step=uidx)
                summary_writer.add_scalar("Speed(sents/sen)",
                                          scalar_value=sents_per_sec,
                                          global_step=uidx)
                summary_writer.add_scalar("lrate",
                                          scalar_value=lrate,
                                          global_step=uidx)
                summary_writer.add_scalar("oom_count",
                                          scalar_value=oom_count,
                                          global_step=uidx)

                # Reset timer
                timer.tic()
                cum_words = 0
                cum_samples = 0

            # ================================================================================== #
            # Saving checkpoints
            if should_trigger_by_steps(
                    uidx,
                    eidx,
                    every_n_step=training_configs['save_freq'],
                    debug=FLAGS.debug):
                model_collections.add_to_collection("uidx", uidx)
                model_collections.add_to_collection("eidx", eidx)
                model_collections.add_to_collection("bad_count", bad_count)

                if not is_early_stop:
                    checkpoint_saver.save(global_step=uidx,
                                          model=lm_model,
                                          optim=optim,
                                          lr_scheduler=scheduler,
                                          collections=model_collections,
                                          ma=ma)

            # ================================================================================== #
            # Loss Validation & Learning rate annealing
            if should_trigger_by_steps(
                    global_step=uidx,
                    n_epoch=eidx,
                    every_n_step=training_configs['loss_valid_freq'],
                    debug=FLAGS.debug):
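                # Loss validation: when a moving average is kept, validate with
                # the MA weights (swapped in just below and restored afterwards).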

                if ma is not None:
                    origin_state_dict = deepcopy(lm_model.state_dict())
                    lm_model.load_state_dict(ma.export_ma_params(),
                                             strict=False)

                valid_loss = loss_validation(
                    model=lm_model,
                    critic=critic,
                    valid_iterator=valid_iterator,
                    norm_by_words=training_configs["norm_by_words"])

                model_collections.add_to_collection("history_losses",
                                                    valid_loss)

                min_history_loss = np.array(
                    model_collections.get_collection("history_losses")).min()

                summary_writer.add_scalar("loss", valid_loss, global_step=uidx)
                summary_writer.add_scalar("best_loss",
                                          min_history_loss,
                                          global_step=uidx)

                if ma is not None:
                    lm_model.load_state_dict(origin_state_dict)
                    del origin_state_dict

                if optimizer_configs["schedule_method"] == "loss":
                    scheduler.step(metric=best_valid_loss)

                # If model get new best valid loss
                if valid_loss < best_valid_loss:
                    bad_count = 0

                    if not is_early_stop:
                        # 1. save the best model
                        torch.save(lm_model.state_dict(),
                                   best_model_prefix + ".final")

                        # 2. record the several best models
                        best_model_saver.save(global_step=uidx, model=lm_model)
                else:
                    bad_count += 1

                    # At least one epoch should be traversed
                    if bad_count >= training_configs[
                            'early_stop_patience'] and eidx > 0:
                        is_early_stop = True
                        WARN("Early Stop!")

                best_valid_loss = min_history_loss

                summary_writer.add_scalar("bad_count", bad_count, uidx)

                INFO("{0} Loss: {1:.2f} lrate: {2:6f} patience: {3}".format(
                    uidx, valid_loss, lrate, bad_count))

        training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
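

# A minimal, hypothetical invocation sketch (editor's addition, not part of the
# original snippet). FLAGS may be any object exposing the attributes listed in
# the docstring above; the paths and names below are placeholders.
if __name__ == "__main__":
    from types import SimpleNamespace
    FLAGS = SimpleNamespace(saveto="./save",
                            reload=False,
                            config_path="configs/lm_config.yaml",
                            pretrain_path="",
                            model_name="lm",
                            log_path="./log",
                            use_gpu=False,
                            debug=False)
    train(FLAGS)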
Example no. 27
0
def interactive_FBS(FLAGS):
    patience = FLAGS.try_times
    GlobalNames.USE_GPU = FLAGS.use_gpu
    config_path = os.path.abspath(FLAGS.config_path)

    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']

    timer = Timer()
    #===================================================================================
    # Load data
    INFO('Loading data...')
    timer.tic()

    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][1])

    valid_dataset = TextLineDataset(data_path=FLAGS.source_path,
                                    vocabulary=vocab_src)
    valid_iterator = DataIterator(dataset=valid_dataset,
                                  batch_size=FLAGS.batch_size,
                                  use_bucket=True,
                                  buffer_size=100000,
                                  numbering=True)

    valid_ref = []
    with open(FLAGS.ref_path) as f:
        for sent in f:
            valid_ref.append(vocab_tgt.sent2ids(sent))

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    #===================================================================================
    # Build Model & Sampler & Validation
    INFO('Building model...')
    critic = NMTCriterion(label_smoothing=model_configs['label_smoothing'])

    INFO(critic)

    # 2. Move to GPU
    if GlobalNames.USE_GPU:
        critic = critic.cuda()

    timer.tic()
    fw_nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                               n_tgt_vocab=vocab_tgt.max_n_words,
                               **model_configs)

    #bw_nmt_model = None
    bw_nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                               n_tgt_vocab=vocab_tgt.max_n_words,
                               **model_configs)
    fw_nmt_model.eval()
    bw_nmt_model.eval()

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Reloading model parameters...')
    timer.tic()

    fw_params = load_model_parameters(FLAGS.fw_model_path, map_location="cpu")
    bw_params = load_model_parameters(FLAGS.bw_model_path, map_location="cpu")

    fw_nmt_model.load_state_dict(fw_params)
    bw_nmt_model.load_state_dict(bw_params)

    if GlobalNames.USE_GPU:
        fw_nmt_model.cuda()
        bw_nmt_model.cuda()

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Begin inference...')
    timer.tic()

    result_numbers = []
    result = []
    n_words = 0

    imt_numbers = []
    imt_result = []
    imt_n_words = 0
    imt_constrains = [[] for ii in range(FLAGS.imt_step)]

    infer_progress_bar = tqdm(total=len(valid_iterator),
                              desc=' - (Infer)',
                              unit='sents')

    valid_iter = valid_iterator.build_generator()
    for batch in valid_iter:
        batch_result = []
        batch_numbers = []
        numbers, seqs_x = batch
        batch_size_t = len(seqs_x)

        x = prepare_data(seqs_x=seqs_x, cuda=GlobalNames.USE_GPU)

        with torch.no_grad():
            word_ids = beam_search(nmt_model=fw_nmt_model,
                                   beam_size=FLAGS.beam_size,
                                   max_steps=FLAGS.max_steps,
                                   src_seqs=x,
                                   alpha=FLAGS.alpha)

        word_ids = word_ids.cpu().numpy().tolist()

        for sent_t in word_ids:
            sent_t = [[wid for wid in line if wid != PAD] for line in sent_t]
            result.append(sent_t)
            batch_result.append(sent_t[0])

            n_words += len(sent_t[0])

        result_numbers += numbers
        imt_numbers += numbers
        batch_numbers += numbers
        batch_ref = [valid_ref[ii] for ii in batch_numbers]

        last_sents = copy.deepcopy(batch_result)
        constrains = [[[] for ii in range(patience)]
                      for jj in range(batch_size_t)]
        positions = [[[] for ii in range(patience)]
                     for jj in range(batch_size_t)]
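        # constrains[b][p] and positions[b][p] accumulate, across interactive
        # steps, the constraint tokens sampled from the reference and the
        # positions where they must appear, for sentence b and trial p.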
        for idx in range(FLAGS.imt_step):
            cons, pos = sample_constrains(last_sents, batch_ref, patience)

            for ii in range(batch_size_t):
                for jj in range(patience):
                    constrains[ii][jj].append(cons[ii][jj])
                    positions[ii][jj].append(pos[ii][jj])

            #print(positions)
            imt_constrains[idx].append([vocab_tgt.ids2sent(c) for c in cons])
            bidirection = bool(FLAGS.bidirection)
            with torch.no_grad():
                constrained_word_ids, positions = fixwords_beam_search(
                    fw_nmt_model=fw_nmt_model,
                    bw_nmt_model=bw_nmt_model,
                    beam_size=FLAGS.beam_size,
                    max_steps=FLAGS.max_steps,
                    src_seqs=x,
                    alpha=FLAGS.alpha,
                    constrains=constrains,
                    positions=positions,
                    last_sentences=last_sents,
                    imt_step=idx + 1,
                    bidirection=bidirection)
            constrained_word_ids = constrained_word_ids.cpu().numpy().tolist()
            last_sents = []
            for i, sent_t in enumerate(constrained_word_ids):
                sent_t = [[wid for wid in line if wid != PAD]
                          for line in sent_t]
                if idx == FLAGS.imt_step - 1:
                    imt_result.append(copy.deepcopy(sent_t))
                    imt_n_words += len(sent_t[0])
                samples = []
                for trans in sent_t:
                    sample = []
                    for w in trans:
                        if w == vocab_tgt.EOS:
                            break
                        sample.append(w)
                    samples.append(sample)

                sent_t = []
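                # Keep only the top hypothesis of each beam group (every
                # beam_size-th candidate), then rerank the survivors by
                # sentence-level BLEU against the reference.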
                for ii in range(len(samples)):
                    if ii % FLAGS.beam_size == 0:
                        sent_t.append(samples[ii])
                BLEU = []
                for sample in sent_t:
                    bleu, _ = bleuScore(sample, batch_ref[i])
                    BLEU.append(bleu)

                # print("BLEU: ", BLEU)
                order = np.argsort(BLEU).tolist()
                order = order[::-1]
                # print("order: ", order)
                sent_t = [sent_t[ii] for ii in order]

                last_sents.append(sent_t[0])

            if FLAGS.online_learning and idx == FLAGS.imt_step - 1:
                seqs_y = []
                for sent in last_sents:
                    sent = [BOS] + sent
                    seqs_y.append(sent)
                compute_forward(fw_nmt_model, critic, x,
                                torch.Tensor(seqs_y).long().cuda())
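                # Update the backward model on reversed targets: flip each
                # sequence and reset its endpoints to BOS/EOS before running
                # compute_forward again.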
                seqs_y = [sent[::-1] for sent in seqs_y]
                for ii in range(len(seqs_y)):
                    seqs_y[ii][0] = BOS
                    seqs_y[ii][-1] = EOS
                compute_forward(bw_nmt_model, critic, x,
                                torch.Tensor(seqs_y).long().cuda())

        infer_progress_bar.update(batch_size_t)

    infer_progress_bar.close()
    INFO('Done. Speed: {0:.2f} words/sec'.format(
        n_words / (timer.toc(return_seconds=True))))

    translation = []
    for sent in result:
        samples = []
        for trans in sent:
            sample = []
            for w in trans:
                if w == vocab_tgt.EOS:
                    break
                sample.append(vocab_tgt.id2token(w))
            samples.append(vocab_tgt.tokenizer.detokenize(sample))
        translation.append(samples)

    origin_order = np.argsort(result_numbers).tolist()
    translation = [translation[ii] for ii in origin_order]

    keep_n = FLAGS.beam_size if FLAGS.keep_n <= 0 else min(
        FLAGS.beam_size, FLAGS.keep_n)
    outputs = ['%s.%d' % (FLAGS.saveto, i) for i in range(keep_n)]

    with batch_open(outputs, 'w') as handles:
        for trans in translation:
            for i in range(keep_n):
                if i < len(trans):
                    handles[i].write('%s\n' % trans[i])
                else:
                    handles[i].write('%s\n' % 'eos')

    imt_translation = []
    for sent in imt_result:
        samples = []
        for trans in sent:
            sample = []
            for w in trans:
                if w == vocab_tgt.EOS:
                    break
                sample.append(w)
            samples.append(sample)
        imt_translation.append(samples)

    origin_order = np.argsort(imt_numbers).tolist()
    imt_translation = [imt_translation[ii] for ii in origin_order]
    for idx in range(FLAGS.imt_step):
        imt_constrains[idx] = [
            ' '.join(imt_constrains[idx][ii]) + '\n' for ii in origin_order
        ]

        with open('%s.cons%d' % (FLAGS.saveto, idx), 'w') as f:
            f.writelines(imt_constrains[idx])

    bleu_translation = []
    for idx, sent in enumerate(imt_translation):
        samples = []
        for ii in range(len(sent)):
            if ii % FLAGS.beam_size == 0:
                samples.append(sent[ii])
        BLEU = []
        for sample in samples:
            bleu, _ = bleuScore(sample, valid_ref[idx])
            BLEU.append(bleu)

        #print("BLEU: ", BLEU)
        order = np.argsort(BLEU).tolist()
        order = order[::-1]
        #print("order: ", order)
        samples = [vocab_tgt.ids2sent(samples[ii]) for ii in order]
        bleu_translation.append(samples)

    #keep_n = FLAGS.beam_size*patience if FLAGS.keep_n <= 0 else min(FLAGS.beam_size*patience, FLAGS.keep_n)
    keep_n = patience
    outputs = ['%s.imt%d' % (FLAGS.saveto, i) for i in range(keep_n)]

    with batch_open(outputs, 'w') as handles:
        for trans in bleu_translation:
            for i in range(keep_n):
                if i < len(trans):
                    handles[i].write('%s\n' % trans[i])
                else:
                    handles[i].write('%s\n' % 'eos')
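

# For reference (editor's note): interactive_FBS expects FLAGS to provide at
# least try_times, use_gpu, config_path, source_path, ref_path, batch_size,
# beam_size, max_steps, alpha, imt_step, bidirection, online_learning,
# fw_model_path, bw_model_path, keep_n and saveto, mirroring the attributes
# accessed above.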