示例#1
0
    def prepare_model_and_optimizer(self):
        # Prepare model
        self.config = BertConfig.from_json_file(self.args.config_file)

        # Padding for divisibility by 8
        if self.config.vocab_size % 8 != 0:
            self.config.vocab_size += 8 - (self.config.vocab_size % 8)
        self.model = BertForPreTraining(self.config)
        self.another_model = BertForPreTraining(self.config)

        self.model.to(self.device)
        self.another_model.to(self.device)
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']

        optimizer_grouped_parameters = []
        names = []

        for n, p in param_optimizer:
            if not any(nd in n for nd in no_decay):
                optimizer_grouped_parameters.append({
                    'params': [p],
                    'weight_decay': 0.01,
                    'name': n
                })
                names.append({'params': [n], 'weight_decay': 0.01})
            if any(nd in n for nd in no_decay):
                optimizer_grouped_parameters.append({
                    'params': [p],
                    'weight_decay': 0.00,
                    'name': n
                })
                names.append({'params': [n], 'weight_decay': 0.00})

        if self.args.phase2:
            max_steps = self.args.max_steps
            tmp = max_steps * 10
            r = self.args.phase1_end_step / tmp
            lr = self.args.learning_rate * (1 - r)
        else:
            max_steps = int(self.args.max_steps / 9 * 10)
            lr = self.args.learning_rate
        if self.args.optimizer == "lamb":
            self.optimizer = BertLAMB(optimizer_grouped_parameters,
                                      lr=lr,
                                      warmup=self.args.warmup_proportion
                                      if not self.args.phase2 else -1,
                                      t_total=max_steps)
        elif self.args.optimizer == "adam":
            self.optimizer = BertAdam(optimizer_grouped_parameters,
                                      lr=lr,
                                      warmup=self.args.warmup_proportion
                                      if not self.args.phase2 else -1,
                                      t_total=max_steps)
示例#2
0
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
                                     pytorch_dump_path):
    config_path = os.path.abspath(bert_config_file)
    tf_path = os.path.abspath(tf_checkpoint_path)
    print("Converting TensorFlow checkpoint from {} with config at {}".format(
        tf_path, config_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(n in ["adam_v", "adam_m"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
def convert_tmp_to_pytorch(bert_config_file, pytorch_dump_path):
    import torch
    from modeling import BertConfig, BertForPreTraining
    import pickle

    with open("tmp_names", "rb") as fp:  # Unpickling
        # names = pickle.load(fp, encoding='iso-8859-1')
        names = pickle.load(fp)
    with open("tmp_arrays", "rb") as fp:  # Unpickling
        # arrays = pickle.load(fp, encoding='iso-8859-1')
        arrays = pickle.load(fp)

    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if name[-1] in ["adam_v", "adam_m", 'global_step']:
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if fullmatch(r'[A-Za-z]+_\d+', m_name):
                # if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
示例#4
0
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
                                     pytorch_dump_path):
    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
示例#5
0
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
                                     pytorch_dump_path):

    # 加载模型参数
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))

    # 加载模型
    model = BertForPreTraining(config)

    # 加载检查点参数到模型中,进行处理
    # 但是有一个问题,为什么加不加返回model都能返回???猜测其为内部已进行处理
    load_tf_weights_in_bert(model, tf_checkpoint_path)
    print("Save PyTorch model to {}".format(pytorch_dump_path))

    # 保存pytorch的检查点
    torch.save(model.state_dict(), pytorch_dump_path)
示例#6
0
def main():

    print("IN NEW MAIN XD\n")
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--input_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain .hdf5 files  for the task.")

    parser.add_argument("--config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The BERT model config")

    parser.add_argument(
        "--bert_model",
        default="bert-large-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )

    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--max_predictions_per_seq",
        default=80,
        type=int,
        help="The maximum total of masked tokens in input sequence")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps",
                        default=1000,
                        type=float,
                        help="Total number of training steps to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.01,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0.0,
        help=
        'Loss scaling, positive power of 2 values can improve fp16 convergence.'
    )
    parser.add_argument('--log_freq',
                        type=float,
                        default=50.0,
                        help='frequency of logging loss.')
    parser.add_argument('--checkpoint_activations',
                        default=False,
                        action='store_true',
                        help="Whether to use gradient checkpointing")
    parser.add_argument("--resume_from_checkpoint",
                        default=False,
                        action='store_true',
                        help="Whether to resume training from checkpoint.")
    parser.add_argument('--resume_step',
                        type=int,
                        default=-1,
                        help="Step to resume training from.")
    parser.add_argument(
        '--num_steps_per_checkpoint',
        type=int,
        default=2000,
        help="Number of update steps until a model checkpoint is saved to disk."
    )

    args = parser.parse_args()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    assert (torch.cuda.is_available())

    if args.local_rank == -1:
        device = torch.device("cuda")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    if args.train_batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible"
            .format(args.gradient_accumulation_steps, args.train_batch_size))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if not args.resume_from_checkpoint and os.path.exists(
            args.output_dir) and (os.listdir(args.output_dir) and os.listdir(
                args.output_dir) != ['logfile.txt']):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))

    if not args.resume_from_checkpoint:
        os.makedirs(args.output_dir, exist_ok=True)

    # Prepare model
    config = BertConfig.from_json_file(args.config_file)
    model = BertForPreTraining(config)

    if not args.resume_from_checkpoint:
        global_step = 0
    else:
        if args.resume_step == -1:
            model_names = [
                f for f in os.listdir(args.output_dir) if f.endswith(".pt")
            ]
            args.resume_step = max([
                int(x.split('.pt')[0].split('_')[1].strip())
                for x in model_names
            ])

        global_step = args.resume_step

        checkpoint = torch.load(os.path.join(args.output_dir,
                                             "ckpt_{}.pt".format(global_step)),
                                map_location="cpu")
        model.load_state_dict(checkpoint['model'], strict=False)

        print("resume step from ", args.resume_step)

    model.to(device)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if args.fp16:

        optimizer = FusedAdam(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            #warmup=args.warmup_proportion,
            #t_total=args.max_steps,
            bias_correction=False,
            weight_decay=0.01,
            max_grad_norm=1.0)

        if args.loss_scale == 0:
            # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level="O2",
                                              keep_batchnorm_fp32=False,
                                              loss_scale="dynamic")
        else:
            # optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level="O2",
                                              keep_batchnorm_fp32=False,
                                              loss_scale=args.loss_scale)

        scheduler = LinearWarmUpScheduler(optimizer,
                                          warmup=args.warmup_proportion,
                                          total_steps=args.max_steps)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=args.max_steps)

    if args.resume_from_checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])  # , strict=False)

    if args.local_rank != -1:
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    files = [
        os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
        if os.path.isfile(os.path.join(args.input_dir, f))
    ]
    files.sort()

    num_files = len(files)

    logger.info("***** Running training *****")
    # logger.info("  Num examples = %d", len(train_data))
    logger.info("  Batch size = %d", args.train_batch_size)
    print("  LR = ", args.learning_rate)

    model.train()
    print("Training. . .")

    most_recent_ckpts_paths = []

    print("Training. . .")
    tr_loss = 0.0  # total added training loss
    average_loss = 0.0  # averaged loss every args.log_freq steps
    epoch = 0
    training_steps = 0
    while True:
        if not args.resume_from_checkpoint:
            random.shuffle(files)
            f_start_id = 0
        else:
            f_start_id = checkpoint['files'][0]
            files = checkpoint['files'][1:]
            args.resume_from_checkpoint = False
        for f_id in range(f_start_id, len(files)):
            data_file = files[f_id]
            logger.info("file no %s file %s" % (f_id, data_file))
            train_data = pretraining_dataset(
                input_file=data_file,
                max_pred_length=args.max_predictions_per_seq)

            if args.local_rank == -1:
                train_sampler = RandomSampler(train_data)
                train_dataloader = DataLoader(
                    train_data,
                    sampler=train_sampler,
                    batch_size=args.train_batch_size * n_gpu,
                    num_workers=4,
                    pin_memory=True)
            else:
                train_sampler = DistributedSampler(train_data)
                train_dataloader = DataLoader(train_data,
                                              sampler=train_sampler,
                                              batch_size=args.train_batch_size,
                                              num_workers=4,
                                              pin_memory=True)

            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="File Iteration")):

                training_steps += 1
                batch = [t.to(device) for t in batch]
                input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch  #\
                loss = model(
                    input_ids=input_ids,
                    token_type_ids=segment_ids,
                    attention_mask=input_mask,
                    masked_lm_labels=masked_lm_labels,
                    next_sentence_label=next_sentence_labels,
                    checkpoint_activations=args.checkpoint_activations)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    #   optimizer.backward(loss)
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                tr_loss += loss
                average_loss += loss.item()

                if training_steps % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        scheduler.step()
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if training_steps == 1 * args.gradient_accumulation_steps:
                    logger.info(
                        "Step:{} Average Loss = {} Step Loss = {} LR {}".
                        format(global_step, average_loss, loss.item(),
                               optimizer.param_groups[0]['lr']))

                if training_steps % (args.log_freq *
                                     args.gradient_accumulation_steps) == 0:
                    logger.info(
                        "Step:{} Average Loss = {} Step Loss = {} LR {}".
                        format(global_step, average_loss / args.log_freq,
                               loss.item(), optimizer.param_groups[0]['lr']))
                    average_loss = 0

                if global_step >= args.max_steps or training_steps % (
                        args.num_steps_per_checkpoint *
                        args.gradient_accumulation_steps) == 0:

                    if (not torch.distributed.is_initialized()
                            or (torch.distributed.is_initialized()
                                and torch.distributed.get_rank() == 0)):
                        # Save a trained model
                        logger.info(
                            "** ** * Saving fine - tuned model ** ** * ")
                        model_to_save = model.module if hasattr(
                            model,
                            'module') else model  # Only save the model it-self
                        output_save_file = os.path.join(
                            args.output_dir, "ckpt_{}.pt".format(global_step))

                        torch.save(
                            {
                                'model': model_to_save.state_dict(),
                                'optimizer': optimizer.state_dict(),
                                'files': [f_id] + files
                            }, output_save_file)

                        most_recent_ckpts_paths.append(output_save_file)
                        if len(most_recent_ckpts_paths) > 3:
                            ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
                            os.remove(ckpt_to_be_removed)

                    if global_step >= args.max_steps:
                        tr_loss = tr_loss * args.gradient_accumulation_steps / training_steps
                        if (torch.distributed.is_initialized()):
                            tr_loss /= torch.distributed.get_world_size()
                            torch.distributed.all_reduce(tr_loss)
                        logger.info("Total Steps:{} Final Loss = {}".format(
                            training_steps, tr_loss.item()))
                        return
            del train_dataloader
            del train_sampler
            del train_data
            #for obj in gc.get_objects():
            #  if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
            #    del obj

            torch.cuda.empty_cache()
        epoch += 1
示例#7
0
def inference(args):
    start_t = time.time()
    bert_module = BertForPreTraining(
        args.vocab_size,
        args.seq_length,
        args.hidden_size,
        args.num_hidden_layers,
        args.num_attention_heads,
        args.intermediate_size,
        nn.GELU(),
        args.hidden_dropout_prob,
        args.attention_probs_dropout_prob,
        args.max_position_embeddings,
        args.type_vocab_size,
        args.vocab_size,
    )
    end_t = time.time()
    print("Initialize model using time: {:.3f}s".format(end_t - start_t))

    start_t = time.time()
    if args.use_lazy_model:
        from utils.compare_lazy_outputs import load_params_from_lazy

        load_params_from_lazy(
            bert_module.state_dict(),
            args.model_path,
        )
    else:
        bert_module.load_state_dict(flow.load(args.model_path))
    end_t = time.time()
    print("Loading parameters using time: {:.3f}s".format(end_t - start_t))

    bert_module.eval()
    bert_module.to(args.device)

    class BertEvalGraph(nn.Graph):
        def __init__(self):
            super().__init__()
            self.bert = bert_module

        def build(self, input_ids, input_masks, segment_ids):
            input_ids = input_ids.to(device=args.device)
            input_masks = input_masks.to(device=args.device)
            segment_ids = segment_ids.to(device=args.device)

            with flow.no_grad():
                # 1. forward the next_sentence_prediction and masked_lm model
                _, seq_relationship_scores = self.bert(input_ids, input_masks,
                                                       segment_ids)

            return seq_relationship_scores

    bert_eval_graph = BertEvalGraph()

    start_t = time.time()
    inputs = [np.random.randint(0, 20, size=args.seq_length)]
    inputs = flow.Tensor(inputs,
                         dtype=flow.int64,
                         device=flow.device(args.device))
    mask = flow.cast(inputs > 0, dtype=flow.int64)

    segment_info = flow.zeros_like(inputs)
    prediction = bert_eval_graph(inputs, mask, segment_info)
    print(prediction.numpy())
    end_t = time.time()
    print("Inference using time: {:.3f}".format(end_t - start_t))
示例#8
0
def main():

    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--input_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain .hdf5 files  for the task.")
    parser.add_argument("--config_file",
                        default="bert_config.json",
                        type=str,
                        required=False,
                        help="The BERT model config")
    ckpt_group = parser.add_mutually_exclusive_group(required=True)
    ckpt_group.add_argument("--ckpt_dir",
                            default=None,
                            type=str,
                            help="The ckpt directory, e.g. /results")
    ckpt_group.add_argument("--ckpt_path",
                            default=None,
                            type=str,
                            help="Path to the specific checkpoint")

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--eval', dest='do_eval', action='store_true')
    group.add_argument('--prediction', dest='do_eval', action='store_false')
    ## Other parameters
    parser.add_argument(
        "--bert_model",
        default="bert-large-uncased",
        type=str,
        required=False,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--max_predictions_per_seq",
        default=80,
        type=int,
        help="The maximum total of masked tokens in input sequence")
    parser.add_argument("--ckpt_step",
                        default=-1,
                        type=int,
                        required=False,
                        help="The model checkpoint iteration, e.g. 1000")

    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "Total number of eval  steps to perform, otherwise use full dataset")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--log_path",
                        help="Out file for DLLogger",
                        default="/workspace/dllogger_inference.out",
                        type=str)

    args = parser.parse_args()

    if 'LOCAL_RANK' in os.environ:
        args.local_rank = int(os.environ['LOCAL_RANK'])

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")

    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        assert (args.local_rank != -1
                )  # only use torch.distributed for multi-gpu

    dllogger.log(
        step=
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16),
        data={})

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Prepare model
    config = BertConfig.from_json_file(args.config_file)
    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)
    model = BertForPreTraining(config)

    if args.ckpt_dir:
        if args.ckpt_step == -1:
            #retrieve latest model
            model_names = [
                f for f in os.listdir(args.ckpt_dir) if f.endswith(".pt")
            ]
            args.ckpt_step = max([
                int(x.split('.pt')[0].split('_')[1].strip())
                for x in model_names
            ])
            dllogger.log(step="load model saved at iteration",
                         data={"number": args.ckpt_step})
        model_file = os.path.join(args.ckpt_dir,
                                  "ckpt_" + str(args.ckpt_step) + ".pt")
    else:
        model_file = args.ckpt_path
    state_dict = torch.load(model_file, map_location="cpu")["model"]
    model.load_state_dict(state_dict, strict=False)

    if args.fp16:
        model.half(
        )  # all parameters and buffers are converted to half precision
    model.to(device)

    multi_gpu_training = args.local_rank != -1 and torch.distributed.is_initialized(
    )
    if multi_gpu_training:
        model = DDP(model)

    files = [
        os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
        if os.path.isfile(os.path.join(args.input_dir, f)) and 'test' in f
    ]
    files.sort()

    dllogger.log(step="***** Running Inference *****", data={})
    dllogger.log(step="  Inference batch", data={"size": args.eval_batch_size})

    model.eval()

    nb_instances = 0
    max_steps = args.max_steps if args.max_steps > 0 else np.inf
    global_step = 0
    total_samples = 0

    begin_infer = time.time()
    with torch.no_grad():
        if args.do_eval:
            final_loss = 0.0  #
            for data_file in files:
                dllogger.log(step="Opening ", data={"file": data_file})
                dataset = pretraining_dataset(
                    input_file=data_file,
                    max_pred_length=args.max_predictions_per_seq)
                if not multi_gpu_training:
                    train_sampler = RandomSampler(dataset)
                    datasetloader = DataLoader(dataset,
                                               sampler=train_sampler,
                                               batch_size=args.eval_batch_size,
                                               num_workers=4,
                                               pin_memory=True)
                else:
                    train_sampler = DistributedSampler(dataset)
                    datasetloader = DataLoader(dataset,
                                               sampler=train_sampler,
                                               batch_size=args.eval_batch_size,
                                               num_workers=4,
                                               pin_memory=True)
                for step, batch in enumerate(
                        tqdm(datasetloader, desc="Iteration")):
                    if global_step > max_steps:
                        break
                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch  #\
                    loss = model(input_ids=input_ids,
                                 token_type_ids=segment_ids,
                                 attention_mask=input_mask,
                                 masked_lm_labels=masked_lm_labels,
                                 next_sentence_label=next_sentence_labels)
                    final_loss += loss.item()

                    global_step += 1

                total_samples += len(datasetloader)
                torch.cuda.empty_cache()
                if global_step > max_steps:
                    break
            final_loss /= global_step
            if multi_gpu_training:
                final_loss = torch.tensor(final_loss, device=device)
                dist.all_reduce(final_loss)
                final_loss /= torch.distributed.get_world_size()
            if (not multi_gpu_training or
                (multi_gpu_training and torch.distributed.get_rank() == 0)):
                dllogger.log(step="Inference Loss",
                             data={"final_loss": final_loss.item()})

        else:  # inference
            # if multi_gpu_training:
            #     torch.distributed.barrier()
            # start_t0 = time.time()
            for data_file in files:
                dllogger.log(step="Opening ", data={"file": data_file})
                dataset = pretraining_dataset(
                    input_file=data_file,
                    max_pred_length=args.max_predictions_per_seq)
                if not multi_gpu_training:
                    train_sampler = RandomSampler(dataset)
                    datasetloader = DataLoader(dataset,
                                               sampler=train_sampler,
                                               batch_size=args.eval_batch_size,
                                               num_workers=4,
                                               pin_memory=True)
                else:
                    train_sampler = DistributedSampler(dataset)
                    datasetloader = DataLoader(dataset,
                                               sampler=train_sampler,
                                               batch_size=args.eval_batch_size,
                                               num_workers=4,
                                               pin_memory=True)

                for step, batch in enumerate(
                        tqdm(datasetloader, desc="Iteration")):
                    if global_step > max_steps:
                        break

                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch  #\

                    lm_logits, nsp_logits = model(input_ids=input_ids,
                                                  token_type_ids=segment_ids,
                                                  attention_mask=input_mask,
                                                  masked_lm_labels=None,
                                                  next_sentence_label=None)

                    nb_instances += input_ids.size(0)
                    global_step += 1

                total_samples += len(datasetloader)
                torch.cuda.empty_cache()
                if global_step > max_steps:
                    break
            # if multi_gpu_training:
            #     torch.distributed.barrier()
            if (not multi_gpu_training or
                (multi_gpu_training and torch.distributed.get_rank() == 0)):
                dllogger.log(step="Done Inferring on samples", data={})

    end_infer = time.time()
    dllogger.log(step="Inference perf",
                 data={
                     "inference_sequences_per_second":
                     total_samples * args.eval_batch_size /
                     (end_infer - begin_infer)
                 })
def prepare_model_and_optimizer(args, device):

    # Prepare model
    config = BertConfig.from_json_file(args.config_file)

    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)
    model = BertForPreTraining(config)

    checkpoint = None
    if not args.resume_from_checkpoint:
        global_step = 0
    else:
        if args.resume_step == -1 and not args.init_checkpoint:
            model_names = [
                f for f in os.listdir(args.output_dir) if f.endswith(".pt")
            ]
            args.resume_step = max([
                int(x.split('.pt')[0].split('_')[1].strip())
                for x in model_names
            ])

        global_step = args.resume_step if not args.init_checkpoint else 0

        if not args.init_checkpoint:
            checkpoint = torch.load(os.path.join(
                args.output_dir, "ckpt_{}.pt".format(global_step)),
                                    map_location="cpu")
        else:
            checkpoint = torch.load(args.init_checkpoint, map_location="cpu")

        model.load_state_dict(checkpoint['model'], strict=False)
        if args.phase2:
            global_step -= args.phase1_end_step
        if is_main_process():
            print("resume step from ", args.resume_step)

    model.to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']

    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = FusedLAMB(optimizer_grouped_parameters, lr=args.learning_rate)
    lr_scheduler = PolyWarmUpScheduler(optimizer,
                                       warmup=args.warmup_proportion,
                                       total_steps=args.max_steps)
    if args.fp16:

        if args.loss_scale == 0:
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level="O2",
                                              loss_scale="dynamic")
        else:
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level="O2",
                                              loss_scale=args.loss_scale)
        amp._amp_state.loss_scalers[0]._loss_scale = 2**20

    if args.resume_from_checkpoint:
        if args.phase2 or args.init_checkpoint:
            keys = list(checkpoint['optimizer']['state'].keys())
            #Override hyperparameters from previous checkpoint
            for key in keys:
                checkpoint['optimizer']['state'][key]['step'] = global_step
            for iter, item in enumerate(
                    checkpoint['optimizer']['param_groups']):
                checkpoint['optimizer']['param_groups'][iter][
                    'step'] = global_step
                checkpoint['optimizer']['param_groups'][iter][
                    't_total'] = args.max_steps
                checkpoint['optimizer']['param_groups'][iter][
                    'warmup'] = args.warmup_proportion
                checkpoint['optimizer']['param_groups'][iter][
                    'lr'] = args.learning_rate
        optimizer.load_state_dict(checkpoint['optimizer'])  # , strict=False)

        # Restore AMP master parameters
        if args.fp16:
            optimizer._lazy_init_maybe_master_weights()
            optimizer._amp_stash.lazy_init_called = True
            optimizer.load_state_dict(checkpoint['optimizer'])
            for param, saved_param in zip(amp.master_params(optimizer),
                                          checkpoint['master params']):
                param.data.copy_(saved_param.data)

    if args.local_rank != -1:
        if not args.allreduce_post_accumulation:
            model = DDP(
                model,
                message_size=250000000,
                gradient_predivide_factor=torch.distributed.get_world_size())
        else:
            flat_dist_call([param.data for param in model.parameters()],
                           torch.distributed.broadcast, (0, ))
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    return model, optimizer, lr_scheduler, checkpoint, global_step
示例#10
0
def prepare_model_and_optimizer(args, device):

    # Prepare model
    config = BertConfig.from_json_file(args.config_file)

    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)
    model = BertForPreTraining(config)

    checkpoint = None
    if not args.resume_from_checkpoint:
        global_step = 0
    else:
        if args.resume_step == -1:
            model_names = [
                f for f in os.listdir(args.output_dir) if f.endswith(".pt")
            ]
            args.resume_step = max([
                int(x.split(".pt")[0].split("_")[1].strip())
                for x in model_names
            ])
        global_step = args.resume_step

        checkpoint = torch.load(os.path.join(args.output_dir,
                                             "ckpt_{}.pt".format(global_step)),
                                map_location="cpu")
        model.load_state_dict(checkpoint["model"], strict=False)
        if args.phase2:
            global_step -= args.phase1_end_step
        if is_main_process():
            print("resume step from ", args.resume_step)

    model.to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "gamma", "beta", "LayerNorm"]

    optimizer_grouped_parameters = []
    names = []

    count = 1
    for n, p in param_optimizer:
        count += 1
        if not any(nd in n for nd in no_decay):
            optimizer_grouped_parameters.append({
                "params": [p],
                "weight_decay": 0.01,
                "name": n
            })
            names.append({"params": [n], "weight_decay": 0.01})
        if any(nd in n for nd in no_decay):
            optimizer_grouped_parameters.append({
                "params": [p],
                "weight_decay": 0.00,
                "name": n
            })
            names.append({"params": [n], "weight_decay": 0.00})

    optimizer = BertLAMB(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=args.max_steps)
    if args.fp16:

        if args.loss_scale == 0:
            # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            model, optimizer = amp.initialize(
                model,
                optimizer,
                opt_level="O2",
                loss_scale="dynamic",
                master_weights=False if args.accumulate_into_fp16 else True,
            )
        else:
            # optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            model, optimizer = amp.initialize(
                model,
                optimizer,
                opt_level="O2",
                loss_scale=args.loss_scale,
                master_weights=False if args.accumulate_into_fp16 else True,
            )
        amp._amp_state.loss_scalers[0]._loss_scale = 2**20

    if args.resume_from_checkpoint:
        if args.phase2:
            keys = list(checkpoint["optimizer"]["state"].keys())
            # Override hyperparameters from Phase 1
            for key in keys:
                checkpoint["optimizer"]["state"][key]["step"] = global_step
            for iter, item in enumerate(
                    checkpoint["optimizer"]["param_groups"]):
                checkpoint["optimizer"]["param_groups"][iter][
                    "t_total"] = args.max_steps
                checkpoint["optimizer"]["param_groups"][iter][
                    "warmup"] = args.warmup_proportion
                checkpoint["optimizer"]["param_groups"][iter][
                    "lr"] = args.learning_rate
        optimizer.load_state_dict(checkpoint["optimizer"])  # , strict=False)

        # Restore AMP master parameters
        if args.fp16:
            optimizer._lazy_init_maybe_master_weights()
            optimizer._amp_stash.lazy_init_called = True
            optimizer.load_state_dict(checkpoint["optimizer"])
            for param, saved_param in zip(amp.master_params(optimizer),
                                          checkpoint["master params"]):
                param.data.copy_(saved_param.data)

    if args.local_rank != -1:
        if not args.allreduce_post_accumulation:
            model = DDP(
                model,
                message_size=250000000,
                gradient_predivide_factor=torch.distributed.get_world_size())
        else:
            flat_dist_call([param.data for param in model.parameters()],
                           torch.distributed.broadcast, (0, ))
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    return model, optimizer, checkpoint, global_step
示例#11
0
def main():

    print("IN NEW MAIN XD\n")
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--input_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain .hdf5 files  for the task.")
    parser.add_argument("--config_file",
                        default="bert_config.json",
                        type=str,
                        required=False,
                        help="The BERT model config")
    parser.add_argument("--ckpt_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The ckpt directory, e.g. /results")

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--eval', dest='do_eval', action='store_true')
    group.add_argument('--prediction', dest='do_eval', action='store_false')
    ## Other parameters
    parser.add_argument(
        "--bert_model",
        default="bert-large-uncased",
        type=str,
        required=False,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--max_predictions_per_seq",
        default=80,
        type=int,
        help="The maximum total of masked tokens in input sequence")
    parser.add_argument("--ckpt_step",
                        default=-1,
                        type=int,
                        required=False,
                        help="The model checkpoint iteration, e.g. 1000")

    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "Total number of eval  steps to perform, otherwise use full dataset")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")

    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        assert (args.local_rank != -1
                )  # only use torch.distributed for multi-gpu
    logger.info("device %s n_gpu %d distributed inference %r", device, n_gpu,
                bool(args.local_rank != -1))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Prepare model
    config = BertConfig.from_json_file(args.config_file)
    model = BertForPreTraining(config)

    if args.ckpt_step == -1:
        #retrieve latest model
        model_names = [
            f for f in os.listdir(args.ckpt_dir) if f.endswith(".model")
        ]
        args.ckpt_step = max([
            int(x.split('.model')[0].split('_')[1].strip())
            for x in model_names
        ])
        print("load model saved at iteraton", args.ckpt_step)
    model_file = os.path.join(args.ckpt_dir,
                              "ckpt_" + str(args.ckpt_step) + ".model")
    state_dict = torch.load(model_file, map_location="cpu")
    model.load_state_dict(state_dict, strict=False)

    if args.fp16:
        model.half(
        )  # all parameters and buffers are converted to half precision
    model.to(device)

    multi_gpu_training = args.local_rank != -1 and torch.distributed.is_initialized(
    )
    if multi_gpu_training:
        model = DDP(model)

    files = [
        os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
        if os.path.isfile(os.path.join(args.input_dir, f))
    ]
    files.sort()

    logger.info("***** Running evaluation *****")
    logger.info("  Batch size = %d", args.eval_batch_size)

    model.eval()
    print("Evaluation. . .")

    nb_instances = 0
    max_steps = args.max_steps if args.max_steps > 0 else np.inf
    global_step = 0

    with torch.no_grad():
        if args.do_eval:
            final_loss = 0.0  #
            for data_file in files:
                logger.info("file %s" % (data_file))
                dataset = pretraining_dataset(
                    input_file=data_file,
                    max_pred_length=args.max_predictions_per_seq)
                if not multi_gpu_training:
                    train_sampler = RandomSampler(dataset)
                    datasetloader = DataLoader(dataset,
                                               sampler=train_sampler,
                                               batch_size=args.eval_batch_size,
                                               num_workers=4,
                                               pin_memory=True)
                else:
                    train_sampler = DistributedSampler(dataset)
                    datasetloader = DataLoader(dataset,
                                               sampler=train_sampler,
                                               batch_size=args.eval_batch_size,
                                               num_workers=4,
                                               pin_memory=True)
                for step, batch in enumerate(
                        tqdm(datasetloader, desc="Iteration")):
                    if global_step > max_steps:
                        break

                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch  #\
                    loss = model(input_ids=input_ids,
                                 token_type_ids=segment_ids,
                                 attention_mask=input_mask,
                                 masked_lm_labels=masked_lm_labels,
                                 next_sentence_label=next_sentence_labels)
                    final_loss += loss

                    global_step += 1

                torch.cuda.empty_cache()
                if global_step > max_steps:
                    break
            final_loss /= global_step
            if multi_gpu_training:
                final_loss /= torch.distributed.get_world_size()
                dist.all_reduce(final_loss)
            if (not multi_gpu_training or
                (multi_gpu_training and torch.distributed.get_rank() == 0)):
                logger.info("Finished: Final Loss = {}".format(final_loss))

        else:  # inference
            # if multi_gpu_training:
            #     torch.distributed.barrier()
            # start_t0 = time.time()
            for data_file in files:
                logger.info("file %s" % (data_file))
                dataset = pretraining_dataset(
                    input_file=data_file,
                    max_pred_length=args.max_predictions_per_seq)
                if not multi_gpu_training:
                    train_sampler = RandomSampler(dataset)
                    datasetloader = DataLoader(dataset,
                                               sampler=train_sampler,
                                               batch_size=args.eval_batch_size,
                                               num_workers=4,
                                               pin_memory=True)
                else:
                    train_sampler = DistributedSampler(dataset)
                    datasetloader = DataLoader(dataset,
                                               sampler=train_sampler,
                                               batch_size=args.eval_batch_size,
                                               num_workers=4,
                                               pin_memory=True)
                for step, batch in enumerate(
                        tqdm(datasetloader, desc="Iteration")):
                    if global_step > max_steps:
                        break

                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch  #\

                    lm_logits, nsp_logits = model(input_ids=input_ids,
                                                  token_type_ids=segment_ids,
                                                  attention_mask=input_mask,
                                                  masked_lm_labels=None,
                                                  next_sentence_label=None)

                    nb_instances += input_ids.size(0)

                    global_step += 1
                torch.cuda.empty_cache()
                if global_step > max_steps:
                    break
            # if multi_gpu_training:
            #     torch.distributed.barrier()
            if (not multi_gpu_training or
                (multi_gpu_training and torch.distributed.get_rank() == 0)):
                logger.info("Finished")
def prepare_model_and_optimizer(args, device):
    global_step = 0
    args.resume_step = 0
    checkpoint = None

    config = BertConfig.from_json_file(args.bert_config_path)
    config.fused_mha = args.fused_mha
    config.fused_gelu_bias = args.fused_gelu_bias
    config.dense_seq_output = args.dense_seq_output
    config.unpad = args.unpad
    config.pad = args.pad
    config.fuse_qkv = not args.disable_fuse_qkv
    config.fuse_scale = not args.disable_fuse_scale
    config.fuse_mask = not args.disable_fuse_mask
    config.fuse_dropout = args.enable_fuse_dropout
    config.apex_softmax = not args.disable_apex_softmax
    config.enable_stream = args.enable_stream
    if config.fuse_mask == True: config.apex_softmax = True
    if config.pad == False: config.enable_stream = True
    if config.unpad == True: config.fused_mha = False

    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

    # Load from Pyt checkpoint - either given as init_checkpoint, or picked up from output_dir if found
    if args.init_checkpoint is not None or found_resume_checkpoint(args):
        # Prepare model

        model = BertForPreTraining(config)
        if args.init_checkpoint is None: # finding checkpoint in output_dir
            checkpoint_str = "phase2_ckpt_*.pt" if args.phase2 else "phase1_ckpt_*.pt"
            model_names = [f for f in glob.glob(os.path.join(args.output_dir, checkpoint_str))]
            global_step = max([int(x.split('.pt')[0].split('_')[-1].strip()) for x in model_names])
            args.resume_step = global_step #used for throughput computation

            resume_init_checkpoint = os.path.join(args.output_dir, checkpoint_str.replace("*", str(global_step)))
            print("Setting init checkpoint to %s - which is the latest in %s" %(resume_init_checkpoint, args.output_dir))
            checkpoint=torch.load(resume_init_checkpoint, map_location="cpu")
        else:
            checkpoint=torch.load(args.init_checkpoint, map_location="cpu")["model"]

        # Fused MHA requires a remapping of checkpoint parameters
        if config.fused_mha:
            checkpoint_remapped = remap_attn_parameters(checkpoint)
            model.load_state_dict(checkpoint_remapped, strict=False)
        else:
            model.load_state_dict(checkpoint, strict=True)
    else: #Load from TF Checkpoint
        model = BertForPreTraining.from_pretrained(args.init_tf_checkpoint, from_tf=True, config=config)


    model.to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay_rate},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    mlperf_logger.log_event(key=mlperf_logger.constants.OPT_BASE_LR,
                            value=args.learning_rate, sync=False)
    optimizer = FusedLAMB(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          betas=(args.opt_lamb_beta_1, args.opt_lamb_beta_2))
    mlperf_logger.log_event(key='opt_epsilon', value=optimizer.defaults['eps'],
                            sync=False)
    b1, b2 = optimizer.defaults['betas']
    mlperf_logger.log_event(key='opt_lamb_beta_1', value=b1, sync=False)
    mlperf_logger.log_event(key='opt_lamb_beta_2', value=b2, sync=False)
    mlperf_logger.log_event(key='opt_lamb_weight_decay_rate',
                            value=optimizer.defaults['weight_decay'],
                            sync=False)

    if args.warmup_steps == 0:
        warmup_steps = int(args.max_steps * args.warmup_proportion)
        warmup_start = 0
    else:
        warmup_steps = args.warmup_steps
        warmup_start = args.start_warmup_step
    lr_scheduler = LinearWarmupPolyDecayScheduler(optimizer, start_warmup_steps=warmup_start, warmup_steps=warmup_steps,
                                                  total_steps=args.max_steps, end_learning_rate=0.0, degree=1.0)
    
                           
    if args.fp16:

        if args.loss_scale == 0:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic")
        else:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=args.loss_scale)
        amp._amp_state.loss_scalers[0]._loss_scale = float(os.getenv("INIT_LOSS_SCALE", 2**20))


    if found_resume_checkpoint(args):
        optimizer.load_state_dict(checkpoint['optimizer']) #restores m,v states (only if resuming checkpoint, not for init_checkpoint and init_tf_checkpoint for now)

        # Restore AMP master parameters          
        if args.fp16:
            optimizer._lazy_init_maybe_master_weights()
            optimizer._amp_stash.lazy_init_called = True
            optimizer.load_state_dict(checkpoint['optimizer'])
            for param, saved_param in zip(amp.master_params(optimizer), checkpoint['master params']):
                param.data.copy_(saved_param.data)

    if args.local_rank != -1:
        if not args.allreduce_post_accumulation:
            model = DDP(model, message_size=250000000, gradient_predivide_factor=torch.distributed.get_world_size())
        else:
            flat_dist_call([param.data for param in model.parameters()], torch.distributed.broadcast, (0,) )

    return model, optimizer, lr_scheduler, checkpoint, global_step
示例#13
0
print(f'input_ids: {input_ids.shape}')
print(f'segment_ids: {segment_ids.shape}')
print(f'input_mask: {input_mask.shape}')
print(f'masked_lm_labels: {masked_lm_labels.shape}')
print(f'next_sentence_labels: {next_sentence_labels.shape}')

# Load model
config = BertConfig.from_json_file(args.config_file)

# We skip padding for consistency with the HuggingFace repository
# if config.vocab_size % 8 != 0:
#     config.vocab_size += 8 - (config.vocab_size % 8)

# noinspection PyUnresolvedReferences
model = BertForPreTraining(config).cuda()

loss = model(input_ids=input_ids,
             token_type_ids=segment_ids,
             attention_mask=input_mask,
             masked_lm_labels=masked_lm_labels,
             next_sentence_label=next_sentence_labels,
             checkpoint_activations=args.checkpoint_activations)

flops, params = clever_format(
    profile(
        model,
        inputs=batch,
        custom_ops={
            Embedding: None,  # TODO: custom operator: Embedding
            FusedLayerNorm: None,  # TODO: custom operator: FusedLayerNorm
示例#14
0
def main():
    args = get_config()

    world_size = flow.env.get_world_size()
    if args.train_global_batch_size is None:
        args.train_global_batch_size = args.train_batch_size * world_size
    else:
        assert args.train_global_batch_size % args.train_batch_size == 0

    if args.val_global_batch_size is None:
        args.val_global_batch_size = args.val_batch_size * world_size
    else:
        assert args.val_global_batch_size % args.val_batch_size == 0

    flow.boxing.nccl.set_fusion_threshold_mbytes(args.nccl_fusion_threshold_mb)
    flow.boxing.nccl.set_fusion_max_ops_num(args.nccl_fusion_max_ops)

    if args.with_cuda:
        device = "cuda"
    else:
        device = "cpu"

    print("Device is: ", device)

    print("Creating Dataloader")
    train_data_loader = OfRecordDataLoader(
        ofrecord_dir=args.ofrecord_path,
        mode="train",
        dataset_size=args.train_dataset_size,
        batch_size=args.train_global_batch_size,
        data_part_num=args.train_data_part,
        seq_length=args.seq_length,
        max_predictions_per_seq=args.max_predictions_per_seq,
        consistent=args.use_consistent,
    )

    test_data_loader = OfRecordDataLoader(
        ofrecord_dir=args.ofrecord_path,
        mode="test",
        dataset_size=1024,
        batch_size=args.val_global_batch_size,
        data_part_num=4,
        seq_length=args.seq_length,
        max_predictions_per_seq=args.max_predictions_per_seq,
        consistent=args.use_consistent,
    )

    print("Building BERT Model")
    hidden_size = 64 * args.num_attention_heads
    intermediate_size = 4 * hidden_size
    bert_model = BertForPreTraining(
        args.vocab_size,
        args.seq_length,
        hidden_size,
        args.num_hidden_layers,
        args.num_attention_heads,
        intermediate_size,
        nn.GELU(),
        args.hidden_dropout_prob,
        args.attention_probs_dropout_prob,
        args.max_position_embeddings,
        args.type_vocab_size,
    )

    # Load the same initial parameters with lazy model.
    # from utils.compare_lazy_outputs import load_params_from_lazy
    # load_params_from_lazy(
    #     bert_model.state_dict(),
    #     "../../OneFlow-Benchmark/LanguageModeling/BERT/initial_model",
    # )

    assert id(bert_model.cls.predictions.decoder.weight) == id(
        bert_model.bert.embeddings.word_embeddings.weight
    )

    ns_criterion = nn.CrossEntropyLoss(reduction="mean")
    mlm_criterion = nn.CrossEntropyLoss(reduction="none")

    if args.use_consistent:
        placement = flow.env.all_device_placement("cuda")
        bert_model = bert_model.to_consistent(
            placement=placement, sbp=flow.sbp.broadcast
        )
    else:
        bert_model.to(device)
        ns_criterion.to(device)
        mlm_criterion.to(device)

    optimizer = build_optimizer(
        args.optim_name,
        bert_model,
        args.lr,
        args.weight_decay,
        weight_decay_excludes=["bias", "LayerNorm", "layer_norm"],
        clip_grad_max_norm=1,
        clip_grad_norm_type=2.0,
    )

    steps = args.epochs * len(train_data_loader)
    warmup_steps = int(steps * args.warmup_proportion)

    lr_scheduler = PolynomialLR(optimizer, steps=steps, end_learning_rate=0.0)

    lr_scheduler = flow.optim.lr_scheduler.WarmUpLR(
        lr_scheduler, warmup_factor=0, warmup_iters=warmup_steps, warmup_method="linear"
    )

    def get_masked_lm_loss(
        logit, masked_lm_labels, label_weights, max_predictions_per_seq,
    ):

        label_id = flow.reshape(masked_lm_labels, [-1])

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        pre_example_loss = mlm_criterion(logit, label_id)
        pre_example_loss = flow.reshape(pre_example_loss, [-1, max_predictions_per_seq])
        numerator = flow.sum(pre_example_loss * label_weights)
        denominator = flow.sum(label_weights) + 1e-5
        loss = numerator / denominator
        return loss

    class BertGraph(nn.Graph):
        def __init__(self):
            super().__init__()
            self.bert = bert_model
            self.ns_criterion = ns_criterion
            self.masked_lm_criterion = partial(
                get_masked_lm_loss, max_predictions_per_seq=args.max_predictions_per_seq
            )
            self.add_optimizer(optimizer, lr_sch=lr_scheduler)
            self._train_data_loader = train_data_loader
            if args.grad_acc_steps > 1:
                self.config.set_gradient_accumulation_steps(args.grad_acc_steps)
            if args.use_fp16:
                self.config.enable_amp(True)
                grad_scaler = flow.amp.GradScaler(
                    init_scale=2 ** 30,
                    growth_factor=2.0,
                    backoff_factor=0.5,
                    growth_interval=2000,
                )
                self.set_grad_scaler(grad_scaler)
            self.config.allow_fuse_add_to_output(True)
            self.config.allow_fuse_model_update_ops(True)

        def build(self):

            (
                input_ids,
                next_sentence_labels,
                input_mask,
                segment_ids,
                masked_lm_ids,
                masked_lm_positions,
                masked_lm_weights,
            ) = self._train_data_loader()
            input_ids = input_ids.to(device=device)
            input_mask = input_mask.to(device=device)
            segment_ids = segment_ids.to(device=device)
            next_sentence_labels = next_sentence_labels.to(device=device)
            masked_lm_ids = masked_lm_ids.to(device=device)
            masked_lm_positions = masked_lm_positions.to(device=device)
            masked_lm_weights = masked_lm_weights.to(device=device)

            # 1. forward the next_sentence_prediction and masked_lm model
            prediction_scores, seq_relationship_scores = self.bert(
                input_ids, segment_ids, input_mask, masked_lm_positions
            )

            # 2-1. loss of is_next classification result
            next_sentence_loss = self.ns_criterion(
                seq_relationship_scores.reshape(-1, 2), next_sentence_labels.reshape(-1)
            )

            masked_lm_loss = self.masked_lm_criterion(
                prediction_scores, masked_lm_ids, masked_lm_weights
            )

            total_loss = masked_lm_loss + next_sentence_loss

            total_loss.backward()
            return (
                seq_relationship_scores,
                next_sentence_labels,
                total_loss,
                masked_lm_loss,
                next_sentence_loss,
            )

    bert_graph = BertGraph()

    class BertEvalGraph(nn.Graph):
        def __init__(self):
            super().__init__()
            self.bert = bert_model
            self._test_data_loader = test_data_loader
            self.config.allow_fuse_add_to_output(True)

        def build(self):
            (
                input_ids,
                next_sent_labels,
                input_masks,
                segment_ids,
                masked_lm_ids,
                masked_lm_positions,
                masked_lm_weights,
            ) = self._test_data_loader()
            input_ids = input_ids.to(device=device)
            input_masks = input_masks.to(device=device)
            segment_ids = segment_ids.to(device=device)
            next_sent_labels = next_sent_labels.to(device=device)
            masked_lm_ids = masked_lm_ids.to(device=device)
            masked_lm_positions = masked_lm_positions.to(device)

            with flow.no_grad():
                # 1. forward the next_sentence_prediction and masked_lm model
                _, seq_relationship_scores = self.bert(
                    input_ids, input_masks, segment_ids
                )

            return seq_relationship_scores, next_sent_labels

    bert_eval_graph = BertEvalGraph()

    train_total_losses = []

    for epoch in range(args.epochs):
        metric = Metric(
            desc="bert pretrain",
            print_steps=args.loss_print_every_n_iters,
            batch_size=args.train_global_batch_size * args.grad_acc_steps,
            keys=["total_loss", "mlm_loss", "nsp_loss", "pred_acc"],
        )

        # Train
        bert_model.train()

        for step in range(len(train_data_loader)):
            bert_outputs = pretrain(bert_graph, args.metric_local)

            if flow.env.get_rank() == 0:
                metric.metric_cb(step, epoch=epoch)(bert_outputs)

            train_total_losses.append(bert_outputs["total_loss"])

    # Eval
    bert_model.eval()
    val_acc = validation(
        epoch,
        len(test_data_loader),
        bert_eval_graph,
        args.val_print_every_n_iters,
        args.metric_local,
    )

    save_model(bert_model, args.checkpoint_path, epoch, val_acc, args.use_consistent)
示例#15
0
def main():

    args = get_config()

    if args.with_cuda:
        device = flow.device("cuda")
    else:
        device = flow.device("cpu")

    print("Creating Dataloader")
    train_data_loader = OfRecordDataLoader(
        ofrecord_dir=args.ofrecord_path,
        mode="train",
        dataset_size=args.train_dataset_size,
        batch_size=args.train_batch_size,
        data_part_num=args.train_data_part,
        seq_length=args.seq_length,
        max_predictions_per_seq=args.max_predictions_per_seq,
        consistent=False,
    )

    test_data_loader = OfRecordDataLoader(
        ofrecord_dir=args.ofrecord_path,
        mode="test",
        dataset_size=1024,
        batch_size=args.val_batch_size,
        data_part_num=4,
        seq_length=args.seq_length,
        max_predictions_per_seq=args.max_predictions_per_seq,
        consistent=False,
    )

    print("Building BERT Model")
    hidden_size = 64 * args.num_attention_heads
    intermediate_size = 4 * hidden_size
    bert_model = BertForPreTraining(
        args.vocab_size,
        args.seq_length,
        hidden_size,
        args.num_hidden_layers,
        args.num_attention_heads,
        intermediate_size,
        nn.GELU(),
        args.hidden_dropout_prob,
        args.attention_probs_dropout_prob,
        args.max_position_embeddings,
        args.type_vocab_size,
    )

    # Load the same initial parameters with lazy model.
    # from utils.compare_lazy_outputs import load_params_from_lazy
    # load_params_from_lazy(
    #     bert_model.state_dict(),
    #     "../../OneFlow-Benchmark/LanguageModeling/BERT/initial_model",
    # )

    bert_model = bert_model.to(device)
    if args.use_ddp:
        bert_model = ddp(bert_model)

    optimizer = build_optimizer(
        args.optim_name,
        bert_model,
        args.lr,
        args.weight_decay,
        weight_decay_excludes=["bias", "LayerNorm", "layer_norm"],
        clip_grad_max_norm=1,
        clip_grad_norm_type=2.0,
    )

    steps = args.epochs * len(train_data_loader)
    warmup_steps = int(steps * args.warmup_proportion)

    lr_scheduler = PolynomialLR(optimizer, steps=steps, end_learning_rate=0.0)

    lr_scheduler = flow.optim.lr_scheduler.WarmUpLR(lr_scheduler,
                                                    warmup_factor=0,
                                                    warmup_iters=warmup_steps,
                                                    warmup_method="linear")

    ns_criterion = nn.CrossEntropyLoss(reduction="mean")
    mlm_criterion = nn.CrossEntropyLoss(reduction="none")

    def get_masked_lm_loss(
        logit_blob,
        masked_lm_positions,
        masked_lm_labels,
        label_weights,
        max_prediction_per_seq,
    ):
        # gather valid position indices
        logit_blob = flow.gather(
            logit_blob,
            index=masked_lm_positions.unsqueeze(2).repeat(
                1, 1, args.vocab_size),
            dim=1,
        )

        logit_blob = flow.reshape(logit_blob, [-1, args.vocab_size])
        label_id_blob = flow.reshape(masked_lm_labels, [-1])

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        pre_example_loss = mlm_criterion(logit_blob, label_id_blob)
        pre_example_loss = flow.reshape(pre_example_loss,
                                        [-1, max_prediction_per_seq])
        numerator = flow.sum(pre_example_loss * label_weights)
        denominator = flow.sum(label_weights) + 1e-5
        loss = numerator / denominator
        return loss

    train_total_losses = []
    for epoch in range(args.epochs):
        metric = Metric(
            desc="bert pretrain",
            print_steps=args.loss_print_every_n_iters,
            batch_size=args.train_batch_size,
            keys=["total_loss", "mlm_loss", "nsp_loss", "pred_acc"],
        )

        # Train
        bert_model.train()

        for step in range(len(train_data_loader)):
            bert_outputs = pretrain(
                train_data_loader,
                bert_model,
                ns_criterion,
                partial(
                    get_masked_lm_loss,
                    max_prediction_per_seq=args.max_predictions_per_seq,
                ),
                optimizer,
                lr_scheduler,
            )

            if flow.env.get_rank() == 0:
                metric.metric_cb(step, epoch=epoch)(bert_outputs)

            train_total_losses.append(bert_outputs["total_loss"])

        # Eval
        bert_model.eval()
        val_acc = validation(epoch, test_data_loader, bert_model,
                             args.val_print_every_n_iters)

        save_model(bert_model, args.checkpoint_path, epoch, val_acc, False)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--bert_model",
        default='bert-base-multilingual-cased',
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    # parser.add_argument("--do_eval",
    #                     action='store_true',
    #                     help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=2,
                        type=int,
                        help="Total batch size for training.")
    #     parser.add_argument("--eval_batch_size",
    #                         default=2,
    #                         type=int,
    #                         help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on GPUs")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--visdom',
                        action='store_true',
                        help='Use visdom for loss visualization')
    parser.add_argument('--check_saved_model',
                        action='store_true',
                        help='Use visdom for loss visualization')
    parser.add_argument('--last_final_epoch',
                        type=int,
                        default=-1,
                        help="저번에 이미 최종 학습을 했고, 이에 이어서 트레이닝을 원할때 사용,\n"
                        "기존에 train_epoch를 3으로 세팅했다면, 2가 아닌 3을 입력하세요.")

    args = parser.parse_args()
    print(args)

    if args.visdom:
        import visdom
        viz = visdom.Visdom()
        # visdom을 통해서 loss를 시각화

    os.makedirs(args.output_dir, exist_ok=True)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=False)

    processor = DataProcessor()
    label_list = processor.get_labels()

    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.data_dir)

        train_examples = processor.get_train_examples(args.data_dir)
        train_dataset = LazyDataset(train_examples, args.max_seq_length,
                                    tokenizer)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            train_sampler = DistributedSampler(train_dataset)

        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    loaded_epoch = -1
    saved_model_path = -1

    if args.last_final_epoch != -1:
        last_model = os.path.join(args.output_dir, WEIGHTS_NAME)
        if os.path.exists(last_model):
            saved_model_path = last_model
            loaded_epoch = args.last_final_epoch - 1

    elif args.check_saved_model:
        for epoch in range(int(args.num_train_epochs)):
            tmp = os.path.join(args.output_dir,
                               (f"weight_on_ep{epoch}_" + WEIGHTS_NAME))
            if os.path.exists(tmp):
                saved_model_path = tmp
                loaded_epoch = epoch

    if saved_model_path != -1:
        logger.info(f"Loading on saved model {saved_model_path}")
        config_file = os.path.join(args.output_dir, CONFIG_NAME)
        config = BertConfig(config_file)
        logger.info("Model config {}".format(config))
        model = BertForPreTraining(config)
        model.load_state_dict(torch.load(saved_model_path))
    else:
        loaded_epoch = -1
        model = BertForPreTraining.from_pretrained(args.bert_model)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    if args.visdom:
        # 일단 visdom 기본 figure를 정의
        vis_title = f'Baseline on {len(train_dataset)} dataset'
        vis_legend = ['LM Loss', 'Click Loss', 'Total Loss']
        iter_plot = create_vis_plot(viz, 'Iteration', 'Loss', vis_title,
                                    vis_legend)
        epoch_plot = create_vis_plot(viz, 'Epoch', 'Loss', vis_title,
                                     vis_legend)

    # if args.do_eval:
    #     eval_examples = processor.get_dev_examples(args.data_dir)
    #
    #     logger.info("***** Running evaluation *****")
    #     logger.info("  Num examples = %d", len(eval_examples))
    #     logger.info("  Batch size = %d", args.eval_batch_size)
    #
    #     eval_data = LazyDatasetClassifier(eval_examples, label_list, args.max_seq_length, tokenizer)
    #     # Run prediction for full data
    #     """
    #     cur_tensors = (torch.tensor(f.input_ids),
    #            torch.tensor(f.input_mask),
    #            torch.tensor(f.segment_ids),
    #            torch.tensor(f.lm_label_ids),
    #            torch.tensor(f.label))
    #     """
    #     eval_sampler = SequentialSampler(eval_data)
    #     eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
    #     save_eval_loss = []

    global_step = 0
    if args.do_train:

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        """
        cur_tensors = (torch.tensor(f.input_ids),
               torch.tensor(f.input_mask),
               torch.tensor(f.segment_ids),
               torch.tensor(f.lm_label_ids),
               torch.tensor(f.label))
        """

        save_loss = []
        save_epoch_loss = []
        save_step = int(len(train_dataloader) // 5)

        for epoch in trange((loaded_epoch + 1),
                            int(args.num_train_epochs),
                            desc="Epoch"):

            #     if args.do_eval and loaded_epoch != -1:
            #         model.eval()
            #         eval_loss, eval_accuracy = 0, 0
            #         nb_eval_steps, nb_eval_examples = 0, 0
            #
            #         for batch in tqdm(eval_dataloader, desc="Evaluating"):
            #             batch = tuple(t.to(device) for t in batch)
            #             input_ids, input_mask, segment_ids, label_ids = batch
            #
            #             with torch.no_grad():
            #                 tmp_eval_loss = model(input_ids, segment_ids, input_mask, None, label_ids)
            #                 prediction_scores, logits = model(input_ids, segment_ids, input_mask)
            #
            #             if n_gpu > 1:
            #                 tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu.
            #
            #             logits = logits.detach().cpu().numpy()
            #             label_ids = label_ids.to('cpu').numpy()
            #             tmp_eval_accuracy = accuracy(logits, label_ids)
            #
            #             eval_loss += tmp_eval_loss.mean().item()
            #             eval_accuracy += tmp_eval_accuracy
            #
            #             nb_eval_examples += input_ids.size(0)
            #             nb_eval_steps += 1
            #
            #         eval_loss = eval_loss / nb_eval_steps
            #         eval_accuracy = eval_accuracy / nb_eval_examples
            #         result = {'eval_loss': eval_loss,
            #                   'eval_accuracy': eval_accuracy,
            #                   'global_step': global_step}
            #
            #         save_eval_loss.append(eval_loss)
            #
            #         output_eval_file = os.path.join(args.output_dir, f"Epoch_{epoch}_eval_results.txt")
            #         with open(output_eval_file, "w") as writer:
            #             logger.info(f"***** Eval results on Epoch {epoch} *****")
            #             for key in sorted(result.keys()):
            #                 logger.info("  %s = %s", key, str(result[key]))
            #                 writer.write("%s = %s\n" % (key, str(result[key])))

            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            tr_loss_ml = 0
            tr_loss_click = 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, label = batch
                # if global_step == 0:
                #     print(input_ids.shape, input_mask.shape, segment_ids.shape, lm_label_ids.shape, label.shape)
                loss, loss_ml, loss_click = model(input_ids, segment_ids,
                                                  input_mask, lm_label_ids,
                                                  label)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                    loss_ml = loss_ml.mean()
                    loss_click = loss_click.mean()

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                    loss_ml = loss_ml / args.gradient_accumulation_steps
                    loss_click = loss_click / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                tr_loss_ml += loss_ml.item()
                tr_loss_click += loss_click.item()

                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if global_step != 0 and global_step % save_step == 0:
                    # 한 에포치당 5번 저장
                    logger.info(f'Saving state, iter: {global_step}')
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    # Only save the model it-self
                    model_name = f"weight_on_{global_step}_" + WEIGHTS_NAME
                    output_model_file = os.path.join(args.output_dir,
                                                     model_name)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    output_config_file = os.path.join(args.output_dir,
                                                      CONFIG_NAME)
                    with open(output_config_file, 'w') as f:
                        f.write(model_to_save.config.to_json_string())
                    print("Loss at ", global_step, loss_ml.item(),
                          loss_click.item(), loss.item())

                save_loss.append(
                    [loss_ml.item(),
                     loss_click.item(),
                     loss.item()])

                if args.visdom:
                    update_vis_plot(viz, global_step, loss_ml.item(),
                                    loss_click.item(), iter_plot, epoch_plot,
                                    'append')

            if epoch != (int(args.num_train_epochs) - 1):
                # 각 에포치가 끝날때 마다 저장
                logger.info(f'Saving state, epoch: {epoch}')
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                # Only save the model it-self
                model_name = f"weight_on_ep{epoch}_" + WEIGHTS_NAME
                output_model_file = os.path.join(args.output_dir, model_name)
                torch.save(model_to_save.state_dict(), output_model_file)
                output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                with open(output_config_file, 'w') as f:
                    f.write(model_to_save.config.to_json_string())
                print("Loss at epoch", epoch, tr_loss_ml, tr_loss_click,
                      tr_loss)

            save_epoch_loss.append([tr_loss_ml, tr_loss_click, tr_loss])
            if args.visdom:
                update_vis_plot(viz, epoch, tr_loss_ml, tr_loss_click,
                                epoch_plot, None, 'append',
                                len(train_dataset) // args.train_batch_size)

            # if args.do_eval and loaded_epoch == -1:
            #
            #     model.eval()
            #     eval_loss, eval_accuracy = 0, 0
            #     nb_eval_steps, nb_eval_examples = 0, 0
            #
            #     for batch in tqdm(eval_dataloader, desc="Evaluating"):
            #         batch = tuple(t.to(device) for t in batch)
            #         input_ids, input_mask, segment_ids, label_ids = batch
            #
            #         with torch.no_grad():
            #             tmp_eval_loss = model(input_ids, segment_ids, input_mask, None, label_ids)
            #             prediction_scores, logits = model(input_ids, segment_ids, input_mask)
            #
            #         if n_gpu > 1:
            #             tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu.
            #
            #         logits = logits.detach().cpu().numpy()
            #         label_ids = label_ids.to('cpu').numpy()
            #         tmp_eval_accuracy = accuracy(logits, label_ids)
            #
            #         eval_loss += tmp_eval_loss.mean().item()
            #         eval_accuracy += tmp_eval_accuracy
            #
            #         nb_eval_examples += input_ids.size(0)
            #         nb_eval_steps += 1
            #
            #     eval_loss = eval_loss / nb_eval_steps
            #     eval_accuracy = eval_accuracy / nb_eval_examples
            #     result = {'eval_loss': eval_loss,
            #               'eval_accuracy': eval_accuracy,
            #               'global_step': global_step}
            #
            #     save_eval_loss.append(eval_loss)
            #
            #     output_eval_file = os.path.join(args.output_dir, f"Epoch_{epoch}_eval_results.txt")
            #     with open(output_eval_file, "w") as writer:
            #         logger.info(f"***** Eval results on Epoch {epoch} *****")
            #         for key in sorted(result.keys()):
            #             logger.info("  %s = %s", key, str(result[key]))
            #             writer.write("%s = %s\n" % (key, str(result[key])))

        # Save a trained model
        logger.info("** ** * Saving fine - tuned model ** ** * ")
        # model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        # output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        # if args.do_train:
        #     torch.save(model_to_save.state_dict(), output_model_file)

        save_loss = np.array(save_loss)
        save_epoch_loss = np.array(save_epoch_loss)
        np.save(os.path.join(args.output_dir, "save_loss.npy"), save_loss)
        np.save(os.path.join(args.output_dir, "save_epoch_loss.npy"),
                save_epoch_loss)

        # if args.do_eval:
        #     save_eval_loss = np.array(save_eval_loss)
        #     np.save(os.path.join(args.output_dir, "save_eval_loss.npy"), save_eval_loss)

        model_to_save = model.module if hasattr(model, 'module') else model
        # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
示例#17
0
            lazy_state_dict["bert-embeddings-token_type_embeddings"])
    eager_state_dict["bert.embeddings.position_embeddings.weight"].data.copy_(
        flow.tensor(lazy_state_dict["bert-embeddings-position_embeddings"].
                    numpy().squeeze(0)))


if __name__ == "__main__":
    lazy_model_path = "./of_bert_1000000_model_log/snapshot_snapshot_1000000"

    bert_module = BertForPreTraining(
        30522,
        128,
        768,
        12,
        12,
        3072,
        nn.GELU(),
        0.0,
        0.0,
        512,
        2,
    )

    load_params_from_lazy(bert_module.state_dict(), lazy_model_path)

    assert id(bert_module.cls.predictions.decoder.weight) == id(
        bert_module.bert.embeddings.word_embeddings.weight)

    with open(
            "../../OneFlow-Benchmark/LanguageModeling/BERT/lazy_input_output_1.pickle",
            "rb") as handle: