def prepare_model_optimizer(args):
    # Loading Model
    model = BertMultiTask(args)

    # Optimizer parameters
    optimizer_grouped_parameters = prepare_optimizer_parameters(args, model)

    # DeepSpeed initializer handles FP16, distributed, optimizer automatically.
    model.network, optimizer, _, _ = deepspeed.initialize(
        args=args,
        model=model.network,
        model_parameters=optimizer_grouped_parameters)

    # Overwrite application configs with DeepSpeed config
    args.train_micro_batch_size_per_gpu = model.network.train_micro_batch_size_per_gpu()
    args.gradient_accumulation_steps = model.network.gradient_accumulation_steps()

    # Set DeepSpeed info
    args.local_rank = model.network.local_rank
    args.device = model.network.device
    model.set_device(args.device)
    args.fp16 = model.network.fp16_enabled()
    args.use_lamb = (model.network.optimizer_name() ==
                     deepspeed.pt.deepspeed_config.LAMB_OPTIMIZER)

    # Prepare Summary Writer and saved_models path
    if dist.get_rank() == 0:
        summary_writer = get_sample_writer(name=args.job_name,
                                           base=args.output_dir)
        args.summary_writer = summary_writer
        os.makedirs(args.saved_model_path, exist_ok=True)

    return model, optimizer
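
A minimal calling sketch for this DeepSpeed variant (a hypothetical usage example, not part of the original listing): it assumes the surrounding script builds args with argparse plus deepspeed.add_config_arguments, and that flags such as --job_name, --output_dir, and --saved_model_path exist in the real script.

import argparse

import deepspeed


def get_args():
    parser = argparse.ArgumentParser(description="BERT multi-task pre-training")
    # Hypothetical flags; the real script defines these (and more) elsewhere.
    parser.add_argument("--job_name", type=str, required=True)
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--saved_model_path", type=str, required=True)
    parser.add_argument("--local_rank", type=int, default=-1)
    # Adds the standard --deepspeed / --deepspeed_config flags that
    # deepspeed.initialize(args=args, ...) reads.
    parser = deepspeed.add_config_arguments(parser)
    return parser.parse_args()


if __name__ == "__main__":
    args = get_args()
    model, optimizer = prepare_model_optimizer(args)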
Example #2
def prepare_model_optimizer(args):
    # Loading Model
    model = BertMultiTask(args)

    # Optimizer parameters
    optimizer_grouped_parameters = prepare_optimizer_parameters(args, model)

    # DeepSpeed initializer handles FP16, distributed, optimizer automatically.
    model.network, optimizer, _, _ = deepspeed.initialize(
        args=args,
        model=model.network,
        model_parameters=optimizer_grouped_parameters,
        dist_init_required=False)

    # Overwrite application configs with DeepSpeed config
    args.train_batch_size = model.network.train_micro_batch_size_per_gpu()
    args.gradient_accumulation_steps = model.network.gradient_accumulation_steps()

    return model, optimizer
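
The only functional difference from the first variant is dist_init_required=False, so the caller must set up the distributed backend before this function runs. A minimal sketch, assuming the NCCL backend and the deepspeed.init_distributed helper (older releases would call torch.distributed.init_process_group directly); get_args is the hypothetical argparse helper from the sketch above.

import deepspeed

# The process group must already exist when deepspeed.initialize is called with
# dist_init_required=False; DeepSpeed reuses it instead of creating its own.
deepspeed.init_distributed(dist_backend="nccl")

args = get_args()  # hypothetical argparse helper, as in the sketch above
model, optimizer = prepare_model_optimizer(args)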
Example #3
def prepare_model_optimizer(args):
    # Loading Model
    model = BertMultiTask(args)

    if args.fp16:
        model.half()
    model.to(args.device)

    # Optimizer parameters
    optimizer_grouped_parameters = prepare_optimizer_parameters(args, model)

    # Prepare Optimizer
    config = args.config
    logger = args.logger
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer, FP16_UnfusedOptimizer, FusedAdam, FusedLamb
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        if args.use_lamb:
            logger.info(
                "Using Lamb optimizer min_coeff={}, max_coeff={}".format(
                    args.min_lamb, args.max_lamb))
            optimizer = FusedLamb(optimizer_grouped_parameters,
                                  lr=config["training"]["learning_rate"],
                                  bias_correction=False,
                                  max_grad_norm=1.0,
                                  max_coeff=args.max_lamb,
                                  min_coeff=args.min_lamb)
        else:
            logger.info("Using adam optimizer")
            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=config["training"]["learning_rate"],
                                  bias_correction=False,
                                  max_grad_norm=1.0)
        logger.info(f"unwrapped optimizer_state = {optimizer.state_dict()}")
        if args.use_lamb:
            optimizer = FP16_UnfusedOptimizer(optimizer,
                                              dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    else:
        optimizer = BertAdam(
            optimizer_grouped_parameters,
            lr=config["training"]["learning_rate"],
            warmup=config["training"]["warmup_proportion"],
            t_total=config["training"]["total_training_steps"])
    # Wrap the network for multi-GPU training; apex DDP is needed for both the
    # distributed (local_rank set) and the single-node multi-GPU paths, so
    # import it once before either branch uses it.
    if args.local_rank != -1 or args.n_gpu > 1:
        try:
            logger.info(
                "***** Using Default Apex Distributed Data Parallel *****")
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
    if args.local_rank != -1:
        torch.cuda.set_device(args.local_rank)
        model.network = DDP(model.network,
                            delay_allreduce=args.delay_allreduce,
                            message_size=250000000)
    elif args.n_gpu > 1:
        model.network = DDP(model.network,
                            delay_allreduce=args.delay_allreduce,
                            message_size=250000000)
    return model, optimizer
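
All three variants delegate parameter grouping to prepare_optimizer_parameters, which is not shown in this listing. The sketch below is a guess at the conventional BERT grouping (weight decay disabled for bias and LayerNorm parameters), reading weight_decay from the same config dict used above; the real helper may differ.

def prepare_optimizer_parameters(args, model):
    # Hypothetical reconstruction of the helper used above: split parameters
    # into a weight-decay group and a no-decay group, the usual BERT recipe.
    config = args.config
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    named_params = list(model.network.named_parameters())
    weight_decay = config["training"].get("weight_decay", 0.01)
    return [
        {
            "params": [p for n, p in named_params
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": weight_decay,
        },
        {
            "params": [p for n, p in named_params
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]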