Example #1
        def dist_init(local_rank, num_procs, *func_args, **func_kwargs):
            """Initialize deepspeed.comm and execute the user function. """
            os.environ['MASTER_ADDR'] = '127.0.0.1'
            os.environ['MASTER_PORT'] = get_master_port()
            os.environ['LOCAL_RANK'] = str(local_rank)
            # NOTE: unit tests don't support multi-node so local_rank == global rank
            os.environ['RANK'] = str(local_rank)
            os.environ['WORLD_SIZE'] = str(num_procs)

            # turn off NCCL logging if set
            os.environ.pop('NCCL_DEBUG', None)

            set_cuda_visibile()

            deepspeed.init_distributed(dist_backend=backend)
            #dist.init_process_group(backend=backend)
            dist.barrier()

            if torch.cuda.is_available():
                torch.cuda.set_device(local_rank)

            run_func(*func_args, **func_kwargs)

            # make sure all ranks finish at the same time
            dist.barrier()
            # tear down after test completes
            dist.destroy_process_group()
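
The dist_init helper above runs once per rank. A rough sketch of how it might be launched for a single-node test, assuming dist_init were a module-level (picklable) function rather than the closure shown here (the real harness also forwards kwargs and handles skips and timeouts):

import torch.multiprocessing as mp

def launch_test(num_procs, *func_args):
    # mp.spawn calls dist_init(proc_index, num_procs, *func_args);
    # the process index plays the role of local_rank.
    mp.spawn(dist_init, args=(num_procs, *func_args), nprocs=num_procs, join=True)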
Example #2
def init_deepspeed_comm(backend):
    global dist
    import deepspeed
    import deepspeed.comm as dist
    deepspeed.init_distributed(dist_backend=backend)
    local_rank = int(os.environ['LOCAL_RANK'])
    torch.cuda.set_device(local_rank)
Example #3
 def __init__(self, context: det_ds.DeepSpeedTrialContext):
     self.context = context
     self.hparams = attrdict.AttrDict(context.get_hparams())
     if (self.hparams.test_manual_init_distributed
             or self.hparams.test_fail_manual_init_distributed):
         assert (not torch.distributed.is_initialized()
                 ), "distributed backend should not be initialized"
     if (self.hparams.test_manual_init_distributed
             and not self.hparams.test_fail_manual_init_distributed):
         deepspeed.init_distributed(auto_mpi_discovery=False)
     if self.hparams.test_manual_grad_acc or self.hparams.test_fail_manual_grad_acc:
         self.context.disable_auto_grad_accumulation()
     if self.hparams.test_manual_dataloader:
         self.context.disable_dataset_reproducibility_checks()
     self.ds_config = attrdict.AttrDict(self.hparams.deepspeed_config)
     model = torch.nn.Linear(1, 1)
     self.model, optimizer, _, _ = deepspeed.initialize(
         model=model,
         config=self.ds_config,
         model_parameters=model.parameters(),
         dist_init_required=False,
     )
     self.model = self.context.wrap_model_engine(self.model)
     self.loss = torch.nn.MSELoss()
     self.reducer = None
     if self.hparams.test_custom_reducer:
         self.reducer = self.context.wrap_reducer(lambda x: np.mean(x) * 2,
                                                  name="loss_2x")
Example #4
def initialize_distributed(args):
    """Initialize torch.distributed."""

    if args.deepspeed:
        deepspeed.init_distributed(dist_backend=args.distributed_backend)
    else:
        # Manually set the device ids.
        device = args.rank % torch.cuda.device_count()
        # Call the init process
        init_method = 'tcp://'
        master_ip = os.getenv('MASTER_ADDR', 'localhost')
        master_port = os.getenv('MASTER_PORT', '6000')
        init_method += master_ip + ':' + master_port
        torch.distributed.init_process_group(
            backend=args.distributed_backend,
            world_size=args.world_size, rank=args.rank,
            init_method=init_method)

    if args.local_rank is not None:
        device = args.local_rank
    torch.cuda.set_device(device)

    # Set the model-parallel / data-parallel communicators.
    mpu.initialize_model_parallel(args.model_parallel_size)

    # Optional DeepSpeed Activation Checkpointing Features
    #
    if args.deepspeed and args.deepspeed_activation_checkpointing:
        set_deepspeed_activation_checkpointing(args)
Example #5
    def _dist_init(self, local_rank, num_procs, skip_msg):
        """Initialize deepspeed.comm and execute the user function. """
        if self.set_dist_env:
            os.environ['MASTER_ADDR'] = '127.0.0.1'
            os.environ['MASTER_PORT'] = get_master_port()
            os.environ['LOCAL_RANK'] = str(local_rank)
            # NOTE: unit tests don't support multi-node so local_rank == global rank
            os.environ['RANK'] = str(local_rank)
            os.environ['WORLD_SIZE'] = str(num_procs)

        # turn off NCCL logging if set
        os.environ.pop('NCCL_DEBUG', None)

        set_cuda_visibile()

        if self.init_distributed:
            deepspeed.init_distributed(dist_backend=self.backend)
            dist.barrier()

        if torch.cuda.is_available():
            torch.cuda.set_device(local_rank)

        try:
            self.current_test(**self.test_kwargs)
        except BaseException as e:
            if isinstance(e, Skipped):
                skip_msg.put(e.msg)
            else:
                raise e

        if self.init_distributed or dist.is_initialized():
            # make sure all ranks finish at the same time
            dist.barrier()
            # tear down after test completes
            dist.destroy_process_group()
Example #6
    def _setup_devices(self) -> "torch.device":
        logger.info("PyTorch: setting up devices")
        if self.no_cuda:
            device = torch.device("cpu")
            self._n_gpu = 0
        elif is_torch_tpu_available():
            device = xm.xla_device()
            self._n_gpu = 0
        elif is_sagemaker_mp_enabled():
            local_rank = smp.local_rank()
            device = torch.device("cuda", local_rank)
            self._n_gpu = 1
        elif is_sagemaker_dp_enabled():
            sm_dist.init_process_group()
            self.local_rank = sm_dist.get_local_rank()
            device = torch.device("cuda", self.local_rank)
            self._n_gpu = 1
        elif self.deepspeed:
            # deepspeed performs its own DDP internally, and requires the program to be started with:
            # deepspeed  ./program.py
            # rather than:
            # python -m torch.distributed.launch --nproc_per_node=2 ./program.py
            from .integrations import is_deepspeed_available

            if not is_deepspeed_available():
                raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.")
            import deepspeed

            deepspeed.init_distributed()

            # workaround for setups like notebooks where the launcher can't be used,
            # but deepspeed requires a dist env.
            # env LOCAL_RANK could be set manually by the user, or via init_distributed if mpi4py is installed
            self.local_rank = int(os.environ.get("LOCAL_RANK", "-1"))

            device = torch.device("cuda", self.local_rank)
            self._n_gpu = 1
        elif self.local_rank == -1:
            # if n_gpu is > 1 we'll use nn.DataParallel.
            # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
            # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
            # trigger an error that a device index is missing. Index 0 takes into account the
            # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
            # will use the first GPU in that env, i.e. GPU#1
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at
            # the default value.
            self._n_gpu = torch.cuda.device_count()
        else:
            # Here, we'll use torch.distributed.
            # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
            torch.distributed.init_process_group(backend="nccl")
            device = torch.device("cuda", self.local_rank)
            self._n_gpu = 1

        if device.type == "cuda":
            torch.cuda.set_device(device)

        return device
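
For reference, the launcher distinction noted in the comments above means each process should already find its rendezvous variables in the environment. A minimal sanity check, given as a sketch rather than part of the snippet:

import os

_required = ("MASTER_ADDR", "MASTER_PORT", "RANK", "LOCAL_RANK", "WORLD_SIZE")
_missing = [k for k in _required if k not in os.environ]
if _missing:
    raise RuntimeError(
        f"Missing {_missing}; start the program with `deepspeed ./program.py` "
        "so the launcher exports the distributed environment.")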
Example #7
def init_deepspeed(do_init):
    """Initialize the DeepSpeed distributed backend."""
    global using_deepspeed
    using_deepspeed = do_init

    if not do_init:
        return
    deepspeed.init_distributed()
Example #8
 def test(self):
     torch.distributed.init_process_group(
         backend='nccl',
         init_method=f"tcp://127.0.0.1:{get_master_port()}",
         world_size=1,
         rank=0)
     assert torch.distributed.is_initialized()
     deepspeed.init_distributed('nccl', auto_mpi_discovery=True)
Example #9
 def test_no_init(self, dist_init_required):
     if dist_init_required or dist_init_required is None:
         deepspeed.init_distributed('nccl',
                                    dist_init_required=dist_init_required)
     else:
         # torch.dist is not done and for some reason the user says they don't want it done
         with pytest.raises(Exception):
             deepspeed.init_distributed(
                 'nccl', dist_init_required=dist_init_required)
Example #10
 def _init_deepspeed_distributed(self) -> None:
     if platform.system() != "Windows":
         # do not set env variables on windows, allow deepspeed to control setup
         self._set_node_environment_variables()
         log.info(
             "initializing deepspeed distributed: "
             f"GLOBAL_RANK: {self.global_rank}, "
             f"MEMBER: {self.global_rank + 1}/{self.world_size}"
         )
     deepspeed.init_distributed(self.torch_distributed_backend, distributed_port=self.cluster_environment.main_port)
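
_set_node_environment_variables is not shown in this excerpt; in the same spirit, it would export the rendezvous variables DeepSpeed reads when no launcher is involved. A hedged sketch, with the cluster-environment attribute names assumed rather than taken from the source:

 def _set_node_environment_variables(self) -> None:
     # Sketch only: export the rank/world-size info DeepSpeed expects to find.
     os.environ["MASTER_ADDR"] = self.cluster_environment.main_address
     os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port)
     os.environ["RANK"] = str(self.global_rank)
     os.environ["LOCAL_RANK"] = str(self.local_rank)
     os.environ["WORLD_SIZE"] = str(self.world_size)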
Example #11
    def _setup_devices(self) -> Tuple["torch.device", int]:
        logger.info("PyTorch: setting up devices")
        if self.no_cuda:
            device = torch.device("cpu")
            n_gpu = 0
        elif is_torch_tpu_available():
            device = xm.xla_device()
            n_gpu = 0
        elif self.local_rank == -1:
            # if n_gpu is > 1 we'll use nn.DataParallel.
            # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
            # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
            # trigger an error that a device index is missing. Index 0 takes into account the
            # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
            # will use the first GPU in that env, i.e. GPU#1
            device = torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu")
            # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at
            # the default value.
            if self._n_gpu == -1:
                self._n_gpu = torch.cuda.device_count()
            n_gpu = self._n_gpu
        else:
            # Here, we'll use torch.distributed.
            # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
            #
            # deepspeed performs its own DDP internally, and requires the program to be started with:
            # deepspeed  ./program.py
            # rather than:
            # python -m torch.distributed.launch --nproc_per_node=2 ./program.py
            if self.deepspeed:
                from .integrations import is_deepspeed_available

                if not is_deepspeed_available():
                    raise ImportError(
                        "--deepspeed requires deepspeed: `pip install deepspeed`."
                    )
                import deepspeed

                deepspeed.init_distributed()
            else:
                torch.distributed.init_process_group(backend="nccl")
            device = torch.device("cuda", self.local_rank)
            n_gpu = 1

        if device.type == "cuda":
            torch.cuda.set_device(device)

        return device, n_gpu
Example #12
        def dist_init(local_rank, num_procs, *func_args, **func_kwargs):
            """Initialize torch.distributed and execute the user function. """
            os.environ['MASTER_ADDR'] = '127.0.0.1'
            os.environ['MASTER_PORT'] = '29503'
            os.environ['LOCAL_RANK'] = str(local_rank)
            # NOTE: unit tests don't support multi-node so local_rank == global rank
            os.environ['RANK'] = str(local_rank)
            os.environ['WORLD_SIZE'] = str(num_procs)

            deepspeed.init_distributed(dist_backend=backend)

            if torch.cuda.is_available():
                torch.cuda.set_device(local_rank)

            run_func(*func_args, **func_kwargs)
Example #13
 def init_ddp_connection(self,
                         global_rank: Optional[int] = None,
                         world_size: Optional[int] = None) -> None:
     if platform.system() != "Windows":
         # do not set env variables on windows, allow deepspeed to control setup
          global_rank = global_rank if global_rank is not None else self.cluster_environment.global_rank()
          world_size = world_size if world_size is not None else self.cluster_environment.world_size()
         self._set_node_environment_variables(global_rank, world_size)
         log.info("initializing deepspeed distributed: "
                  f"GLOBAL_RANK: {global_rank}, "
                  f"MEMBER: {global_rank + 1}/{world_size}")
     deepspeed.init_distributed(
         self.torch_distributed_backend,
         distributed_port=self.cluster_environment.master_port())
Example #14
    def prepare_model_optimizer(self, model):
        # Initialize torch distributed
        deepspeed.init_distributed(dist_backend="nccl")

        # FIXME
        from dataclasses import dataclass

        @dataclass
        class TmpClass:
            local_rank: int

        fake_arg = TmpClass(self.fs_args.device_id)
        # DeepSpeed initializer handles FP16, distributed, optimizer automatically.
        self.model, self.optimizer, _, _ = deepspeed.initialize(
            args=fake_arg,
            model=model,
            model_parameters=model.parameters(),
            config_params=self.ds_config,
        )
Example #15
    def setup_process(self, rank: int = -1, world_size: int = 1):
        """Initialize DDP variables and processes.

        Args:
            rank: process rank. Default is `-1`.
            world_size: number of devices in the network to expect for training.
                Default is `1`.
        """
        self._rank = rank
        self._world_size = world_size
        torch.cuda.set_device(int(self._rank))
        self._device = f"cuda:{int(self._rank)}"

        os.environ["RANK"] = str(rank)
        os.environ["LOCAL_RANK"] = str(rank)
        os.environ["WORLD_SIZE"] = str(world_size)
        os.environ["MASTER_ADDR"] = str(self.address)
        os.environ["MASTER_PORT"] = str(self.port)
        deepspeed.init_distributed(**self.process_group_kwargs)
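
self.process_group_kwargs is not shown in this snippet; a plausible value, given purely for illustration (dist_backend and timeout are parameters deepspeed.init_distributed accepts), might be:

from datetime import timedelta

# Hypothetical kwargs later expanded as deepspeed.init_distributed(**process_group_kwargs).
process_group_kwargs = {
    "dist_backend": "nccl",
    "timeout": timedelta(minutes=30),
}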
Example #16
def prepare_model_optimizer(args):
    # Initialize torch distributed
    deepspeed.init_distributed(dist_backend='nccl')
    args.local_rank = int(os.environ['LOCAL_RANK'])

    # Loading Model
    model = BertMultiTask(args)

    # Optimizer parameters
    optimizer_grouped_parameters = prepare_optimizer_parameters(args, model)

    # DeepSpeed initializer handles FP16, distributed, optimizer automatically.
    model.network, optimizer, _, _ = deepspeed.initialize(
        args=args,
        model=model.network,
        model_parameters=optimizer_grouped_parameters)

    # Overwrite application configs with DeepSpeed config
    args.train_micro_batch_size_per_gpu = model.network.train_micro_batch_size_per_gpu()
    args.gradient_accumulation_steps = model.network.gradient_accumulation_steps()

    # Set DeepSpeed info
    args.local_rank = model.network.local_rank
    args.device = model.network.device
    model.set_device(args.device)
    args.fp16 = model.network.fp16_enabled()
    args.use_lamb = (model.network.optimizer_name() ==
                     deepspeed.runtime.config.LAMB_OPTIMIZER
                     or model.network.optimizer_name() ==
                     deepspeed.runtime.config.ONEBIT_LAMB_OPTIMIZER)

    # Prepare Summary Writer and saved_models path
    if dist.get_rank() == 0:
        summary_writer = get_sample_writer(name=args.job_name,
                                           base=args.output_dir)
        args.summary_writer = summary_writer
        os.makedirs(args.saved_model_path, exist_ok=True)

    return model, optimizer
Example #17
    def pre_execute_hook(
        cls: Type["DeepSpeedTrialController"],
        env: det.EnvContext,
        distributed_backend: det._DistributedBackend,
    ) -> None:
        # We use an environment variable to allow users to enable a custom initialization routine for
        # distributed training since the pre_execute_hook runs before trial initialization.
        manual_dist_init = os.environ.get("DET_MANUAL_INIT_DISTRIBUTED")
        if not manual_dist_init:
            # DeepSpeed's init_distributed handles situations in which only 1 gpu is used and
            # also handles multiple calls to init in one process.
            deepspeed.init_distributed(auto_mpi_discovery=False)

        # Set identical random seeds on all training processes.
        # When data parallel world size > 1, each data parallel rank will start at a unique
        # offset in the dataset, ensuring it's processing a unique
        # training batch.
        # TODO (Liam): seed data loading workers so that we can configure different seeds for
        # data augmentations per slot per worker.
        random.seed(env.trial_seed)
        np.random.seed(env.trial_seed)
        torch.random.manual_seed(env.trial_seed)
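
To opt into the manual path described in the comment above, a user would set the flag before launch and then initialize the backend themselves, mirroring Example #3 (a sketch; any non-empty value of the variable skips the automatic call):

# e.g. export DET_MANUAL_INIT_DISTRIBUTED=1 in the experiment environment,
# then inside the trial's __init__:
deepspeed.init_distributed(auto_mpi_discovery=False)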
Example #18
        def dist_init(local_rank, num_procs, *func_args, **func_kwargs):
            """Initialize torch.distributed and execute the user function."""
            os.environ["MASTER_ADDR"] = "127.0.0.1"
            os.environ["MASTER_PORT"] = get_master_port()
            os.environ["LOCAL_RANK"] = str(local_rank)
            # NOTE: unit tests don't support multi-node so local_rank == global rank
            os.environ["RANK"] = str(local_rank)
            os.environ["WORLD_SIZE"] = str(num_procs)

            # turn off NCCL logging if set
            os.environ.pop("NCCL_DEBUG", None)

            deepspeed.init_distributed(dist_backend=backend)

            if torch.cuda.is_available():
                torch.cuda.set_device(local_rank)

            run_func(*func_args, **func_kwargs)

            # make sure all ranks finish at the same time
            torch.distributed.barrier()
            # tear down after test completes
            torch.distributed.destroy_process_group()
Example #19
def train(local_rank, args):
    # torch.multiprocessing.set_sharing_strategy('file_system')
    # too many barriers / one node data parallel and multiple node DDP
    os.environ['MASTER_ADDR'] = args["master_addr"]
    os.environ['MASTER_PORT'] = args["master_port"]
    os.environ["NCCL_DEBUG"] = "WARN"
    # os.environ["CUDA_VISIBLE_DEVICES"] = str(local_rank)
    # gpu_device = 0
    gpu_device = local_rank
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    if args["wandb_dryrun"]:
        os.environ["WANDB_MODE"] = "dryrun"
        os.environ["WANDB_SILENT"] = "true"
    os.environ['TOKENIZERS_PARALLELISM'] = "true"
    torch.backends.cudnn.benchmark = True
    rank = args["nr"] if args["cpu"] else (args["nr"] * args["gpus_per_node"] +
                                           local_rank)
    nr = args["nr"]
    device = torch.device(
        f'cuda:{gpu_device}')  # Unique only on individual node.
    torch.cuda.set_device(device)
    print("[Train]: Time = %s, Prepare to init Dist Process for Rank = %s" %
          (get_time_string(), rank))

    if args["nr"] == 0:
        args["master_addr"] = "0.0.0.0"
    init_method = "tcp://%s:%s" % (args["master_addr"], args["master_port"])

    deepspeed.init_distributed(distributed_port=int(args["master_port"]),
                               init_method=init_method)

    format = "%Y-%m-%d %H-%M %Z"
    # + timedelta(hours=5, minutes=30)
    time_string = (datetime.fromtimestamp(
        time.mktime(time.gmtime(rnd.cpu().item())))).astimezone(
            timezone('Asia/Kolkata')).strftime(format)
    ds_name = list(
        filter(lambda x: len(x.strip()) > 0,
               args["train_dataset"].split("/")))[-1].replace(
                   "train_fastformer_resampled_", "")
    group = "%s-%s-nodes-%s" % (ds_name, args["nodes"], time_string)
    set_seeds(args["seed"])
    mconf = model_config.to_dict()
    config = dict(md_config=md_config,
                  sm_config=sm_config)[mconf.pop("model_size")]
    tokenizer = get_tokenizer(mconf.pop("tokenizer_name"))
    config.vocab_size = len(tokenizer) + 22
    config.tokenizer_length = 1024
    config.tokenizer_length = config.tokenizer_length - config.num_highway_cls_tokens
    config.max_position_embeddings = config.max_position_embeddings + config.num_highway_cls_tokens

    collate_fn = get_collate_fn(config.num_highway_cls_tokens,
                                tokenizer.pad_token_id)

    model = FastFormerForFusedELECTRAPretraining(config,
                                                 tokenizer=tokenizer,
                                                 **mconf).to(device)
    print("[Train]: Trainable Params = %s" % (numel(model) / 1_000_000))
    if args["pretrained_model"] is not None and os.path.exists(
            args["pretrained_model"]) and rank == 0:
        model.load_state_dict(
            torch.load(args["pretrained_model"],
                       map_location='cuda:%d' % gpu_device))

    model_engine, optimizer, _, _ = deepspeed.initialize(
        args=args, model=model, model_parameters=model.parameters())

    model_save_dir = args["model_save_dir"]
    model_save_name = args["model_save_name"]
    if local_rank == 0:
        if not os.path.exists(model_save_dir):
            os.makedirs(model_save_dir)
        assert os.path.exists(model_save_dir)
    shuffle_dataset = args["shuffle_dataset"]

    train_loader = build_dataloader(args["train_dataset"],
                                    shuffle_dataset,
                                    sampling_fraction,
                                    config,
                                    collate_fn,
                                    tokenizer,
                                    world_size=args["world_size"],
                                    num_workers=args["num_workers"],
                                    no_autocast=args["no_autocast"])

    print("[Train]: Data Loaded for Rank = %s" % rank)

    log_every_steps = args["log_every_steps"]
    save_every_steps = args["save_every_steps"]
    print("[Train]: Scheduler Created for Rank = %s" % rank)
    other_load_details = None
    if "resume" in args and isinstance(
            args["resume"], str) and len(args["resume"].strip()) > 0:
        _, other_load_details = model_engine.load_checkpoint(
            model_save_dir, args["resume"])
        step = other_load_details['step']

    else:
        print("[Train]: No Resume for Rank = %s" % rank)
    _ = model.train()

    # print("[Train]: Init Wandb-watch added over model for Rank = %s" % rank)
    # wandb.watch(model, log="all", log_freq=log_every_steps)
    print("[Train]: WandB-watch added over model for Rank = %s" % rank)
    batch_times = []
    model_times = []
    full_times = []
    samples_processed = 0
    samples_processed_this_log_iter = 0
    print("[Train]: Time = %s, Start Training for Rank = %s" %
          (get_time_string(), rank))
    if local_rank == 0:
        wandb_init_args = dict(project="fastformer",
                               name="%s-%s-%s-%s" %
                               (group, args["nr"], rank, local_rank),
                               group=group,
                               id=f"{group}-worker-{nr}-{rank}-{local_rank}",
                               config={
                                   "args": args,
                                   "model_config": mconf,
                                   "config": config,
                                   "optimizer_config": optc
                               },
                               settings=wandb.Settings(start_method="fork"))

        time.sleep(random.random() * 5)
        wandb.init(**wandb_init_args)

    if args["detect_anomaly"]:
        torch.autograd.set_detect_anomaly(True)

    def get_hook(name_of_param=None):
        if name_of_param is None:

            def hook(grad):
                is_nan_inf = torch.logical_not(torch.isfinite(grad))
                if is_nan_inf.any():
                    grad = torch.where(
                        is_nan_inf,
                        torch.sign(grad) * torch.empty_like(grad).fill_(1e-3),
                        grad)
                    # grad = torch.clamp_(grad, -1e1, 1e1)
                    return grad
                else:
                    return None

            return hook
        else:

            def named_hook(grad):
                is_nan_inf = torch.logical_not(torch.isfinite(grad))
                if is_nan_inf.any():
                    print(
                        "[GRAD-HOOK]: Time = %s, Param Name = %s, Detected Nan/Inf"
                        % (get_time_string(), name_of_param))
                    grad = torch.where(
                        is_nan_inf,
                        torch.sign(grad) * torch.empty_like(grad).fill_(1e-3),
                        grad)
                    # grad = torch.clamp_(grad, -1e1, 1e1)
                    return grad
                else:
                    return None

            return named_hook

    if args["backward_hook"]:
        for name, param in model.named_parameters():
            if "embeddings" in name or "sent_predict_fc" in name or "embed_proj_transpose" in name or "embed_proj" in name or "lm_head" in name or "contrastive_ffn" in name or "encoder.blocks.0" in name:  #
                param.register_hook(get_hook())
            else:
                param.register_hook(get_hook())

    start_time = time.time()
    for step, batch in enumerate(train_loader):
        gen_batch_time = time.time() - start_time
        batch_times.append(gen_batch_time)
        bs_size = list(batch["input_ids"].size())
        batch = {
            k: v.to(device, non_blocking=True) if hasattr(v, "to") else v
            for k, v in batch.items()
        }
        electra_loss_w = float(
            ((step + 1) / (2 * 10000)) * mconf["electra_loss_w"])
        model_engine.model.electra_loss_w = electra_loss_w

        if (step + 1) % save_every_steps == 0:
            client_sd = dict()
            client_sd['step'] = step
            ckpt_id = step
            model_engine.save_checkpoint(model_save_dir,
                                         ckpt_id,
                                         client_sd=client_sd)

        record_accuracy = False
        if (step + 1) % log_every_steps == 0:
            if local_rank == 0:
                record_accuracy = True

        batch["record_accuracy"] = record_accuracy
        labels = batch[
            "label_mlm_input_ids"] if "label_mlm_input_ids" in batch else batch[
                "input_ids"]
        labels = labels.to(device, non_blocking=True)
        model_start_time = time.time()
        samples_processed += int(batch["input_ids"].size(0))
        samples_processed_this_log_iter += int(batch["input_ids"].size(0))
        # clean_memory()
        # print("Step = %s, Before:, for Rank = %s, input_size = %s, Allocated = %.3f, Max Allocated = %.3f, Percent = %s" %
        #       (step, rank, batch["input_ids"].size(), torch.cuda.memory_allocated() / 1e6, torch.cuda.max_memory_allocated() /1e6, torch.cuda.memory_allocated() / torch.cuda.max_memory_allocated()))  # torch.cuda.memory_summary()

        # forward() method
        output = model_engine(**batch, labels=labels)
        loss = output["loss"]
        # runs backpropagation
        model_engine.backward(loss)

        # weight update
        model_engine.step()

        # clean_memory()
        # print("Step = %s, After: , for Rank = %s, input_size = %s, Allocated = %.3f, Max Allocated = %.3f, Percent = %s" %
        #       (step, rank, batch["input_ids"].size(), torch.cuda.memory_allocated() / 1e6, torch.cuda.max_memory_allocated() / 1e6,
        #        torch.cuda.memory_allocated() / torch.cuda.max_memory_allocated()))  # torch.cuda.memory_summary()

        model_end_time = time.time() - model_start_time
        model_times.append(model_end_time)
        full_time = time.time() - start_time
        full_times.append(full_time)
        if step == 0:
            print("[Train]: Time = %s, First Batch Training for Rank = %s" %
                  (get_time_string(), rank))
        if (step + 1) % log_every_steps == 0:
            if local_rank == 0:
                samples_per_second = samples_processed_this_log_iter / np.sum(
                    full_times)
                acc_dict = output["accuracy_hist"]
                loss_dict = output["loss_dict"]
                time.sleep(random.random() + 0.1)
                wandb.log(
                    dict(lr=optimizer.param_groups[0]['lr'],
                         step=step,
                         samples_processed=samples_processed,
                         samples_per_second=samples_per_second,
                         batch_x_sequence=np.prod(bs_size[:2]),
                         batch_times=np.mean(batch_times),
                         model_times=np.mean(model_times),
                         full_times=np.mean(full_times),
                         scale=scaler.get_scale(),
                         **loss_dict,
                         **acc_dict))
                print(
                    "[Train]: Time = %s, Rank = %s, steps = %s, samples_processed=%s, batch_size = %s, Loss = %s, Accuracy = %s, LR = %s"
                    % (get_time_string(), rank, step, samples_processed,
                       bs_size, loss_dict, output["accuracy_hist"],
                       optimizer.param_groups[0]['lr']))
                print(
                    "[Train-Timings]: Time = %s, Batch time = %.4f, Model Time = %.4f, Full time = %.4f, samples_per_second = %s"
                    % (get_time_string(), np.mean(batch_times),
                       np.mean(model_times), np.mean(full_times),
                       samples_per_second))
                del acc_dict
                del loss_dict

            batch_times = []
            model_times = []
            full_times = []
            samples_processed_this_log_iter = 0

        del batch
        del labels
        del output
        del bs_size
        start_time = time.time()
Example #20
def worker(proc_id, gpu_ranks, args, model):
    """
    Args:
        proc_id: The id of GPU for single GPU mode;
                 The id of process (and GPU) for multiprocessing distributed mode.
        gpu_ranks: List of ranks of each process.
    """
    set_seed(args.seed)

    # Get logger
    args.logger = init_logger(args)

    if args.deepspeed:
        import deepspeed
        deepspeed.init_distributed(dist_backend=args.backend)
        rank = dist.get_rank()
        gpu_id = proc_id
    elif args.dist_train:
        rank = gpu_ranks[proc_id]
        gpu_id = proc_id
    elif args.single_gpu:
        rank = None
        gpu_id = proc_id
    else:
        rank = None
        gpu_id = None

    if args.dist_train:
        train_loader = str2dataloader[args.data_processor](
            args, args.dataset_path, args.batch_size, rank, args.world_size,
            True)
    else:
        train_loader = str2dataloader[args.data_processor](args,
                                                           args.dataset_path,
                                                           args.batch_size, 0,
                                                           1, True)

    # Build optimizer.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "gamma", "beta"]
    optimizer_grouped_parameters = [{
        "params":
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        "weight_decay":
        0.01
    }, {
        "params":
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay":
        0.0
    }]

    if args.optimizer in ["adamw"]:
        custom_optimizer = str2optimizer[args.optimizer](
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            correct_bias=False)
    else:
        custom_optimizer = str2optimizer[args.optimizer](
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            scale_parameter=False,
            relative_step=False)
    if args.scheduler in ["constant"]:
        custom_scheduler = str2scheduler[args.scheduler](custom_optimizer)
    elif args.scheduler in ["constant_with_warmup"]:
        custom_scheduler = str2scheduler[args.scheduler](
            custom_optimizer, args.total_steps * args.warmup)
    else:
        custom_scheduler = str2scheduler[args.scheduler](
            custom_optimizer, args.total_steps * args.warmup, args.total_steps)

    if args.deepspeed:
        model, optimizer, _, scheduler = deepspeed.initialize(
            model=model,
            model_parameters=optimizer_grouped_parameters,
            args=args,
            optimizer=custom_optimizer,
            lr_scheduler=custom_scheduler,
            mpu=None,
            dist_init_required=False)
    else:
        if gpu_id is not None:
            model.cuda(gpu_id)
        optimizer = custom_optimizer
        scheduler = custom_scheduler
        if args.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.fp16_opt_level)
            args.amp = amp

        if args.dist_train:
            # Initialize multiprocessing distributed training environment.
            dist.init_process_group(backend=args.backend,
                                    init_method=args.master_ip,
                                    world_size=args.world_size,
                                    rank=rank)
            model = DistributedDataParallel(model,
                                            device_ids=[gpu_id],
                                            find_unused_parameters=True)
            args.logger.info("Worker %d is training ... " % rank)
        else:
            args.logger.info("Worker is training ...")

    trainer = str2trainer[args.data_processor](args)
    trainer.train(args, gpu_id, rank, train_loader, model, optimizer,
                  scheduler)
Example #21
 def test_already_init(self, dist_init_required):
     torch.distributed.init_process_group('nccl')
     deepspeed.init_distributed('nccl',
                                dist_init_required=dist_init_required)
Example #22
import gpt_neox

WORLD_SIZE = os.getenv('WORLD_SIZE')

# arguments
train_args = get_args()
params = get_params(train_args.model)

# tokenizer
tokenizer = get_tokenizer(tokenizer_type=params["tokenizer"].get("type", None),
                          from_pretrained=params["tokenizer"].get("from_pretrained", True),
                          add_padding_token=params["tokenizer"].get("add_padding_token", False))
vocab_size = len(tokenizer) if params["vocab_size"] is None else params["vocab_size"]

# model
deepspeed.init_distributed(dist_backend='nccl')
torch.distributed.barrier()  # barrier will force processes to stop until *all* processes have reached the barrier

def loss_function(x, y):
    losses = torch.nn.functional.cross_entropy(x, y, reduction='none')
    loss = losses.mean()
    return loss
        
model = GPTNeoX_Pipe(
    num_tokens=params["vocab_size"],
    dim=params["hidden_dim"],
    seq_len=params["seq_len"],
    depth=params["n_layers"],
    heads=params["n_heads"],
    dim_head=params["dim_head"],
    loss_fn=loss_function,  # torch.nn.CrossEntropyLoss(),
Example #23
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--labels_num", type=int, required=True,
                        help="Number of prediction labels.")

    tokenizer_opts(parser)

    parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.")
    parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.")

    deepspeed_opts(parser)
    parser.add_argument("--mp_size", type=int, default=1, help="Model parallel size.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False
    deepspeed.init_distributed()
    model = Classifier(args)

    if args.load_model_path:
        model = load_model(model, args.load_model_path)

    model = deepspeed.init_inference(model=model, mp_size=args.mp_size, replace_method=None)

    rank = dist.get_rank()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if rank == 0:
        dataset = read_dataset(args, args.test_path)

        src = torch.LongTensor([sample[0] for sample in dataset])
        seg = torch.LongTensor([sample[1] for sample in dataset])

        batch_size = args.batch_size
        instances_num = src.size()[0]

        print("The number of prediction instances: ", instances_num)

        model.eval()

        with open(args.prediction_path, mode="w", encoding="utf-8") as f:
            f.write("label")
            if args.output_logits:
                f.write("\t" + "logits")
            if args.output_prob:
                f.write("\t" + "prob")
            f.write("\n")
            for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
                src_batch = src_batch.to(device)
                seg_batch = seg_batch.to(device)
                with torch.no_grad():
                    _, logits = model(src_batch, None, seg_batch)

                pred = torch.argmax(logits, dim=1)
                pred = pred.cpu().numpy().tolist()
                prob = nn.Softmax(dim=1)(logits)
                logits = logits.cpu().numpy().tolist()
                prob = prob.cpu().numpy().tolist()

                for j in range(len(pred)):
                    f.write(str(pred[j]))
                    if args.output_logits:
                        f.write("\t" + " ".join([str(v) for v in logits[j]]))
                    if args.output_prob:
                        f.write("\t" + " ".join([str(v) for v in prob[j]]))
                    f.write("\n")
Example #24
def main():
    parser = get_argument_parser()

    deepspeed.init_distributed(dist_backend='nccl')

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)

    args = parser.parse_args()
    args.local_rank = int(os.environ['LOCAL_RANK'])
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_squad_examples(input_file=args.train_file,
                                             is_training=True)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    # model = BertForQuestionAnswering.from_pretrained(args.bert_model,
    #            cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))

    # Support for word embedding padding checkpoints
    # Prepare model

    bert_model_config = {
        "vocab_size_or_config_json_file": 119547,
        "hidden_size": 1024,
        "num_hidden_layers": 24,
        "num_attention_heads": 16,
        "intermediate_size": 4096,
        "hidden_act": "gelu",
        "hidden_dropout_prob": args.dropout,
        "attention_probs_dropout_prob": args.dropout,
        "hidden_dropout_prob": 0.1,
        "attention_probs_dropout_prob": 0.1,
        "max_position_embeddings": 512,
        "type_vocab_size": 2,
        "initializer_range": 0.02
    }

    if args.ckpt_type == "DS":
        if args.preln:
            bert_config = BertConfigPreLN(**bert_model_config)
        else:
            bert_config = BertConfig(**bert_model_config)
    else:
        # Models from Tensorflow and Huggingface are post-LN.
        if args.preln:
            raise ValueError(
                "Should NOT use --preln if the loading checkpoint doesn't use pre-layer-norm."
            )

        # Use the original bert config if we want to load from a non-DeepSpeed checkpoint.
        if args.origin_bert_config_file is None:
            raise ValueError(
                "--origin_bert_config_file is required for loading non-DeepSpeed checkpoint."
            )

        bert_config = BertConfig.from_json_file(args.origin_bert_config_file)

        if bert_config.vocab_size != len(tokenizer.vocab):
            raise ValueError("vocab size from original checkpoint mismatch.")

    bert_config.vocab_size = len(tokenizer.vocab)
    # Padding for divisibility by 8
    if bert_config.vocab_size % 8 != 0:
        vocab_diff = 8 - (bert_config.vocab_size % 8)
        bert_config.vocab_size += vocab_diff

    if args.preln:
        model = BertForQuestionAnsweringPreLN(bert_config, args)
    else:
        model = BertForQuestionAnswering(bert_config, args)

    print("VOCAB SIZE:", bert_config.vocab_size)
    if args.model_file != "0":
        logger.info(f"Loading Pretrained Bert Encoder from: {args.model_file}")

        if args.ckpt_type == "DS":
            checkpoint_state_dict = torch.load(
                args.model_file, map_location=torch.device("cpu"))
            if 'module' in checkpoint_state_dict:
                logger.info('Loading DeepSpeed v2.0 style checkpoint')
                model.load_state_dict(checkpoint_state_dict['module'],
                                      strict=False)
            elif 'model_state_dict' in checkpoint_state_dict:
                model.load_state_dict(
                    checkpoint_state_dict['model_state_dict'], strict=False)
            else:
                raise ValueError("Unable to find model state in checkpoint")
        else:
            from convert_bert_ckpt_to_deepspeed import convert_ckpt_to_deepspeed
            convert_ckpt_to_deepspeed(model, args.ckpt_type, args.model_file,
                                      vocab_diff,
                                      args.deepspeed_transformer_kernel)

        logger.info(f"Pretrained Bert Encoder Loaded from: {args.model_file}")

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used,
    # and thus produces None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    if args.deepspeed_transformer_kernel:
        no_decay = no_decay + [
            'attn_nw', 'attn_nb', 'norm_w', 'norm_b', 'attn_qkvb', 'attn_ob',
            'inter_b', 'output_b'
        ]
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    model, optimizer, _, _ = deepspeed.initialize(
        args=args,
        model=model,
        model_parameters=optimizer_grouped_parameters,
        dist_init_required=True)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        #torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    # Make sure the output directory exists.
    os.makedirs(args.output_dir, exist_ok=True)

    # Prepare Summary writer
    if torch.distributed.get_rank() == 0 and args.job_name is not None:
        args.summary_writer = get_summary_writer(name=args.job_name,
                                                 base=args.output_dir)
    else:
        args.summary_writer = None

    logger.info("propagate deepspeed-config settings to client settings")
    args.train_batch_size = model.train_micro_batch_size_per_gpu()
    args.gradient_accumulation_steps = model.gradient_accumulation_steps()
    args.fp16 = model.fp16_enabled()
    args.print_steps = model.steps_per_print()
    args.learning_rate = model.get_lr()[0]
    args.wall_clock_breakdown = model.wall_clock_breakdown()

    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()

    global_step = 0
    if args.do_train:
        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
            list(filter(None, args.bert_model.split('/'))).pop(),
            str(args.max_seq_length), str(args.doc_stride),
            str(args.max_query_length))
        train_features = None
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except Exception:
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor(
            [f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_start_positions,
                                   all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        ema_loss = 0.
        sample_count = 0
        num_epoch = 0
        global all_step_time
        ave_rounds = 20
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            num_epoch += 1
            epoch_step = 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", smoothing=0)):
                start_time = time.time()
                bs_size = batch[0].size()[0]
                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu does scattering it-self
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch

                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                ema_loss = args.loss_plot_alpha * ema_loss + (
                    1 - args.loss_plot_alpha) * loss.item()

                model.backward(loss)
                loss_item = loss.item() * args.gradient_accumulation_steps
                loss = None

                sample_count += (args.train_batch_size *
                                 torch.distributed.get_world_size())

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step

                    model.step()
                    global_step += 1
                    epoch_step += 1

                    if torch.distributed.get_rank(
                    ) == 0 and args.summary_writer:
                        summary_events = [
                            (f'Train/Steps/lr', lr_this_step, global_step),
                            (f'Train/Samples/train_loss', loss_item,
                             sample_count),
                            (f'Train/Samples/lr', lr_this_step, sample_count),
                            (f'Train/Samples/train_ema_loss', ema_loss,
                             sample_count)
                        ]

                        if args.fp16 and hasattr(optimizer, 'cur_scale'):
                            summary_events.append(
                                (f'Train/Samples/scale', optimizer.cur_scale,
                                 sample_count))
                        write_summary_events(args.summary_writer,
                                             summary_events)
                        args.summary_writer.flush()

                    if torch.distributed.get_rank() == 0 and (
                            step + 1) % args.print_steps == 0:
                        logger.info(
                            f"bert_squad_progress: step={global_step} lr={lr_this_step} loss={ema_loss}"
                        )
                else:
                    model.step()

                if is_time_to_exit(args=args,
                                   epoch_steps=epoch_step,
                                   global_steps=global_step):
                    logger.info(
                        f'Warning: Early epoch termination due to max steps limit, epoch step ={epoch_step}, global step = {global_step}, epoch = {num_epoch}'
                    )
                    break
                one_step_time = time.time() - start_time
                all_step_time += one_step_time
                if (step + 1) % (
                        ave_rounds) == 0 and torch.distributed.get_rank() == 0:
                    print(
                        ' At step {}, averaged throughput for {} rounds is: {} Samples/s'
                        .format(
                            step, ave_rounds,
                            bs_size * ave_rounds *
                            torch.distributed.get_world_size() /
                            all_step_time),
                        flush=True)
                    all_step_time = 0.0

    # Save a trained model
    # model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
    #output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    # if args.do_train:
    #    torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned

    #model_state_dict = torch.load(output_model_file)
    #model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict)
    # model.to(device)

    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        eval_examples = read_squad_examples(input_file=args.predict_file,
                                            is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(
                    input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, args.verbose_logging)
예제 #25
0
    if args.precision == 'bf16':
        af2features.dtype = torch.bfloat16
    elif args.precision == 'fp16':
        af2features.dtype = torch.float16

    af2 = AlphaFold(config=config.model,
                    target_dim=22,
                    msa_dim=49,
                    extra_msa_dim=25).to(device='cuda')

    os.environ['RANK'] = '0'
    os.environ['LOCAL_RANK'] = '0'
    os.environ['WORLD_SIZE'] = '1'
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '6000'
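    # The env vars above emulate a single-process "cluster" (rank 0, world size 1)
    # so init_distributed() can create the process group without a launcher or MPI.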
    deepspeed.init_distributed(auto_mpi_discovery=False)
    af2, optimizer, _, _ = deepspeed.initialize(
        model=af2,
        model_parameters=af2.parameters(),
        config=args.deepspeed_config_path,
        dist_init_required=True)
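    # The process group was already created above, so dist_init_required=True is
    # effectively a no-op here (init_distributed skips re-initialization).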

    with open(args.dataset_dir / args.sample_name, 'rb') as f:
        raw_features = pickle.load(f)

    batch = af2features(raw_features)
    if args.precision == 'fp16':
        batch = af2features.convert(batch,
                                    dtypes={
                                        torch.float32: torch.float16,
                                        torch.float64: torch.float32
                                    })
예제 #26
0
def train(args):
    num_epochs = args.epochs
    local_rank = args.local_rank
    if local_rank == -1:
        local_rank = int(os.environ.get('PMIX_RANK', -1))

    deepspeed.init_distributed(timeout=timedelta(minutes=5))
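    # WORLD_SIZE is exported by the launcher (e.g. the deepspeed/torchrun wrapper).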
    world_size = int(os.environ['WORLD_SIZE'])

    torch.manual_seed(0)

    # Set up standard model.
    if local_rank == 0:
        print('Using {} model'.format(args.model))
    model = getattr(models, args.model)()
    model = model.cuda()

    criterion = nn.CrossEntropyLoss().cuda()

    train_dataset = dataset_from_datadir(args.datadir)

    model_engine, optimizer, train_loader, __ = deepspeed.initialize(
        args=args,
        model=model,
        model_parameters=model.parameters(),
        training_data=train_dataset)
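    # deepspeed.initialize returns (engine, optimizer, dataloader, lr_scheduler);
    # passing training_data lets DeepSpeed build the distributed DataLoader itself.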

    # For final average
    avg_images = 0
    avg_start = None
    tot_steps = 0

    for epoch in range(num_epochs):
        for i, data in enumerate(train_loader):
            images = data[0].to(model_engine.local_rank)
            labels = data[1].to(model_engine.local_rank)

            outputs = model_engine(images)
            loss = criterion(outputs, labels)

            model_engine.backward(loss)
            model_engine.step()

            li = len(images)
            # last_images += li

            tot_steps += 1
            if tot_steps == args.warmup_steps:
                avg_start = datetime.now()
            elif tot_steps > args.warmup_steps:
                avg_images += li

            if args.steps is not None and tot_steps >= args.steps:
                break

    if local_rank == 0:
        if avg_start is None:
            print(
                "WARNING: stopped before warmup steps done, not printing stats."
            )
        else:
            dur = datetime.now() - avg_start
            print(f"Training completed in: {dur}")
            print(
                f"Images/sec: {avg_images*world_size/dur.total_seconds():.2f} "
                f"(average, skipping {args.warmup_steps} warmup steps)")
예제 #27
0
    net = AlexNet(num_classes=10)
    net = PipelineModule(layers=join_layers(net),
                         loss_fn=torch.nn.CrossEntropyLoss(),
                         num_stages=args.pipeline_parallel_size,
                         partition_method=part,
                         activation_checkpoint_interval=0)

    trainset = cifar_trainset(args.local_rank)

    engine, _, _, _ = deepspeed.initialize(
        args=args,
        model=net,
        model_parameters=[p for p in net.parameters() if p.requires_grad],
        training_data=trainset)
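    # train_batch() runs forward, backward and optimizer step for one full batch of
    # pipeline micro-batches, pulling data from the loader built from training_data.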

    for step in range(args.steps):
        loss = engine.train_batch()


if __name__ == '__main__':
    args = get_args()

    deepspeed.init_distributed(dist_backend=args.backend)
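    # LOCAL_RANK is set by the DeepSpeed launcher; pin each process to its GPU.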
    args.local_rank = int(os.environ['LOCAL_RANK'])
    torch.cuda.set_device(args.local_rank)

    if args.pipeline_parallel_size == 0:
        train_base(args)
    else:
        train_pipe(args)
예제 #28
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    parser.add_argument("--world_size",
                        type=int,
                        default=1,
                        help="Total number of processes (GPUs) for training.")

    tokenizer_opts(parser)

    parser.add_argument("--soft_targets",
                        action='store_true',
                        help="Train model with logits.")
    parser.add_argument("--soft_alpha",
                        type=float,
                        default=0.5,
                        help="Weight of the soft targets loss.")

    deepspeed_opts(parser)

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Count the number of labels.
    args.labels_num = count_labels_num(args.train_path)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model.
    model = Classifier(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    # Get logger.
    args.logger = init_logger(args)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "gamma", "beta"]
    optimizer_grouped_parameters = [{
        "params": [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01
    }, {
        "params": [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0
    }]

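    # Set up the process group first; deepspeed.initialize below is told not to
    # re-initialize it via dist_init_required=False.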
    deepspeed.init_distributed()
    rank = dist.get_rank()
    args.rank = rank

    trainset = read_dataset(args, args.train_path, split=True)[args.rank]
    random.shuffle(trainset)
    instances_num = len(trainset)
    batch_size = args.batch_size
    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    custom_optimizer, custom_scheduler = build_optimizer(args, model)

    model, optimizer, _, scheduler = deepspeed.initialize(
        model=model,
        model_parameters=optimizer_grouped_parameters,
        args=args,
        optimizer=custom_optimizer,
        lr_scheduler=custom_scheduler,
        mpu=None,
        dist_init_required=False)
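    # Passing a client optimizer/scheduler overrides any optimizer section in the
    # DeepSpeed config; DeepSpeed wraps them rather than building its own.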

    src = torch.LongTensor([example[0] for example in trainset])
    tgt = torch.LongTensor([example[1] for example in trainset])
    seg = torch.LongTensor([example[2] for example in trainset])
    if args.soft_targets:
        soft_tgt = torch.FloatTensor([example[3] for example in trainset])
    else:
        soft_tgt = None

    args.model = model
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    total_loss, result, best_result, best_epoch = 0.0, 0.0, 0.0, 0

    result_tensor = torch.tensor(result).to(args.device)
    if args.rank == 0:
        args.logger.info("Batch size: {}".format(batch_size))
        args.logger.info(
            "The number of training instances: {}".format(instances_num))
        args.logger.info("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (src_batch, tgt_batch, seg_batch, soft_tgt_batch) in enumerate(
                batch_loader(batch_size, src, tgt, seg, soft_tgt)):
            loss = train_model(args, model, optimizer, scheduler, src_batch,
                               tgt_batch, seg_batch, soft_tgt_batch)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0 and args.rank == 0:
                args.logger.info(
                    "Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                    format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0
        if args.rank == 0:
            result = evaluate(args,
                              read_dataset(args, args.dev_path, split=False))
            result_tensor = torch.tensor(result[0]).to(args.device)
        dist.broadcast(result_tensor, 0, async_op=False)
        if result_tensor.float() >= best_result:
            best_result = result_tensor.float().item()
            best_epoch = epoch
        model.save_checkpoint(args.output_model_path, str(epoch))

    # Evaluation phase.
    if args.test_path is not None and args.rank == 0:
        args.logger.info("Test set evaluation.")
        model.load_checkpoint(args.output_model_path, str(best_epoch))
        evaluate(args, read_dataset(args, args.test_path, split=False), True)
예제 #29
0
    parser.add_argument("--mp_size",
                        type=int,
                        default=1,
                        help="Model parallel size.")

    args = parser.parse_args()

    args.batch_size = 1

    args = load_hyperparam(args)

    args.tokenizer = str2tokenizer[args.tokenizer](args)

    model = GenerateLm(args)
    model = load_model(model, args.load_model_path)
    deepspeed.init_distributed()
    model = deepspeed.init_inference(model=model,
                                     mp_size=args.mp_size,
                                     replace_method=None)
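    # init_inference wraps the model in DeepSpeed's inference engine;
    # mp_size is the desired model-parallel degree (1 here means no slicing).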

    rank = dist.get_rank()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if rank == 0:
        model.eval()

        with open(args.test_path, mode="r", encoding="utf-8") as f:
            line = f.readline().strip()
            src = args.tokenizer.convert_tokens_to_ids(
                [CLS_TOKEN] + args.tokenizer.tokenize(line))
            seg = [1] * len(src)
            beginning_length = len(src)
예제 #30
0
def main_worker(save_dir, args):
    # basic setup
    cudnn.benchmark = True

    if args.log_name is not None:
        log_dir = "runs/%s" % args.log_name
    else:
        log_dir = f"runs/{datetime.datetime.now().strftime('%m-%d-%H-%M-%S')}"

    if args.local_rank == 0:
        logger = SummaryWriter(log_dir)
    else:
        logger = None

    deepspeed.init_distributed(dist_backend='nccl')
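    # Bind each process to its local GPU before any CUDA allocations.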
    torch.cuda.set_device(args.local_rank)

    model = SetVAE(args)
    parameters = model.parameters()

    n_parameters = sum(p.numel() for p in parameters if p.requires_grad)
    print(f'number of params: {n_parameters}')
    try:
        n_gen_parameters = sum(p.numel() for p in model.init_set.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.pre_decoder.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.decoder.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.post_decoder.parameters() if p.requires_grad) + \
                           sum(p.numel() for p in model.output.parameters() if p.requires_grad)
        print(f'number of generator params: {n_gen_parameters}')
    except AttributeError:
        pass

    optimizer, criterion = model.make_optimizer(args)

    # initialize datasets and loaders
    train_dataset, val_dataset, train_loader, val_loader = get_datasets(args)

    # initialize the learning rate scheduler
    if args.scheduler == 'exponential':
        assert not (args.warmup_epochs > 0)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(
            optimizer, args.exp_decay)
    elif args.scheduler == 'step':
        assert not (args.warmup_epochs > 0)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=args.epochs // 2,
                                                    gamma=0.1)
    elif args.scheduler == 'linear':

        def lambda_rule(ep):
            lr_w = min(1., ep /
                       args.warmup_epochs) if (args.warmup_epochs > 0) else 1.
            lr_l = 1.0 - max(0, ep - 0.5 * args.epochs) / float(
                0.5 * args.epochs)
            return lr_l * lr_w

        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                      lr_lambda=lambda_rule)
    elif args.scheduler == 'cosine':
        assert not (args.warmup_epochs > 0)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=args.epochs)
    else:
        # Fallback: constant learning rate (identity scheduler)
        def lambda_rule(ep):
            return 1.0

        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                      lr_lambda=lambda_rule)

    # extract collate_fn
    if args.distributed:
        collate_fn = deepcopy(train_loader.collate_fn)
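        # DeepSpeed returns its own engine, optimizer, dataloader (built from
        # training_data with the copied collate_fn) and scheduler.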
        model, optimizer, train_loader, scheduler = deepspeed.initialize(
            args=args,
            model=model,
            optimizer=optimizer,
            model_parameters=parameters,
            training_data=train_dataset,
            collate_fn=collate_fn,
            lr_scheduler=scheduler)

    # resume checkpoints
    start_epoch = 0
    if args.resume_checkpoint is None and (Path(save_dir) /
                                           'checkpoint-latest.pt').exists():
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
        print('Resumed from: ' + args.resume_checkpoint)
    if args.resume_checkpoint is not None:
        if args.distributed:
            if args.resume_optimizer:
                model.module, model.optimizer, model.lr_scheduler, start_epoch = resume(
                    args.resume_checkpoint,
                    model.module,
                    model.optimizer,
                    scheduler=model.lr_scheduler,
                    strict=(not args.resume_non_strict))
            else:
                model.module, _, _, start_epoch = resume(
                    args.resume_checkpoint,
                    model.module,
                    optimizer=None,
                    strict=(not args.resume_non_strict))
        else:
            if args.resume_optimizer:
                model, optimizer, scheduler, start_epoch = resume(
                    args.resume_checkpoint,
                    model,
                    optimizer,
                    scheduler=scheduler,
                    strict=(not args.resume_non_strict))
            else:
                model, _, _, start_epoch = resume(
                    args.resume_checkpoint,
                    model,
                    optimizer=None,
                    strict=(not args.resume_non_strict))

    # save dataset statistics
    if args.local_rank == 0:
        train_dataset.save_statistics(save_dir)
        val_dataset.save_statistics(save_dir)

    # main training loop
    avg_meters = {
        'kl_avg_meter': AverageValueMeter(),
        'l2_avg_meter': AverageValueMeter()
    }

    assert args.distributed

    epoch = start_epoch
    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    for epoch in range(start_epoch, args.epochs):
        if args.local_rank == 0:
            # evaluate on the validation set
            if epoch % args.val_freq == 0 and epoch != 0:
                model.eval()
                with torch.no_grad():
                    val_res = validate(model.module, args, val_loader, epoch,
                                       logger, save_dir)
                    for k, v in val_res.items():
                        v = v.cpu().detach().item()
                        send_slack(f'{k}:{v}, Epoch {epoch - 1}')
                        if logger is not None and v is not None:
                            logger.add_scalar(f'val_sample/{k}', v, epoch - 1)

        # train for one epoch
        train_one_epoch(epoch, model, criterion, optimizer, args, train_loader,
                        avg_meters, logger)

        # Only on the rank-0 (head) process
        if args.local_rank == 0:
            # save checkpoints
            if (epoch + 1) % args.save_freq == 0:
                if args.eval:
                    validate_reconstruct_l2(epoch, val_loader, model,
                                            criterion, args, logger)
                save(model.module, model.optimizer, model.lr_scheduler,
                     epoch + 1,
                     Path(save_dir) / f'checkpoint-{epoch}.pt')
                save(model.module, model.optimizer, model.lr_scheduler,
                     epoch + 1,
                     Path(save_dir) / 'checkpoint-latest.pt')

            # save visualizations
            if (epoch + 1) % args.viz_freq == 0:
                with torch.no_grad():
                    visualize(model.module, args, val_loader, epoch, logger)

        # adjust the learning rate
        model.lr_scheduler.step()
        if logger is not None and args.local_rank == 0:
            logger.add_scalar('train lr',
                              model.lr_scheduler.get_last_lr()[0], epoch)

    model.eval()
    if args.local_rank == 0:
        with torch.no_grad():
            val_res = validate(model.module, args, val_loader, epoch, logger,
                               save_dir)
            for k, v in val_res.items():
                v = v.cpu().detach().item()
                send_slack(f'{k}:{v}, Epoch {epoch}')
                if logger is not None and v is not None:
                    logger.add_scalar(f'val_sample/{k}', v, epoch)

    if logger is not None:
        logger.flush()
        logger.close()