Example #1
def generate_samples(model, tokenizer, args):
    print (f"generate_samples was called with model {model} \n and tokenizer {tokenizer}")
    model.eval()
    with torch.no_grad():
        while True:
            torch.distributed.barrier(group=mpu.get_model_parallel_group())
            terminate_runs = 0
            print (f"terminate_runs = {terminate_runs}")

            if mpu.get_model_parallel_rank() == 0:
                print ("get_model_parallel_rank() was 0")
#                 raw_text = input("\nContext prompt (stop to exit) >>> ")
                raw_text = "localStorage.getItem("
                while not raw_text:
                    print('Prompt should not be empty!')
                    raw_text = input("\nContext prompt (stop to exit) >>> ")

                if "stop" in raw_text:
                    terminate_runs = 1
                else:
                    context_tokens = tokenizer(raw_text)['input_ids']
                    context_length = len(context_tokens)

                    if context_length >= args.seq_length // 2:
                        print("\nContext length", context_length,
                              "\nPlease give smaller context (half of the sequence length)!")
                        continue
            else:
                print (f"get_model_parallel_rank() was NOT 0 but {mpu.get_model_parallel_rank()}")
                _ = tokenizer("EMPTY TEXT")['input_ids']

            terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs])
            torch.distributed.broadcast(terminate_runs_tensor, mpu.get_model_parallel_src_rank(),
                                        group=mpu.get_model_parallel_group())
            terminate_runs = terminate_runs_tensor[0].item()

            if terminate_runs == 1:
                return

            start_time = time.time()
            print ("generating...")
            generated = generate(
                model, tokenizer, raw_text,
                out_seq_length=args.out_seq_length,
                seq_length=args.seq_length,
                temperature=args.temperature,
                top_k=args.top_k,
                top_p=args.top_p
            )

            if mpu.get_model_parallel_rank() == 0:
                print ("We should clear the terminal and print results...")
                os.system('clear')
                print("\nTime taken: {:.2f}\n".format(time.time() - start_time), flush=True)
                print("\nContext:", raw_text, flush=True)
                print("\nGPT:", generated, flush=True)
            raw_text = None

            torch.distributed.barrier(group=mpu.get_model_parallel_group())
Example #2
def generate_samples(model, tokenizer, args):
    model.eval()
    with torch.no_grad():
        while True:
            torch.distributed.barrier(group=mpu.get_model_parallel_group())
            terminate_runs = 0

            if mpu.get_model_parallel_rank() == 0:
                raw_text = input("\nContext prompt (stop to exit) >>> ")
                while not raw_text:
                    print('Prompt should not be empty!')
                    raw_text = input("\nContext prompt (stop to exit) >>> ")

                if "stop" in raw_text:
                    terminate_runs = 1
                else:
                    context_tokens = tokenizer(raw_text)['input_ids']
                    context_length = len(context_tokens)

                    if context_length >= args.seq_length // 2:
                        print(
                            "\nContext length", context_length,
                            "\nPlease give smaller context (half of the sequence length)!"
                        )
                        continue
            else:
                _ = tokenizer("EMPTY TEXT")['input_ids']

            terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs])
            torch.distributed.broadcast(terminate_runs_tensor,
                                        mpu.get_model_parallel_src_rank(),
                                        group=mpu.get_model_parallel_group())
            terminate_runs = terminate_runs_tensor[0].item()

            if terminate_runs == 1:
                return

            start_time = time.time()
            generated = generate(model,
                                 tokenizer,
                                 raw_text,
                                 out_seq_length=args.out_seq_length,
                                 seq_length=args.seq_length,
                                 temperature=args.temperature,
                                 top_k=args.top_k,
                                 top_p=args.top_p)

            if mpu.get_model_parallel_rank() == 0:
                os.system('clear')
                print("\nTaken time {:.2f}\n".format(time.time() - start_time),
                      flush=True)
                print("\nContext:", raw_text, flush=True)
                print("\nGPT:", generated, flush=True)
            raw_text = None

            torch.distributed.barrier(group=mpu.get_model_parallel_group())
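Note: the prompt handling above runs only on model-parallel rank 0; the terminate_runs flag is then shared with the other ranks via torch.distributed.broadcast. Below is a minimal single-process sketch of that broadcast call, assuming only PyTorch with the gloo backend (the address and port are arbitrary placeholders); with world_size == 1 the collective is a no-op, but it exercises the same API.

import torch
import torch.distributed as dist

# Single-process sketch: with world_size == 1 the broadcast is a no-op,
# but it is the same call used to share terminate_runs above.
dist.init_process_group('gloo', init_method='tcp://127.0.0.1:29500',
                        rank=0, world_size=1)
terminate_runs_tensor = torch.LongTensor([1])
dist.broadcast(terminate_runs_tensor, src=0)  # every rank ends up with the rank-0 value
print(terminate_runs_tensor[0].item())        # 1
dist.destroy_process_group()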
Example #3
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT3 model ...')
    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=False)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])),
              flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    model = DDP(model)

    return model
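Note: the parameter count printed on data-parallel rank 0 is simply the sum of p.nelement() over model.parameters(). A tiny sketch of the same computation on a toy module rather than GPT3Model:

import torch.nn as nn

# Toy stand-in for GPT3Model: a Linear layer with 10*4 weights + 4 biases = 44 parameters.
toy = nn.Linear(10, 4)
print(sum(p.nelement() for p in toy.parameters()))  # 44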
Example #4
def get_train_val_test_data(args):
    """Load the data on rank zero and boradcast number of tokens to all GPUS."""

    (train_data, val_data, test_data) = (None, None, None)

    # Data loader only on rank 0 of each model parallel group.
    if mpu.get_model_parallel_rank() == 0:
        (train_data, val_data, test_data), num_tokens, eod_token, tokenizer = make_gpt3_dataloaders(args)
        before = num_tokens
        after = before
        multiple = args.make_vocab_size_divisible_by * mpu.get_model_parallel_world_size()
        while (after % multiple) != 0:
            after += 1
        print_rank_0(
            '> padded vocab (size: {}) with {} dummy tokens (new size: {})'.format(before, after - before, after))
        print_rank_0('> end-of-document token: {}'.format(eod_token))
        token_counts = torch.cuda.LongTensor(
            [after, eod_token, int(args.do_train), int(args.do_valid), int(args.do_test)])
    else:
        tokenizer = None
        token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])

    # Broadcast num tokens.
    torch.distributed.broadcast(token_counts,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    num_tokens = token_counts[0].item()
    eod_token = token_counts[1].item()
    args.do_train = token_counts[2].item()
    args.do_valid = token_counts[3].item()
    args.do_test = token_counts[4].item()

    return train_data, val_data, test_data, num_tokens, eod_token, tokenizer
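Note: the padding loop rounds the tokenizer vocabulary up to a multiple of make_vocab_size_divisible_by * model_parallel_world_size so the embedding table splits evenly across model-parallel ranks. A standalone sketch of that arithmetic, assuming nothing beyond plain Python (the helper name pad_vocab_size is hypothetical):

def pad_vocab_size(num_tokens, make_vocab_size_divisible_by, model_parallel_world_size):
    """Round num_tokens up to the nearest multiple of
    make_vocab_size_divisible_by * model_parallel_world_size."""
    multiple = make_vocab_size_divisible_by * model_parallel_world_size
    padded = num_tokens
    while padded % multiple != 0:
        padded += 1
    return padded

# GPT-2 style vocab of 50257 tokens, divisible-by-128, 2-way model parallelism.
print(pad_vocab_size(50257, 128, 2))  # 50432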
Example #5
    def __init__(self, module):
        super(DistributedDataParallel, self).__init__()
        self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

        self.module = module
        self.data_parallel_group = mpu.get_data_parallel_group()
        src_rank = mpu.get_model_parallel_rank()
        for p in self.module.parameters():
            if torch.is_tensor(p):
                dist.broadcast(p, src_rank, group=self.data_parallel_group)

        def allreduce_params(reduce_after=True,
                             no_scale=False,
                             fp32_allreduce=False):
            if (self.needs_reduction):
                self.needs_reduction = False
                buckets = {}
                for name, param in self.module.named_parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = (param.data.type())
                        if tp not in buckets:
                            buckets[tp] = []
                        buckets[tp].append(param)
                if self.warn_on_half:
                    if 'torch.cuda.HalfTensor' in buckets:  # bucket keys are dtype strings from param.data.type()
                        print(
                            "WARNING: gloo dist backend for half parameters may be extremely slow."
                            +
                            " It is recommended to use the NCCL backend in this case."
                        )
                        self.warn_on_half = False
                for tp in buckets:
                    bucket = buckets[tp]
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    if fp32_allreduce:
                        coalesced = coalesced.float()
                    if not no_scale and not reduce_after:
                        coalesced /= dist.get_world_size(
                            group=self.data_parallel_group)
                    dist.all_reduce(coalesced, group=self.data_parallel_group)
                    torch.cuda.synchronize()
                    if not no_scale and reduce_after:
                        coalesced /= dist.get_world_size(
                            group=self.data_parallel_group)
                    for buf, synced in zip(
                            grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)

        self.hook_handles = []
        self.hooks = []
        for param in list(self.module.parameters()):

            def allreduce_hook(*unused):
                Variable._execution_engine.queue_callback(allreduce_params)

        #    handle = param.register_hook(allreduce_hook)
        #self.hooks.append(allreduce_hook)
        #self.hook_handles.append(handle)
        self.allreduce_params = allreduce_params
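Note: allreduce_params buckets gradients by dtype, flattens each bucket into one contiguous buffer with _flatten_dense_tensors, all-reduces that buffer, and copies the result back with _unflatten_dense_tensors. A single-process sketch of the flatten/scale/unflatten round trip, where a plain division stands in for the all-reduce and world-size scaling:

import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

grads = [torch.ones(3), torch.full((2, 2), 2.0)]
coalesced = _flatten_dense_tensors(grads)   # one contiguous buffer for the whole bucket
coalesced /= 2.0                            # stand-in for all_reduce + division by world_size
for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
    buf.copy_(synced)                       # write the reduced values back in place

print(grads[0])  # tensor([0.5000, 0.5000, 0.5000])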
Example #6
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT3 model ...')
    assert args.num_attention_heads % args.model_parallel_size == 0
    num_local_heads = args.num_attention_heads // args.model_parallel_size
    deepspeed_sparsity_config = None
    if DEEPSPEED_WRAP and args.deepspeed:
        deepspeed_sparsity_config = get_sparse_attention_config(args, num_local_heads)
    if deepspeed_sparsity_config is not None:
        print_rank_0(f"Use sparse attention with mode {args.sparse_mode}")
    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      deepspeed_sparsity_config=deepspeed_sparsity_config,
                      sparse_mode=args.sparse_mode)

    if args.load_huggingface is not None:
        model = load_huggingface_model(model, args.load_huggingface, args.huggingface_double_pos_embeddings)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if DEEPSPEED_WRAP and args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
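Note: the assertion and integer division at the top of get_model split the attention heads evenly across the model-parallel group, and that per-rank head count is what the sparse-attention config is built from. The arithmetic, spelled out with example numbers:

# e.g. 16 attention heads split over a 4-way model-parallel group
num_attention_heads, model_parallel_size = 16, 4
assert num_attention_heads % model_parallel_size == 0
num_local_heads = num_attention_heads // model_parallel_size
print(num_local_heads)  # 4 heads live on each model-parallel rank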
Example #7
def get_checkpoint_name(checkpoints_path,
                        iteration,
                        release=False,
                        zero=False):
    if release:
        d = 'release'
    else:
        d = 'iter_{:07d}'.format(iteration)
    if zero:
        dp_rank = mpu.get_data_parallel_rank()
        d += '_zero_dp_rank_{}'.format(dp_rank)
    return os.path.join(checkpoints_path, d,
                        'mp_rank_{:02d}'.format(mpu.get_model_parallel_rank()),
                        'model_optim_rng.pt')
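Note: the resulting layout is checkpoints_path/iter_XXXXXXX/mp_rank_XX/model_optim_rng.pt, or release/... for release checkpoints, with an extra _zero_dp_rank_N component for ZeRO partitions. A standalone sketch that takes the ranks as arguments instead of querying mpu (the helper name is hypothetical):

import os

def checkpoint_name_sketch(checkpoints_path, iteration, mp_rank,
                           release=False, zero=False, dp_rank=0):
    d = 'release' if release else 'iter_{:07d}'.format(iteration)
    if zero:
        d += '_zero_dp_rank_{}'.format(dp_rank)
    return os.path.join(checkpoints_path, d,
                        'mp_rank_{:02d}'.format(mp_rank), 'model_optim_rng.pt')

print(checkpoint_name_sketch('/checkpoints/gpt3', 5000, mp_rank=1))
# /checkpoints/gpt3/iter_0005000/mp_rank_01/model_optim_rng.pt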
Example #8
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT3 model ...')
    print ("Calling GPT3Model constructor...")  
    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=False)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # GPU allocation.
    print (f"placing the model on device {torch.cuda.current_device()}")
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        rint ("we have NOT halfed the model before, and now we're wrapping it into a fp16_module. For...some reason...")
        model = FP16_Module(model)

    # Wrap model for distributed training.
    print ("Setting up distributed training...")
    print ("No classic pytorch DDP this time; \nUsing sberbank magic DDP")
    model = DDP(model)

    input ("ready to return model")
    return model
Example #9
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT3 model ...')
    print ("asserting we have a correct number of attention heads...")
    assert args.num_attention_heads % args.model_parallel_size == 0
    num_local_heads = args.num_attention_heads // args.model_parallel_size
    deepspeed_sparsity_config = None
    if DEEPSPEED_WRAP and args.deepspeed:
        print ("we're using deepspeed, and so we're getting a sparse attention config")
        deepspeed_sparsity_config = get_sparse_attention_config(args, num_local_heads)
    if deepspeed_sparsity_config is not None:
        print_rank_0(f"Using sparse attention with mode {args.sparse_mode}")
    print ("Calling GPT3Model constructor...")    
    model = GPT3Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      deepspeed_sparsity_config=deepspeed_sparsity_config,
                      sparse_mode=args.sparse_mode)

    if args.load_huggingface is not None:
        print ("Loading huggingface model...")
        model = load_huggingface_model(model, args.load_huggingface, args.huggingface_double_pos_embeddings)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if DEEPSPEED_WRAP and args.deepspeed and args.fp16:
        print ("We've had deepspeed AND fp16, so we're halfing the model...")
        model.half()

    # GPU allocation.
    print (f"placing the model on device {torch.cuda.current_device()}")
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        print ("we've halfed the model before, but now we're wrapping it into a fp16_module. For...some reason...")
        model = FP16_Module(model)

    # Wrap model for distributed training.
    print ("Setting up distributed training...")
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        print (f"Using classic pytorch DDP with device {i}")
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        print ("Using sberbank magic DDP")
        model = DDP(model)

#     input ("ready to return model")
    print ("ready to return model")
    return model
Example #10
File: utils.py Project: xbodx/ru-gpts
def load_checkpoint(model, optimizer, lr_scheduler, args, deepspeed=False):
    """Load a model checkpoint."""

    iteration, release, success = get_checkpoint_iteration(args)

    if not success:
        return 0

    if deepspeed:
        load_optim = not args.no_load_optim
        checkpoint_name, sd = model.load_checkpoint(
            args.load,
            iteration,
            load_optimizer_states=load_optim,
            load_lr_scheduler_states=load_optim)

        if checkpoint_name is None:
            if mpu.get_data_parallel_rank() == 0:
                print("Unable to load checkpoint.")
            return iteration

    else:

        # Checkpoint.
        checkpoint_name = get_checkpoint_name(args.load, iteration, release)

        # Load the checkpoint.
        if os.path.isfile(checkpoint_name):
            sd = torch.load(checkpoint_name, map_location='cpu')
        else:
            # Try load deepspeed checkpoint with only megatron
            checkpoint_name = os.path.join(
                args.load, str(iteration),
                'mp_rank_{:02d}_model_states.pt'.format(
                    mpu.get_model_parallel_rank()))
            sd = torch.load(checkpoint_name, map_location='cpu')

        if mpu.get_data_parallel_rank() == 0:
            print('global rank {} is loading checkpoint {}'.format(
                torch.distributed.get_rank(), checkpoint_name))

        if isinstance(model, torchDDP):
            model = model.module

        # Model.
        try:
            model.load_state_dict(sd['model'])
        except KeyError:
            try:
                model.load_state_dict(sd['module'])
            except KeyError:
                print_rank_0(
                    'A metadata file exists but unable to load model '
                    'from checkpoint {}, exiting'.format(checkpoint_name))
                exit()

        # Optimizer.
        if not release and not args.finetune and not args.no_load_optim:
            try:
                if optimizer is not None:
                    optimizer.load_state_dict(sd['optimizer'])
                if lr_scheduler is not None:
                    lr_scheduler.load_state_dict(sd['lr_scheduler'])
            except KeyError:
                print_rank_0(
                    'Unable to load optimizer from checkpoint {}, exiting. '
                    'Specify --no-load-optim or --finetune to prevent '
                    'attempting to load the optimizer '
                    'state.'.format(checkpoint_name))
                exit()

    # Iterations.
    if args.finetune or release:
        iteration = 0
    else:
        try:
            iteration = sd['iteration']
        except KeyError:
            try:  # Backward compatible with older checkpoints
                iteration = sd['total_iters']
            except KeyError:
                print_rank_0(
                    'A metadata file exists but unable to load iteration '
                    'from checkpoint {}, exiting'.format(checkpoint_name))
                exit()

    # rng states.
    if not release and not args.finetune and not args.no_load_rng:
        try:
            random.setstate(sd['random_rng_state'])
            np.random.set_state(sd['np_rng_state'])
            torch.set_rng_state(sd['torch_rng_state'])
            torch.cuda.set_rng_state(sd['cuda_rng_state'])
            mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states'])
        except KeyError:
            print_rank_0(
                'Unable to load RNG state from checkpoint {}, exiting. '
                'Specify --no-load-rng or --finetune to prevent '
                'attempting to load the RNG '
                'state.'.format(checkpoint_name))
            exit()

    torch.distributed.barrier()
    if mpu.get_data_parallel_rank() == 0:
        print('  successfully loaded {}'.format(checkpoint_name))

    return iteration
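Note: the nested try/except when restoring weights covers two checkpoint layouts, with the state dict stored under 'model' (Megatron-style) or 'module' (DeepSpeed-style model_states files). A toy sketch of that fallback, using a fabricated checkpoint dict for illustration:

import torch.nn as nn

toy = nn.Linear(4, 2)
sd = {'module': toy.state_dict(), 'iteration': 1000}  # DeepSpeed-style layout
try:
    state = sd['model']
except KeyError:
    state = sd['module']
toy.load_state_dict(state)
print(sd.get('iteration', sd.get('total_iters', 0)))  # 1000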