Example #1
def setup_model_and_optimizer(model_provider_func):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)
    optimizer = get_optimizer(model)
    lr_scheduler = get_learning_rate_scheduler(optimizer)

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")

        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=args,
            lr_scheduler=lr_scheduler,
            mpu=mpu,
            dist_init_required=False)
    if args.load is not None:
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
    else:
        args.iteration = 0

    # get model without FP16 and/or TorchDDP wrappers
    unwrapped_model = model
    while hasattr(unwrapped_model, 'module'):
        unwrapped_model = unwrapped_model.module

    if args.iteration == 0 and hasattr(unwrapped_model,
                                       'init_state_dict_from_bert'):
        print("Initializing ICT from pretrained BERT model", flush=True)
        unwrapped_model.init_state_dict_from_bert()

    return model, optimizer, lr_scheduler
Example #2
File: evaluate.py  Project: yf1291/nlp3
def main():
    """Main program."""
    args = get_args()

    if args.task == 'LAMBADA':
        eval_metric = 'accuracy'
    elif args.task == 'WIKITEXT103':
        eval_metric = 'loss'
    else:
        raise NotImplementedError('{} task is not implemented.'.format(
            args.task))

    # Set up model and load checkpoint.
    model = get_model(get_model_provider(eval_metric))
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    # Data stuff.
    dataset = build_dataset(args.task)
    dataloader = build_data_loader(dataset,
                                   args.batch_size,
                                   args.num_workers,
                                   drop_last=False)

    # Run evaluation.
    evaluate_and_print_results(args.task, dataloader, model, eval_metric)

    print_rank_0('done :-)')
Example #3
def main():
    """Main program."""

    initialize_megatron(extra_args_provider=add_text_generate_args,
                        args_defaults={
                            'tokenizer_type': 'GPT2BPETokenizer',
                            'no_load_rng': True,
                            'no_load_optim': True
                        })

    args = get_args()
    if args.num_layers_per_virtual_pipeline_stage is not None:
        print(
            "Interleaved pipeline schedule is not yet supported for text generation."
        )
        exit()

    # Set up model and load checkpoint.
    model = get_model(model_provider)

    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    assert len(model) == 1, "Above condition should have caught this"
    model = model[0]

    # Generate samples.
    if args.num_samples == 0:
        args.micro_batch_size = 1
        if args.sample_input_file is not None:
            generate_samples_input_from_file(model)
        else:
            generate_samples_interactive(model)
    else:
        generate_and_write_samples_unconditional(model)
Example #4
def setup_model_and_optimizer(model_provider_func):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)
    optimizer, param_groups = get_optimizer(model)
    lr_scheduler = get_learning_rate_scheduler(optimizer)

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")

        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=args,
            lr_scheduler=lr_scheduler,
            mpu=mpu if args.pipe_parallel_size == 0 else None,
            dist_init_required=False,
            model_parameters=param_groups if optimizer is None else None)

        if args.pipe_parallel_size > 0:
            model.set_batch_fn(model.module._megatron_batch_fn)

    if args.load is not None:
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
    else:
        args.iteration = 0

    # get model without FP16 and/or TorchDDP wrappers
    unwrapped_model = model
    while hasattr(unwrapped_model, 'module'):
        unwrapped_model = unwrapped_model.module

    return model, optimizer, lr_scheduler
Example #5
def main():
    """Main program."""
    args = get_args()

    if args.num_layers_per_virtual_pipeline_stage is not None:
        print("Interleaved pipeline schedule is not yet supported for text generation.")
        exit()

    if args.task == 'LAMBADA':
        eval_metric = 'accuracy'
    elif args.task == 'WIKITEXT103':
        eval_metric = 'loss'
    else:
        raise NotImplementedError('{} task is not implemented.'.format(
            args.task))

    # Set up model and load checkpoint.
    model = get_model(get_model_provider(eval_metric), wrap_with_ddp=False)
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    assert len(model) == 1, "Above condition should have caught this"
    model = model[0]

    # Data stuff.
    dataset = build_dataset(args.task)
    dataloader = build_data_loader(dataset, args.micro_batch_size,
                                   args.num_workers, drop_last=False)

    # Run evaluation.
    evaluate_and_print_results(args.task, dataloader, model, eval_metric)

    print_rank_0('done :-)')
Example #6
def load(self, context: DeepSpeedTrialContext, path: pathlib.Path) -> None:
    self.neox_args.load = str(path)
    self.neox_args.iteration = load_checkpoint(
        neox_args=self.neox_args,
        model=self.model,
        optimizer=self.optimizer,
        lr_scheduler=self.lr_scheduler,
        inference=False,
    )
    megatron_utils.print_rank_0(
        f"Loading checkpoint and starting from iteration {self.neox_args.iteration}"
    )
Example #7
def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None):
    """Setup model and optimizer."""
    model = get_model(neox_args=neox_args, use_cache=use_cache)
    optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args)
    lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer,
                                               neox_args=neox_args)

    if neox_args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")
        if neox_args.no_load_optim:
            assert optimizer is None
            _model_params = None
            _lr_scheduler = None
        else:
            _model_params = param_groups if optimizer is None else None
            _lr_scheduler = lr_scheduler

        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=neox_args,
            lr_scheduler=_lr_scheduler,
            dist_init_required=False,
            model_parameters=_model_params,
            config_params=neox_args.deepspeed_config,
            mpu=mpu if not neox_args.is_pipe_parallel else None,
        )
        model.total_params = get_total_params(model.module)
        print_rank_0(f' > total params: {model.total_params:,}')

        if neox_args.is_pipe_parallel:
            model.set_has_attention_mask(True)
            model.set_batch_fn(partial(get_batch_pipe, neox_args=neox_args))
    else:
        raise ValueError("Must be using deepspeed to run neox")

    if neox_args.load is not None:
        neox_args.iteration = load_checkpoint(
            neox_args=neox_args,
            model=model,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            iteration=iteration,
        )
        print_rank_0(
            f"Loading checkpoint and starting from iteration {neox_args.iteration}"
        )
    else:
        neox_args.iteration = 0

    return model, optimizer, lr_scheduler
Example #8
File: training.py  Project: yf1291/nlp3
def setup_model_and_optimizer(model_provider_func):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)
    optimizer = get_optimizer(model)
    lr_scheduler = get_learning_rate_scheduler(optimizer)

    if args.load is not None:
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
    else:
        args.iteration = 0

    return model, optimizer, lr_scheduler
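For orientation, the (model, optimizer, lr_scheduler) triple returned by these setup helpers is typically consumed by a resume-aware training loop. The sketch below is only illustrative: `train_step` and `train_dataloader` are hypothetical stand-ins, not functions from the examples above; the one thing taken from the examples is that `args.iteration` holds the iteration restored by load_checkpoint (or 0 for a fresh run).

def training_loop(model, optimizer, lr_scheduler, train_dataloader, train_step, args):
    """Schematic resume-aware loop (illustrative sketch, not project code)."""
    iteration = args.iteration  # restored by load_checkpoint, or 0 for a fresh run
    for batch in train_dataloader:
        loss = train_step(batch, model, optimizer)  # hypothetical single training step
        lr_scheduler.step()
        iteration += 1
    return iteration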
Example #9
def run_checkpoint_test(yaml_list=None, param_dict=None):

    from megatron.checkpointing import load_checkpoint
    from megatron.checkpointing import save_checkpoint

    model, optimizer, lr_scheduler, args_loaded = model_setup(yaml_list,
                                                              param_dict,
                                                              clear_data=True)

    # save model checkpoint
    save_checkpoint(
        neox_args=args_loaded,
        iteration=42,
        model=model,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
    )

    # reload model from checkpoint
    (
        reloaded_model,
        reloaded_optimizer,
        reloaded_lr_scheduler,
        args_reloaded,
    ) = model_setup(yaml_list, param_dict, clear_data=False)
    iteration = load_checkpoint(
        neox_args=args_reloaded,
        model=reloaded_model,
        optimizer=reloaded_optimizer,
        lr_scheduler=reloaded_lr_scheduler,
    )

    # ensure same checkpoint is loaded
    assert iteration == 42, \
        "run_checkpoint_test() did not restore the expected iteration from the checkpoint"

    # check all weight groups are the same
    for idx, ((n1, p1), (n2, p2)) in enumerate(
            zip(
                list(model.module.named_parameters()),
                list(reloaded_model.module.named_parameters()),
            )):
        assert n1 == n2
        params_equal = (p1 == p2).all().item()
        assert params_equal, "run_checkpoint_test() parameter mismatch after reload: " + str(n1)
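Side note on the parameter check above: for parameters of identical shape and dtype, the elementwise comparison `(p1 == p2).all().item()` agrees with `torch.equal(p1, p2)`. A tiny self-contained illustration (the tensors here are hypothetical, not taken from the test):

import torch

p1 = torch.ones(2, 3)
p2 = torch.ones(2, 3)

# Elementwise comparison followed by .all(), as in run_checkpoint_test() ...
assert (p1 == p2).all().item()
# ... matches torch.equal for tensors of identical shape and dtype.
assert torch.equal(p1, p2)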
Example #10
def setup_model_and_optimizer(model_provider_func):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)

    unwrapped_model = model
    while isinstance(unwrapped_model, (torchDDP, LocalDDP, FP16Module)):
        unwrapped_model = unwrapped_model.module
    optimizer = get_megatron_optimizer(unwrapped_model)

    lr_scheduler = get_learning_rate_scheduler(optimizer)

    if args.load is not None:
        timers = get_timers()
        # Extra barrier is added to make sure all ranks report the
        # max time.
        torch.distributed.barrier()
        timers('load checkpoint').start()
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
        torch.distributed.barrier()
        timers('load checkpoint').stop()
        timers.log(['load checkpoint'])
    else:
        args.iteration = 0

    # We only support local DDP with multiple micro-batches.
    if get_num_microbatches() > 1:
        assert args.DDP_impl == 'local'

    # get model without FP16 and/or TorchDDP wrappers
    unwrapped_model = model
    while hasattr(unwrapped_model, 'module'):
        unwrapped_model = unwrapped_model.module

    if args.iteration == 0 and hasattr(unwrapped_model,
                                       'init_state_dict_from_bert'):
        print("Initializing ICT from pretrained BERT model", flush=True)
        unwrapped_model.init_state_dict_from_bert()

    return model, optimizer, lr_scheduler
Example #11
def main():
    """Main program."""

    initialize_megatron(extra_args_provider=add_text_generate_args,
                        args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})

    # Set up model and load checkpoint.
    model = get_model(model_provider)
    args = get_args()
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    # Generate samples.
    if args.num_samples == 0:
        args.batch_size = 1
        if args.sample_input_file != "":
            generate_samples_input_from_file(model)
        else:
            generate_samples_interactive(model)
    else:
        generate_and_write_samples_unconditional(model)
Example #12
def setup_model_and_optimizer(model_provider_func):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)

    unwrapped_model = unwrap_model(model,
                                   (torchDDP, LocalDDP, Float16Module))
    optimizer = get_megatron_optimizer(unwrapped_model)

    lr_scheduler = get_learning_rate_scheduler(optimizer)

    if args.load is not None:
        timers = get_timers()
        # Extra barrier is added to make sure all ranks report the
        # max time.
        torch.distributed.barrier()
        timers('load-checkpoint').start()
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
        torch.distributed.barrier()
        timers('load-checkpoint').stop()
        timers.log(['load-checkpoint'])
    else:
        args.iteration = 0

    # We only support local DDP with multiple micro-batches.
    if len(model) > 1 or mpu.get_pipeline_model_parallel_world_size() > 1:
        assert args.DDP_impl == 'local'

    # If starting fresh (iteration 0), optionally initialize ICT from a pretrained BERT model.
    if args.iteration == 0 and len(unwrapped_model) == 1 \
        and hasattr(unwrapped_model[0], 'init_state_dict_from_bert'):
        print_rank_0("Initializing ICT from pretrained BERT model")
        unwrapped_model[0].init_state_dict_from_bert()
        if args.fp16:
            optimizer.reload_model_params()

    return model, optimizer, lr_scheduler
Example #13
def main():

    args = get_args()
    if args.api_prompt:
        # obtain the generations by calling the api
        generate_samples_by_calling_api()
        return

    if args.num_layers_per_virtual_pipeline_stage is not None:
        print(
            "Interleaved pipeline schedule is not yet supported for text generation."
        )
        exit()

    # Set up model and load checkpoint.
    model = get_model(model_provider, wrap_with_ddp=False)
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    assert len(model) == 1, "Above condition should have caught this"
    model = model[0]

    # perform the prompting
    generate_samples_by_prompting_input_from_file(model)
Example #14
def main():

    # Arguments do sanity checks on the world size, but we don't care,
    # so trick it into thinking we are plenty of processes
    os.environ["WORLD_SIZE"] = f'{2**31}'

    # Args
    set_global_variables(extra_args_provider=get_mp_merge_args,
                         args_defaults={
                             'use_cpu_initialization': True,
                             'micro_batch_size': 1,
                             'no_load_optim': True,
                             'no_load_rng': True,
                             'no_save_optim': True,
                             'no_save_rng': True,
                             'save_interval': 1
                         })
    args = get_args()

    if args.pipeline_model_parallel_size > 1:
        print(
            "Checkpoints with pipeline model parallelism are not currently supported."
        )
        exit()

    model_type = args.model_type
    orig_tensor_model_parallel_size = args.tensor_model_parallel_size
    args.tensor_model_parallel_size = 1
    tokenizer = rebuild_tokenizer(args)

    print('\n merging model parallel partitions ...')
    print(
        ' > number of partitions: {}'.format(orig_tensor_model_parallel_size))
    print(' > checkpoint path: {}'.format(args.load))
    print(' > model parameters:')
    print('    number of tokens ................ {} '.format(
        tokenizer.vocab_size))
    print('    number of layers ................ {}'.format(args.num_layers))
    print('    hidden size ..................... {}'.format(args.hidden_size))
    print('    number of attention heads ....... {}'.format(
        args.num_attention_heads))
    print('    maximum position embeddings ..... {}'.format(
        args.max_position_embeddings))

    # Full model.
    print('> building the full model ...')
    mpu.initialize.set_tensor_model_parallel_world_size(1)
    mpu.initialize.set_tensor_model_parallel_rank(0)
    mpu.initialize.set_pipeline_model_parallel_world_size(1)
    mpu.initialize.set_pipeline_model_parallel_rank(0)
    merged_model = get_model(model_type)

    # Build and load partitions.
    partitions = []
    iteration = 0
    args.tensor_model_parallel_size = orig_tensor_model_parallel_size
    tokenizer = rebuild_tokenizer(args)
    mpu.initialize.set_tensor_model_parallel_world_size(
        args.tensor_model_parallel_size)
    for rank in range(args.tensor_model_parallel_size):
        # Reset these since load_checkpoint asserts they are 0, but we are loading
        # multiple checkpoints in the same process and they get set each time
        args.consumed_train_samples = 0
        args.consumed_valid_samples = 0

        mpu.initialize.set_tensor_model_parallel_rank(rank)
        checkpoint_name, iteration = get_parallel_checkpoint_name(args.load)
        model_ = get_model(model_type)
        print(f'> loading {checkpoint_name} ...')
        load_checkpoint(model_, None, None)
        print(f'> checkpoint version {get_checkpoint_version()}')
        partitions.append(model_)

    # Parameter generators so we can loop through them simultaneously.
    merged_params_gen = merged_model.named_parameters()
    partitions_params_gen = [
        partition.named_parameters() for partition in partitions
    ]
    while True:
        try:

            # Get the params and check names.
            name, merged_param = next(merged_params_gen)
            print(' > working on {} ...'.format(name))
            print('     merged         type: {}, size: {}'.format(
                merged_param.dtype, list(merged_param.size())))
            partitions_param = []
            for rank, partition_params_gen in enumerate(partitions_params_gen):
                partition_name, partition_param = next(partition_params_gen)
                assert partition_name == name
                partitions_param.append(partition_param)
                print('     partition {}    type: {}, size: {}'.format(
                    rank, partition_param.dtype, list(partition_param.size())))

            # For the non-parallel parameters, simply copy the rank 0 values.
            if not hasattr(merged_param, 'tensor_model_parallel'):
                print('     non-parallel parameter, simple copy from rank 0')
                with torch.no_grad():
                    merged_param.data.copy_(partitions_param[0].data)
            # For parallel parameters, merge the values
            else:
                dim = merged_param.partition_dim
                stride = merged_param.partition_stride
                print(
                    f'     parallel parameter merge with stride {stride} along '
                    f'dimension {dim}')
                merge_partitions(merged_param, partitions_param, dim, stride)

        except StopIteration:
            break

    partitions = []
    args.tensor_model_parallel_size = 1
    args.pipeline_model_parallel_size = args.target_pipeline_model_parallel_size

    assert args.num_layers % args.pipeline_model_parallel_size == 0, \
        'num_layers must be divisible by target pipeline model parallel size'
    layers_per_part = args.num_layers // args.pipeline_model_parallel_size

    tokenizer = rebuild_tokenizer(args)
    mpu.initialize.set_tensor_model_parallel_world_size(
        args.tensor_model_parallel_size)
    mpu.initialize.set_tensor_model_parallel_rank(0)
    mpu.initialize.set_pipeline_model_parallel_world_size(
        args.pipeline_model_parallel_size)

    # regex to parse out layer number from param name
    layer_re = re.compile(r'layers\.([0-9]+)')

    if args.pipeline_model_parallel_size > 1:
        merged_params = {}
        for name, merged_param in merged_model.named_parameters():
            merged_params[name] = merged_param

        for rank in range(args.pipeline_model_parallel_size):
            mpu.initialize.set_pipeline_model_parallel_rank(rank)
            model = get_model(model_type)

            def update_layer_num(m):
                # TODO! This assumes no interleaved pipeline execution
                layer = int(m.group(1))
                layer += rank * layers_per_part
                return f'layers.{layer}'

            for dst_name, partition_param in model.named_parameters():
                if dst_name == "word_embeddings.weight":
                    # See comment in MegatronModule.initialize_word_embeddings()
                    src_name = "language_model.embedding.word_embeddings.weight"
                else:
                    # Translate destination layer number (0-N for each partition)
                    # to source layer number (single-model layer number)
                    src_name = re.sub(layer_re, update_layer_num, dst_name)
                print(
                    f" > copying {src_name} to {dst_name} in rank {rank}'s model"
                )
                partition_param.data.copy_(merged_params[src_name].data)

            partitions.append(model)
    else:
        partitions = [merged_model]

    for rank, model in enumerate(partitions):
        mpu.initialize.set_pipeline_model_parallel_rank(rank)
        print(f"> saving rank {rank}'s model")
        save_checkpoint(iteration, model, None, None)

    print('done :-)')
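The layer renumbering in the pipeline-splitting loop of example #14 can be exercised in isolation. The following is a small standalone sketch, not part of the merge tool: `rank`, `layers_per_part`, and the parameter name are hypothetical, but the regex and the substitution mirror the code above.

import re

layer_re = re.compile(r'layers\.([0-9]+)')
rank, layers_per_part = 1, 12  # hypothetical partition rank and layers per stage

def update_layer_num(m):
    # Map a partition-local layer index (0..layers_per_part-1) to its global index.
    return f'layers.{int(m.group(1)) + rank * layers_per_part}'

dst_name = 'language_model.encoder.layers.3.attention.dense.weight'  # made-up name
print(re.sub(layer_re, update_layer_num, dst_name))
# -> language_model.encoder.layers.15.attention.dense.weight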
Example #15
def finetune(train_valid_datasets_provider,
             model_provider,
             forward_step=_cross_entropy_forward_step,
             end_of_epoch_callback_provider=None):
    """Main finetune function used across all tasks."""
    args = get_args()
    timers = get_timers()

    # Train and validation data loaders.
    timers('train/valid/test dataset/dataloader').start()
    if args.epochs > 0:
        train_dataset, valid_dataset = train_valid_datasets_provider()
        train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
            train_dataset, valid_dataset)
    timers('train/valid/test dataset/dataloader').stop()

    # Build callback function.
    timers('callback function').start()
    end_of_epoch_callback = None
    if end_of_epoch_callback_provider is not None:
        end_of_epoch_callback = end_of_epoch_callback_provider()
    timers('callback function').stop()

    # Build model, optimizer and learning rate scheduler.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
    timers('model and optimizer').stop()

    # If pretrained checkpoint is provided and we have not trained for
    # any iteration (i.e., iteration is zero), then load the pretrained
    # checkpoint.
    timers('pretrained checkpoint').start()
    if args.iteration == 0 and args.pretrained_checkpoint is not None:
        original_load = args.load
        args.load = args.pretrained_checkpoint
        _ = load_checkpoint(model, None, None)
        args.load = original_load
        # This is critical when only model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16:
            optimizer._model_params_to_master_params()
    timers('pretrained checkpoint').stop()

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log([
        'train/valid/test dataset/dataloader', 'callback function',
        'model and optimizer', 'pretrained checkpoint'
    ])
    print_rank_0('training ...')

    # Finetune the model.
    if args.epochs > 0:
        _train(model, optimizer, lr_scheduler, forward_step, train_dataloader,
               valid_dataloader, end_of_epoch_callback)
    # Or just evaluate.
    else:
        if end_of_epoch_callback is not None:
            print_rank_0('evaluation only mode, setting epoch to -1')
            end_of_epoch_callback(model, epoch=-1, output_predictions=True)

    print_rank_0('done :-)')
Example #16
def main():
    """Main program."""
    drmode = 0
    mode = 0
    initialize_megatron(extra_args_provider=add_text_generate_args,
                        args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})

    # Set up model and load checkpoint.
    model = get_model(model_provider)
    args = get_args()
    tokenizer = get_tokenizer()
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    # Generate samples.
    if drmode == 1:
        f = open("questions.txt", 'r')
        if mode == 0:
            dir = "qa_345M"
        else:
            dir = "qa_345M_ip"
    if drmode == 0:
        f = open("para.txt", 'r')
        if mode == 0:
            dir = "pa_345M"
        else:
            dir = "pa_345M_ip"

    qs = f.readlines()
    question_list = []
    import json
    for i in qs:
        question_list.append(i)
    f.close()
    fdir = os.listdir()

    if dir not in fdir:
        os.mkdir(dir)
    import random
    import jsonlines
    while True:

        q = random.choice(question_list)
        lists = os.listdir(dir)
        question = q
        lts = question[:20] + '.jsonl'
        if (lts in lists):
            continue
        #str=generate_token_tensor(str,tokenizer)

        if mode == 0:
            output_string = generate_one_text(model, tokenizer, args, question)
            print(question, output_string)

            text_dir = dir + "/"
            already = []
            with jsonlines.open(text_dir + question[:20] + '.jsonl',
                                mode='w') as writer:

                otc = {}
                otc['question'] = question
                otc['answer'] = output_string
                #print(otc)
                writer.write(otc)
        else:
            output_string, output_scores = generate_string(
                model, tokenizer, args, question)
            ranklist = np.argsort(output_scores)
            best_score = output_scores[ranklist[0]]
            text_dir = dir + "/"
            already = []
            with jsonlines.open(text_dir + question[:20] + '.jsonl',
                                mode='w') as writer:

                otc = {}
                otc['question'] = question
                otc['answer'] = output_string[ranklist[0]]
                #print(otc)
                writer.write(otc)
Example #17
def setup_model_and_optimizer(model_provider_func):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)
    optimizer, param_groups = get_optimizer(model)
    lr_scheduler = get_learning_rate_scheduler(optimizer)

    # Determine whether the deepspeed config is a file path or an inline JSON string.
    # If it is JSON, load it directly.
    deepspeed_conf = None
    if hasattr(args, 'deepspeed_config'):
        if not os.path.exists(args.deepspeed_config):
            # If it's not a path, try parsing it as a JSON string
            deepspeed_json_conf = args.deepspeed_config
            if len(deepspeed_json_conf) > 2 and deepspeed_json_conf[
                    0] == "'" and deepspeed_json_conf[-1] == "'":
                deepspeed_json_conf = deepspeed_json_conf[
                    1:-1]  # Remove shell quotes
            try:
                deepspeed_conf = json.loads(deepspeed_json_conf)
                args.deepspeed_config = None  # Pass directly as a dictionary to deepspeed
            except JSONDecodeError:
                # Neither an existing path nor a valid JSON string
                raise ValueError(
                    f'The parameter `deepspeed_config` is neither a file path that exists nor a valid JSON string:'
                    f' {args.deepspeed_config}')

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")

        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=args,
            lr_scheduler=lr_scheduler,
            mpu=mpu if args.pipe_parallel_size == 0 else None,
            dist_init_required=False,
            model_parameters=param_groups if optimizer is None else None,
            config_params=deepspeed_conf,
        )

        model.total_params = get_total_params(model.module)
        print_rank_0(f' > total params: {model.total_params:,}')

        if args.pipe_parallel_size > 0:
            model.set_batch_fn(model.module._megatron_batch_fn)
    else:
        raise ValueError("Must be using deepspeed to run neox")

    if args.load is not None:
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
    else:
        args.iteration = 0

    # get model without FP16 and/or TorchDDP wrappers
    unwrapped_model = model
    while hasattr(unwrapped_model, 'module'):
        unwrapped_model = unwrapped_model.module

    return model, optimizer, lr_scheduler
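Example #17 accepts `deepspeed_config` either as a path to a file or as an inline JSON string. For illustration, a minimal config of the kind that code would parse might look like the sketch below; the keys are standard DeepSpeed options, but the values are hypothetical and not taken from any example above.

import json

deepspeed_conf = {
    "train_micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 1,
    "fp16": {"enabled": True},
    "zero_optimization": {"stage": 1},
}

# Serialized to a JSON string (and shell-quoted on the command line), this is
# the form that the json.loads() fallback in example #17 would accept.
print(json.dumps(deepspeed_conf))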
Example #18
def main():
    """Main program."""

    initialize_megatron(extra_args_provider=add_text_generate_args,
                        args_defaults={
                            'tokenizer_type': 'GPT2BPETokenizer',
                            'no_load_rng': True,
                            'no_load_optim': True
                        })

    args = get_args()
    if args.num_layers_per_virtual_pipeline_stage is not None:
        print(
            "Interleaved pipeline schedule is not yet supported for text generation."
        )
        exit()
    # Set up model and load checkpoint
    model = get_model(model_provider, wrap_with_ddp=False)

    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    assert len(model) == 1, "Above condition should have caught this"
    model = model[0]
    if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
        server = MegatronServer(model)
        server.run("0.0.0.0")

    while True:
        choice = torch.cuda.LongTensor(1)
        torch.distributed.broadcast(choice, 0)
        if choice[0].item() == 0:
            generate_and_post_process(model)