예제 #1
0
    def model_provider():
        """Build the GPT2 model variant appropriate for this rank.

        ``eval_metric`` is a free variable defined outside this function. It
        selects the ``parallel_output`` flag: ``'loss'`` enables it and
        ``'accuracy'`` disables it.

        When the pipeline-model-parallel world size reported by ``mpu`` is
        greater than one, a first-stage, last-stage, or intermediate-stage
        model is built depending on what ``mpu`` reports for this rank;
        otherwise a single ``GPT2Model`` is built.

        Raises:
            NotImplementedError: if ``eval_metric`` is neither ``'loss'`` nor
                ``'accuracy'``.
        """

        # Reject unknown metrics before anything is logged or constructed.
        if eval_metric not in ('loss', 'accuracy'):
            template = ('output type for {} evaluation metric '
                        'is not supported.')
            raise NotImplementedError(template.format(eval_metric))
        parallel_output = eval_metric == 'loss'

        print_rank_0('building GPT2 model ...')

        uses_pipeline = mpu.get_pipeline_model_parallel_world_size() > 1
        if not uses_pipeline:
            return GPT2Model(num_tokentypes=0,
                             parallel_output=parallel_output)

        # Pick the model class from the position of this stage in the
        # pipeline; only the last stage takes the parallel_output flag.
        if mpu.is_pipeline_first_stage():
            return GPT2ModelFirstStage(num_tokentypes=0)
        if mpu.is_pipeline_last_stage():
            return GPT2ModelLastStage(parallel_output=parallel_output,
                                      num_tokentypes=0)
        return GPT2ModelIntermediateStage(num_tokentypes=0)
예제 #2
0
def get_gpt2_model(args_others, mp_size=1):
    """Initialize Megatron and return a DDP-wrapped ``GPT2Model``.

    Args:
        args_others: Extra Megatron argument defaults; merged over the
            built-in vocab/merge/tokenizer defaults via ``dict.update``.
        mp_size: Value passed on the command line as
            ``--model-parallel-size``.

    Returns:
        The model moved to CUDA and wrapped in torch's
        ``DistributedDataParallel`` on the current CUDA device, using
        ``mpu.get_data_parallel_group()`` as its process group.

    Note:
        This appends to ``sys.argv`` in place on every call.
    """
    from megatron.model import GPT2Model
    from megatron.initialize import initialize_megatron

    megatron_defaults = dict(
        vocab_file=get_test_path('gpt2-vocab.json'),
        merge_file=get_test_path('gpt2-merges.txt'),
        tokenizer_type='GPT2BPETokenizer',
    )
    megatron_defaults.update(args_others)

    # "make-vocab-size-divisible-by" is pinned to 1 to avoid a word-embedding
    # size change in resizing testing.
    extra_cli_args = [
        '--model-parallel-size', str(mp_size),
        '--make-vocab-size-divisible-by', str(1),
    ]
    sys.argv.extend(extra_cli_args)

    initialize_megatron(args_defaults=megatron_defaults,
                        ignore_unknown_args=True)

    gpt2 = GPT2Model(num_tokentypes=0, parallel_output=False)
    gpt2.cuda()

    # These imports are deliberately deferred until after Megatron has been
    # initialized and the model has been built, as in the original ordering.
    from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
    from megatron import mpu

    device_index = torch.cuda.current_device()
    return torchDDP(gpt2,
                    device_ids=[device_index],
                    output_device=device_index,
                    process_group=mpu.get_data_parallel_group())
예제 #3
0
def model_provider():
    """Log a build message and return a new ``GPT2Model``.

    The model is created with ``num_tokentypes=0`` and
    ``parallel_output=False``.
    """
    print_rank_0('building GPT2 model ...')
    return GPT2Model(num_tokentypes=0, parallel_output=False)
예제 #4
0
def model_provider():
    """Build a ``GPT2Model`` inside a ``deepspeed.zero.Init`` context.

    The context is given the data-parallel group from ``mpu`` and the
    ``remote_device`` from the global args, and is enabled only when
    ``zero_stage`` equals 3.

    Returns:
        The constructed ``GPT2Model`` (``num_tokentypes=0``,
        ``parallel_output=True``).
    """
    print_rank_0('building GPT2 model ...')

    # Build the context manager first, then enter it; the keyword arguments
    # are evaluated in the same order as written.
    zero_init = deepspeed.zero.Init(
        data_parallel_group=mpu.get_data_parallel_group(),
        remote_device=get_args().remote_device,
        enabled=get_args().zero_stage == 3,
    )
    with zero_init:
        model = GPT2Model(num_tokentypes=0, parallel_output=True)

    return model
예제 #5
0
파일: evaluate.py 프로젝트: yf1291/nlp3
    def model_provider():
        """Build a ``GPT2Model`` whose output mode follows ``eval_metric``.

        ``eval_metric`` is a free variable defined outside this function:
        ``'loss'`` builds the model with ``parallel_output=True`` and
        ``'accuracy'`` with ``parallel_output=False``.

        Raises:
            NotImplementedError: for any other ``eval_metric`` value.
        """

        wants_loss = eval_metric == 'loss'
        if not wants_loss and not (eval_metric == 'accuracy'):
            raise NotImplementedError(
                'output type for {} evaluation metric '
                'is not supported.'.format(eval_metric))

        print_rank_0('building GPT2 model ...')
        # Parallel output is enabled exactly when the metric is 'loss'.
        return GPT2Model(num_tokentypes=0, parallel_output=wants_loss)
예제 #6
0
def model_provider():
    """Build either a plain or a pipeline-parallel GPT2 model.

    Returns:
        A ``GPT2Model`` when ``args.pipe_parallel_size`` is 0; otherwise a
        ``GPT2ModelPipe`` built on ``mpu.get_topology()`` with
        ``get_batch_pipe`` attached as ``_megatron_batch_fn``.
    """
    args = get_args()

    print_rank_0('building GPT2 model ...')

    if args.pipe_parallel_size == 0:
        return GPT2Model(num_tokentypes=0, parallel_output=True)

    pipe_model = GPT2ModelPipe(
        num_tokentypes=0,
        parallel_output=True,
        topology=mpu.get_topology(),
    )
    # This is a hack to give us a reference to get_batch_pipe from within
    # training.py: model.set_batch_fn has to be called after
    # deepspeed.initialize, so the function is stashed on the model for now.
    pipe_model._megatron_batch_fn = get_batch_pipe
    return pipe_model
예제 #7
0
def model_provider():
    """Build the GPT2 model and set up Weights & Biases logging.

    Returns:
        A ``GPT2Model`` when ``args.pipe_parallel_size`` is 0; otherwise a
        ``GPT2ModelPipe`` built on ``mpu.get_topology()`` with
        ``get_batch_pipe`` attached as ``_megatron_batch_fn``.

    Side effects:
        wandb is enabled when ``get_wandb_api_key()`` returns a key, and the
        decision is published through ``set_use_wandb``. If ``wandb.init``
        raises ``UsageError``, wandb is switched off again and a hint is
        printed. The argument dict is uploaded to ``wandb.config`` only after
        a successful init.
    """

    args = get_args()

    print_rank_0('building GPT2 model ...')
    if args.pipe_parallel_size == 0:
        model = GPT2Model(num_tokentypes=0, parallel_output=True)
    else:
        model = GPT2ModelPipe(num_tokentypes=0,
                              parallel_output=True,
                              topology=mpu.get_topology())
        # This is a hack to give us a reference to get_batch_pipe from within training.py
        # We need to call model.set_batch_fn after deepspeed.initialize
        model._megatron_batch_fn = get_batch_pipe

    ## Wandb
    use_wandb = get_wandb_api_key() is not None
    set_use_wandb(use_wandb)
    args_dict = vars(args)
    if use_wandb:
        # only display system stats from one worker per machine
        wandb_settings = wandb.Settings() if is_local_main(
        ) else wandb.Settings(_disable_stats=True)
        group_name = args_dict.get('wandb_group')
        # Runs are only given an explicit per-host/per-rank name when they
        # belong to a group.
        name = f'{socket.gethostname()}-{local_rank()}' if group_name else None

        try:
            wandb.init(project="neox",
                       group=group_name,
                       name=name,
                       save_code=False,
                       force=False,
                       entity=args_dict.get('wandb_team'),
                       settings=wandb_settings)
        except UsageError as e:
            # Clear the local flag as well as the shared one; otherwise the
            # config upload below would run against an uninitialised wandb
            # and raise.
            use_wandb = False
            set_use_wandb(False)
            print(e)
            print(
                'Skipping wandb. Execute `wandb login` on local or main node machine to enable.'
            )

    if use_wandb:
        wandb.config.update(args_dict)

    return model
예제 #8
0
def model_provider():
    """Build a ``GPT2Model`` under ZeRO init and report its size.

    Memory usage is logged before and after construction. The model is
    created inside a ``deepspeed.zero.Init`` context that is enabled only
    when ``zero_stage`` equals 3. On data-parallel rank 0 the parameter
    count (in billions, rounded to 3 decimals) is printed together with the
    model-parallel rank.

    Returns:
        The constructed ``GPT2Model`` (``num_tokentypes=0``,
        ``parallel_output=True``).
    """
    print_rank_0('building GPT2 model ...')

    see_memory_usage("Before Building Model", force=True)
    # Build the context manager first, then enter it; the keyword arguments
    # are evaluated in the same order as written.
    zero_init = deepspeed.zero.Init(
        data_parallel_group=mpu.get_data_parallel_group(),
        remote_device=get_args().remote_device,
        deepspeed_config=get_args().deepspeed_config,
        enabled=get_args().zero_stage == 3,
    )
    with zero_init:
        model = GPT2Model(num_tokentypes=0, parallel_output=True)
    see_memory_usage("After Building Model", force=True)

    if mpu.get_data_parallel_rank() == 0:
        billion_params = get_parameters_in_billions(model)
        mp_rank = mpu.get_model_parallel_rank()
        # NOTE: the backslash continuation keeps the next line's leading
        # spaces inside the message; the text is preserved as-is.
        print(
            f' > number of parameters on model parallel rank {mp_rank}\
            {round(billion_params, 3)} Billion',
            flush=True)

    return model