def get_model(args): """Build the model.""" print_rank_0('building GPT2 model ...') model = GPT2Model(num_layers=args.num_layers, vocab_size=args.vocab_size, hidden_size=args.hidden_size, num_attention_heads=args.num_attention_heads, embedding_dropout_prob=args.hidden_dropout, attention_dropout_prob=args.attention_dropout, output_dropout_prob=args.hidden_dropout, max_sequence_length=args.max_position_embeddings, checkpoint_activations=args.checkpoint_activations, checkpoint_num_layers=args.checkpoint_num_layers, parallel_output=False) if mpu.get_data_parallel_rank() == 0: print(' > number of parameters on model parallel rank {}: {}'.format( mpu.get_model_parallel_rank(), sum([p.nelement() for p in model.parameters()])), flush=True) # GPU allocation. model.cuda(torch.cuda.current_device()) # Fp16 conversion. if args.fp16: model = FP16_Module(model) # Wrap model for distributed training. model = DDP(model) return model
def get_model(deepspeed_config_path):
    num_local_heads = 16
    sparse_mode = 'alternating'
    deepspeed_sparsity_config = get_sparse_attention_config(
        deepspeed_config_path, num_local_heads)
    if deepspeed_sparsity_config is not None:
        logger.info(f"Use sparse attention with mode {sparse_mode}")
    else:
        logger.info("Use dense attention")

    model = GPT2Model(num_layers=24,
                      vocab_size=50264,
                      hidden_size=2048,
                      num_attention_heads=num_local_heads,
                      embedding_dropout_prob=0.1,
                      attention_dropout_prob=0.1,
                      output_dropout_prob=0.1,
                      max_sequence_length=2048,
                      checkpoint_activations=False,
                      checkpoint_num_layers=1,
                      parallel_output=False,
                      deepspeed_sparsity_config=deepspeed_sparsity_config,
                      sparse_mode=sparse_mode)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    model = FP16_Module(model)

    return model

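# Hypothetical usage of the hard-coded variant above; the config path is an
# assumption, not from the original. get_sparse_attention_config() reads the
# DeepSpeed JSON config and returns a sparsity config when sparse attention is
# enabled there, otherwise None, in which case dense attention is used.
model = get_model('deepspeed_config.json')
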
def get_model(args): """Build the model.""" print_rank_0('building GPT2 model ...') model = GPT2Model(num_layers=args.num_layers, vocab_size=args.vocab_size, hidden_size=args.hidden_size, num_attention_heads=args.num_attention_heads, embedding_dropout_prob=args.hidden_dropout, attention_dropout_prob=args.attention_dropout, output_dropout_prob=args.hidden_dropout, max_sequence_length=args.max_position_embeddings, max_memory_length=args.mem_length, checkpoint_activations=args.checkpoint_activations, checkpoint_num_layers=args.checkpoint_num_layers, parallel_output=True, relative_encoding=args.transformer_xl) if mpu.get_data_parallel_rank() == 0: print(' > number of parameters on model parallel rank {}: {}'.format( mpu.get_model_parallel_rank(), sum([p.nelement() for p in model.parameters()])), flush=True) # To prevent OOM for model sizes that cannot fit in GPU memory in full precision if hasattr(args, "deepspeed") and args.deepspeed and args.fp16: model.half() # GPU allocation. model.cuda(torch.cuda.current_device()) # Fp16 conversion. if args.fp16: model = FP16_Module(model) # Wrap model for distributed training. if not args.deepspeed: if USE_TORCH_DDP: i = torch.cuda.current_device() model = DDP(model, device_ids=[i], output_device=i, process_group=mpu.get_data_parallel_group()) else: model = DDP(model) return model
def get_model(args): """Build the model.""" print_rank_0('building GPT2 model ...') model = GPT2Model(num_layers=args.num_layers, vocab_size=args.vocab_size, hidden_size=args.hidden_size, num_attention_heads=args.num_attention_heads, embedding_dropout_prob=args.hidden_dropout, attention_dropout_prob=args.attention_dropout, output_dropout_prob=args.hidden_dropout, max_sequence_length=args.max_position_embeddings, checkpoint_activations=args.checkpoint_activations, checkpoint_num_layers=args.checkpoint_num_layers, parallel_output=True) if mpu.get_data_parallel_rank() == 0: print(' > number of parameters on model parallel rank {}: {}'.format( mpu.get_model_parallel_rank(), sum([p.nelement() for p in model.parameters()])), flush=True) # GPU allocation. model.cuda(torch.cuda.current_device()) # Fp16 conversion. if args.fp16: model = FP16_Module(model) # Wrap model for distributed training. if args.DDP_impl == 'torch': i = torch.cuda.current_device() args.DDP_type = torch.nn.parallel.distributed.DistributedDataParallel model = args.DDP_type(model, device_ids=[i], output_device=i, process_group=mpu.get_data_parallel_group()) elif args.DDP_impl == 'local': args.DDP_type = LocalDDP model = args.DDP_type(model) else: print_rank_0('Unknown DDP implementation specified: {}. ' 'Exiting.'.format(args.DDP_impl)) exit() return model
def get_model(args, config, do_fp16=False):
    """Build the model."""

    print_rank_0('building GPT2 model ...')
    model = GPT2Model(**config,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision.
    if args.deepspeed and do_fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if do_fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model

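# Hypothetical call of the config-driven variant above; the dictionary keys
# mirror the GPT2Model constructor arguments used in the other variants, while
# the values and the parsed `args` object are assumptions for illustration only.
gpt2_config = {'num_layers': 24,
               'vocab_size': 50304,
               'hidden_size': 1024,
               'num_attention_heads': 16,
               'embedding_dropout_prob': 0.1,
               'attention_dropout_prob': 0.1,
               'output_dropout_prob': 0.1,
               'max_sequence_length': 1024}
model = get_model(args, gpt2_config, do_fp16=True)
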
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--nsteps', help='Number of Steps for Training',
                    default=100)
parser.add_argument('--print_every', help='Print progress every N steps',
                    default=10)
parser.add_argument('--sample_every', help='Generate a sample every N steps',
                    default=200)
parser.add_argument('--save_every', help='Save a checkpoint every N steps',
                    default=500)
parser.add_argument('--model_type', help="Which model to use for finetuning",
                    default='124M')
args = parser.parse_args()

gpt2_model = GPT2Model(model_type=args.model_type)

# Start finetuning on the first chunk.
gpt2_model.fit(input_path='train_1.txt',
               print_every=int(args.print_every),
               sample_every=int(args.sample_every),
               save_every=int(args.save_every),
               num_steps=int(args.nsteps))

# Load the tuned model and finetune on the remaining chunks.
for i in range(2, 10):
    gpt2_model.fit(input_path='train_' + str(i) + '.txt',
                   overwrite=True,
                   restore_from='latest',
                   print_every=int(args.print_every),
                   sample_every=int(args.sample_every),
                   save_every=int(args.save_every),
                   num_steps=int(args.nsteps))