import torch
from torch import nn
from reformer_pytorch import ReformerLM
from electra_pytorch import Electra


def test_electra():
    generator = ReformerLM(num_tokens=20000,
                           dim=512,
                           depth=1,
                           max_seq_len=1024)

    discriminator = ReformerLM(num_tokens=20000,
                               dim=512,
                               depth=2,
                               max_seq_len=1024)

    generator.token_emb = discriminator.token_emb
    generator.pos_emb = discriminator.pos_emb

    trainer = Electra(generator,
                      discriminator,
                      num_tokens=20000,
                      discr_dim=512,
                      discr_layer='reformer',
                      pad_token_id=1,
                      mask_ignore_token_ids=[2, 3])
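    # Note (not in the original test): passing discr_dim/discr_layer asks the
    # Electra wrapper itself to attach a small linear head on top of the named
    # layer's output; the next test builds that adapter by hand instead.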

    data = torch.randint(0, 20000, (1, 1024))
    results = trainer(data)
    results.loss.backward()


def test_electra_without_magic():
    generator = ReformerLM(num_tokens=20000,
                           dim=512,
                           depth=1,
                           max_seq_len=1024)

    discriminator = ReformerLM(num_tokens=20000,
                               dim=512,
                               depth=2,
                               max_seq_len=1024,
                               return_embeddings=True)

    generator.token_emb = discriminator.token_emb
    generator.pos_emb = discriminator.pos_emb

    discriminator_with_adapter = nn.Sequential(discriminator,
                                               nn.Linear(512, 1), nn.Sigmoid())

    trainer = Electra(generator,
                      discriminator_with_adapter,
                      num_tokens=20000,
                      pad_token_id=1,
                      mask_ignore_token_ids=[2, 3])

    data = torch.randint(0, 20000, (1, 1024))
    results = trainer(data)
    results.loss.backward()
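
A minimal sketch, not part of the original tests, of how the combined loss returned by the Electra wrapper can drive a standard optimizer step; the Adam optimizer and learning rate are illustrative assumptions.

def train_step_sketch(trainer, optimizer, num_tokens=20000, seq_len=1024):
    # sample a random batch of token ids, mirroring the tests above
    data = torch.randint(0, num_tokens, (1, seq_len))
    results = trainer(data)
    optimizer.zero_grad()
    results.loss.backward()  # the tests above show the result exposes a combined .loss
    optimizer.step()
    return results.loss.item()


# usage, reusing the objects built in test_electra_without_magic:
# optimizer = torch.optim.Adam(trainer.parameters(), lr=3e-4)
# train_step_sketch(trainer, optimizer)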
Example #3

def main():
    torch.manual_seed(9)
    # 1. Config
    train_config, gen_config, disc_config = ElectraConfig(
        config_path='../config/electra/electra-train.json').get_config()

    # 2. Tokenizer
    tokenizer = BertTokenizer(vocab_file=train_config.vocab_path,
                              do_lower_case=False)

    # 3. Dataset
    dataset = ElectraDataset(tokenizer,
                             train_config.max_len,
                             data_path=train_config.data_path)

    # 4. Electra Model
    # 4.1. instantiate the generator and discriminator,
    # making sure that the generator is roughly a quarter to a half of the size of the discriminator
    # Generator
    generator = ReformerLM(
        num_tokens=tokenizer.vocab_size,
        emb_dim=gen_config.emb_dim,
        dim=gen_config.emb_dim,  # smaller hidden dimension
        heads=gen_config.heads,  # fewer heads
        ff_mult=gen_config.ff_mult,  # smaller feed forward intermediate dimension
        dim_head=gen_config.dim_head,
        depth=gen_config.depth,
        max_seq_len=train_config.max_len)

    discriminator = ReformerLM(
        num_tokens=tokenizer.vocab_size,
        emb_dim=disc_config.emb_dim,
        dim=disc_config.dim,
        dim_head=disc_config.dim_head,
        heads=disc_config.heads,
        depth=disc_config.depth,
        ff_mult=disc_config.ff_mult,
        max_seq_len=train_config.max_len,
        return_embeddings=True,
    )
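
    # Illustrative check (not in the original script): comparing parameter counts
    # is a quick way to confirm the generator stays at roughly a quarter to a half
    # of the discriminator's size, as recommended in the comment above.
    gen_params = sum(p.numel() for p in generator.parameters())
    disc_params = sum(p.numel() for p in discriminator.parameters())
    print(f'generator / discriminator parameter ratio: {gen_params / disc_params:.2f}')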
    # 4.2 weight tie the token and positional embeddings of generator and discriminator
    generator.token_emb = discriminator.token_emb
    generator.pos_emb = discriminator.pos_emb
    # weight tie any other embeddings if available, token type embeddings, etc.
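    # Hypothetical illustration (the attribute name below is not from reformer_pytorch):
    # any additional embedding the two models shared, e.g. a token type embedding,
    # would be tied the same way:
    #   generator.token_type_emb = discriminator.token_type_emb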

    # 4.3 instantiate electra
    discriminator_with_adapter = nn.Sequential(discriminator,
                                               nn.Linear(disc_config.dim, 1))

    model = Electra(
        generator,
        discriminator_with_adapter,
        mask_token_id=tokenizer.mask_token_id,  # the token id reserved for masking
        pad_token_id=tokenizer.pad_token_id,  # the token id for padding
        mask_prob=0.15,  # masking probability for masked language modeling
        mask_ignore_token_ids=tokenizer.all_special_ids  # ids of tokens to ignore for mask modeling ex. (cls, sep)
    )

    trainer = ElectraTrainer(dataset,
                             model,
                             tokenizer,
                             train_config.max_len,
                             checkpoint_path=train_config.checkpoint_path,
                             model_name=train_config.model_name,
                             train_batch_size=train_config.batch_size,
                             eval_batch_size=train_config.batch_size)
    train_dataloader, eval_dataloader = trainer.build_dataloaders(
        train_test_split=0.1)

    model = trainer.train(
        epochs=train_config.epochs,
        train_dataloader=train_dataloader,
        eval_dataloader=eval_dataloader,
        log_steps=train_config.log_steps,
        ckpt_steps=train_config.ckpt_steps,
        gradient_accumulation_steps=train_config.gradient_accumulation_steps)
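
# The excerpt does not show how main() is invoked; a conventional entry-point
# guard (an assumption, not part of the original script) would be:
if __name__ == '__main__':
    main()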

Example #4

# (1) instantiate the generator and discriminator, making sure that the generator
# is roughly a quarter to a half of the size of the discriminator
# (the start of this generator definition was truncated in the excerpt; the
# smaller values below are representative, not from the original)
generator = ReformerLM(
    num_tokens=20000,
    emb_dim=128,
    dim=256,  # smaller hidden dimension
    heads=4,  # fewer heads
    ff_mult=2,  # smaller feed forward intermediate dimension
    dim_head=64,
    depth=12,
    max_seq_len=1024)

discriminator = ReformerLM(num_tokens=20000,
                           emb_dim=128,
                           dim=1024,
                           dim_head=64,
                           heads=16,
                           depth=12,
                           ff_mult=4,
                           max_seq_len=1024)

# (2) weight tie the token and positional embeddings of generator and discriminator

generator.token_emb = discriminator.token_emb
generator.pos_emb = discriminator.pos_emb
# weight tie any other embeddings if available, token type embeddings, etc.

# (3) instantiate electra

trainer = Electra(
    generator,
    discriminator,
    discr_dim=1024,  # the embedding dimension of the discriminator
    discr_layer='reformer',  # the layer in the discriminator whose output is used to predict whether each token is original or replaced
    mask_token_id=2,  # the token id reserved for masking
    pad_token_id=0,  # the token id for padding
    mask_prob=0.15,  # masking probability for masked language modeling
    mask_ignore_token_ids=[]  # ids of tokens to ignore for masked language modeling, e.g. (cls, sep); list truncated in the excerpt, left empty here
)
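
Following the pattern of the tests at the top of the page, the wrapper can then be exercised end to end with random token ids (a usage sketch, not part of this excerpt):

import torch

data = torch.randint(0, 20000, (1, 1024))
results = trainer(data)  # results.loss combines the generator (MLM) and discriminator (replaced-token detection) losses
results.loss.backward()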
Example #5

def main(input, output):
    max_seq_length = 512
    doc_stride = 64
    max_query_length = 64
    batch_size = 16
    n_best_size = 20
    max_answer_length = 30
    seed = 42
    fp16 = False

    # device = torch.device("cpu")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}".format(device, n_gpu))

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    # 1. Config
    train_config, gen_config, disc_config = ElectraConfig(config_path=CONFIG_PATH).get_config()

    # 2. Tokenizer
    tokenizer = BertTokenizer(vocab_file=train_config.vocab_path, do_lower_case=False)

    # 3. Generator
    generator = ReformerLM(
      num_tokens=tokenizer.vocab_size,
      emb_dim=gen_config.emb_dim,
      dim=gen_config.emb_dim,  # smaller hidden dimension
      heads=gen_config.heads,  # less heads
      ff_mult=gen_config.ff_mult,  # smaller feed forward intermediate dimension
      dim_head=gen_config.dim_head,
      depth=gen_config.depth,
      max_seq_len=train_config.max_len
    )
    # 4. Discriminator
    discriminator = ReformerLM(
      num_tokens=tokenizer.vocab_size,
      emb_dim=disc_config.emb_dim,
      dim=disc_config.dim,
      dim_head=disc_config.dim_head,
      heads=disc_config.heads,
      depth=disc_config.depth,
      ff_mult=disc_config.ff_mult,
      max_seq_len=train_config.max_len,
      return_embeddings=True,
    )
    # 4.2 weight tie the token and positional embeddings of generator and discriminator
    generator.token_emb = discriminator.token_emb
    generator.pos_emb = discriminator.pos_emb
    # weight tie any other embeddings if available, token type embeddings, etc.

    # 4.3 instantiate electra
    discriminator_with_adapter = nn.Sequential(discriminator, nn.Linear(disc_config.dim, 1))

    electra = Electra(
      generator,
      discriminator_with_adapter,
      mask_token_id=tokenizer.mask_token_id,  # the token id reserved for masking
      pad_token_id=tokenizer.pad_token_id,  # the token id for padding
      mask_prob=0.15,  # masking probability for masked language modeling
      mask_ignore_token_ids=tokenizer.all_special_ids  # ids of tokens to ignore for mask modeling ex. (cls, sep)
    )
    # electra.load_state_dict(torch.load(train_config.checkpoint_path, map_location=device),strict=False)

    electra_discriminator = electra.discriminator[0]  # pull the bare ReformerLM back out of the nn.Sequential adapter

    model = DiscriminatorMRCModel(discriminator=electra_discriminator, dim=disc_config.dim)

    eval_examples = read_squad_examples(input_file=input, is_training=False, version_2_with_negative=False)
    eval_features = convert_examples_to_features(
      examples=eval_examples,
      tokenizer=tokenizer,
      max_seq_length=max_seq_length,
      doc_stride=doc_stride,
      max_query_length=max_query_length,
      is_training=False)

    if fp16:
        model.half()
    model.load_state_dict(torch.load(CHK_PATH, map_location=device))
    model.to(device)
    logger.info("***** Running training *****")
    logger.info("  Num orig examples = %d", len(eval_examples))
    logger.info("  Num split examples = %d", len(eval_features))
    logger.info("  Batch size = %d", batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)

    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=batch_size)

    model.eval()
    all_results = []
    logger.info("Start evaluating")
    for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(input_ids)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(RawResult(unique_id=unique_id,
                                         start_logits=start_logits,
                                         end_logits=end_logits))
    output_nbest_file = "nbest_predictions.json"
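    # The positional arguments below are assumed to follow the signature of
    # write_predictions from the original BERT SQuAD example script:
    # (do_lower_case, output_prediction_file, output_nbest_file,
    #  output_null_log_odds_file, verbose_logging, version_2_with_negative,
    #  null_score_diff_threshold)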
    write_predictions(eval_examples, eval_features, all_results,
                        n_best_size, max_answer_length,
                        False, output, output_nbest_file,
                        None, False, False, 0.0)