Example No. 1
def create_and_check_bert_for_pretraining(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
):
    model = BertForPreTraining(config=config)
    model.eval()
    loss, prediction_scores, seq_relationship_score = model(
        input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
        "seq_relationship_score": seq_relationship_score,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()),
        [self.batch_size, self.seq_length, self.vocab_size],
    )
    self.parent.assertListEqual(
        list(result["seq_relationship_score"].size()),
        [self.batch_size, 2])
    self.check_loss_output(result)
Example No. 2
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
                                     pytorch_dump_path):
    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
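A minimal command-line wrapper for this converter, kept as a hedged sketch: the flag names and the paths passed to them are illustrative, not mandated by the function above.

# Hedged usage sketch: the paths supplied on the command line are placeholders.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", type=str, required=True,
                        help="Path to the TensorFlow checkpoint (e.g. bert_model.ckpt).")
    parser.add_argument("--bert_config_file", type=str, required=True,
                        help="Path to the bert_config.json of the pre-trained model.")
    parser.add_argument("--pytorch_dump_path", type=str, required=True,
                        help="Where to write the converted pytorch_model.bin.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                     args.bert_config_file,
                                     args.pytorch_dump_path)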
Example No. 3
def convert_ckpt_compatible(ckpt_path, config_path):
    # Older TF-style checkpoints name the LayerNorm parameters 'gamma'/'beta';
    # PyTorch modules expect 'weight'/'bias', so rename those keys in place.
    ckpt = torch.load(ckpt_path, map_location='cpu')
    keys = list(ckpt.keys())
    for key in keys:
        if 'LayerNorm' in key:
            if 'gamma' in key:
                ckpt[key.replace('gamma', 'weight')] = ckpt.pop(key)
            else:
                ckpt[key.replace('beta', 'bias')] = ckpt.pop(key)

    # Load the renamed weights into a full BertForPreTraining model, then
    # return only the encoder (model.bert) state dict.
    model_config = BertConfig.from_json_file(config_path)
    model = BertForPreTraining(model_config)
    model.load_state_dict(ckpt)
    new_ckpt = model.bert.state_dict()

    return new_ckpt
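A short usage sketch; the checkpoint, config, and output file names below are placeholders, not files shipped with any library.

# Hedged usage sketch for the converter above.
import torch

encoder_sd = convert_ckpt_compatible('old_bert_ckpt.bin', 'bert_config.json')
torch.save(encoder_sd, 'bert_encoder_only.bin')  # encoder-only weights for BertModel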
Example No. 4
def get_bert(bert_model,
             bert_do_lower_case,
             use_albert=False,
             use_sparse=False,
             use_electra=False):
    # Avoid a hard dependency on BERT by only importing it if it's being used
    from pytorch_transformers import BertTokenizer, BertModel, BertForPreTraining
    tokenizer = BertTokenizer.from_pretrained(bert_model,
                                              do_lower_case=bert_do_lower_case)
    bert = BertForPreTraining.from_pretrained(bert_model,
                                              use_albert=use_albert,
                                              use_sparse=use_sparse,
                                              use_electra=use_electra)
    return tokenizer, bert
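A hedged usage sketch for the helper above, leaving the fork-specific flags (use_albert, use_sparse, use_electra) at their defaults and assuming the fork keeps the stock BertForPreTraining output layout (prediction scores first, next-sentence score second):

# Hedged usage sketch for get_bert.
import torch

tokenizer, bert = get_bert('bert-base-uncased', bert_do_lower_case=True)
input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute")])
prediction_scores, seq_relationship_score = bert(input_ids)[:2]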
Example No. 5
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_corpus)
        train_dataset = BERTDataset(args.train_corpus,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = (num_train_optimization_steps //
                                            torch.distributed.get_world_size())

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model, from_tf=False)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.01
            },
            {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            },
        ]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)

        else:
            #             optimizer = BertAdam(optimizer_grouped_parameters,
            #                                  lr=args.learning_rate,
            #                                  warmup=args.warmup_proportion,
            #                                  t_total=num_train_optimization_steps)

            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              eps=1e-8)
            scheduler = WarmupLinearSchedule(
                optimizer,
                warmup_steps=0,
                t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids,
                             is_next)[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    if not args.fp16:
                        # AdamW does not adjust the learning rate by itself, so advance
                        # the warmup/decay schedule after every optimizer update.
                        scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        logger.info("** ** * Saving fine - tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        if args.do_train:
            torch.save(model_to_save.state_dict(), output_model_file)
            model_to_save.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(args.output_dir)
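After training, the files written to --output_dir (pytorch_model.bin, config.json, vocab.txt) can be loaded back with from_pretrained. A short sketch; the directory below is a placeholder for whatever was passed on the command line:

# Hedged sketch: reload the checkpoint saved by the script above.
from pytorch_transformers import BertForPreTraining, BertTokenizer

output_dir = "path/to/output_dir"  # placeholder for the --output_dir used above
model = BertForPreTraining.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=True)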
Example No. 6
import argparse

from pytorch_transformers import BertForPreTraining

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=
        "Extraction some layers of the full BertForPreTraining for Transfer Learned Distillation"
    )
    parser.add_argument("--bert_model", default='bert-base-uncased', type=str)
    parser.add_argument(
        "--dump_checkpoint",
        default='serialization_dir/transfer_learning_checkpoint_0247911.pth',
        type=str)
    parser.add_argument("--vocab_transform", action='store_true')
    args = parser.parse_args()

    model = BertForPreTraining.from_pretrained(args.bert_model)

    state_dict = model.state_dict()
    compressed_sd = {}

    for w in ['word_embeddings', 'position_embeddings']:
        compressed_sd[f'distilbert.embeddings.{w}.weight'] = \
            state_dict[f'bert.embeddings.{w}.weight']
    for w in ['weight', 'bias']:
        compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \
            state_dict[f'bert.embeddings.LayerNorm.{w}']

    std_idx = 0
    for teacher_idx in [0, 2, 4, 7, 9, 11]:
        for w in ['weight', 'bias']:
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \
                state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.query.{w}']
Example No. 7
import numpy as np
import torch
import torch.nn as nn

from pytorch_transformers import (
    WEIGHTS_NAME, AdamW, WarmupLinearSchedule, BertConfig, BertForMaskedLM,
    BertTokenizer, BertForPreTraining, GPT2Config, GPT2LMHeadModel,
    GPT2Tokenizer, OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
    RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)

## extract last layer attention ??

config = BertConfig.from_pretrained('bert-base-uncased')
config.output_attentions = True
config.output_hidden_states = True

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model = BertForPreTraining(config)
model.eval()

input_ids1 = tokenizer.encode("Hello, my dog is cute")
input_ids2 = tokenizer.encode("Hello, my dog is one")
input_ids = torch.tensor([input_ids1, input_ids2])  # batch of 2 equally long sequences
outputs = model(input_ids)

word_dot_distance = torch.randn(2, 1, 4, 3)  ## 2 batch
word_word_relation = torch.LongTensor(
    np.round(np.random.uniform(size=(2, 1, 4, 4), low=0, high=2)))
out = torch.gather(word_dot_distance, dim=3, index=word_word_relation)

distance_type = nn.Embedding(3, 5, padding_idx=0)
distance_type.weight
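For reference, the gather above selects, for every (query, key) position pair, the distance score whose bucket index is given by the relation tensor. A tiny hedged sketch of the same pattern with arbitrary shapes:

# Hedged sketch of the torch.gather pattern used above.
# scores holds one value per (position, bucket); relation assigns a bucket id
# to every (i, j) position pair.
scores = torch.arange(12.0).view(1, 1, 4, 3)           # (batch, heads, seq, num_buckets)
relation = torch.randint(0, 3, (1, 1, 4, 4))           # (batch, heads, seq, seq)
picked = torch.gather(scores, dim=3, index=relation)   # (batch, heads, seq, seq)
# picked[b, h, i, j] == scores[b, h, i, relation[b, h, i, j]]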

model.resize_token_embeddings(60000)

model.bert.embeddings(torch.LongTensor(np.array([[0, 59999], [4, 50000]])))  # 59999 is the largest valid id after resizing to 60000

input_ids1 = tokenizer.encode("Hello, my dog is cute")
input_ids2 = tokenizer.encode("Hello, my dog is one")
input_ids = torch.tensor([input_ids1, input_ids2])  # batch of 2 equally long sequences
outputs = model(input_ids)
# (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
hidden = outputs[-2]

layers = outputs[-1]  ## 12 layers
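To answer the "extract last layer attention" note above: with output_attentions=True the last element of outputs is a tuple holding one attention map per layer, so the final layer's map can be taken directly. A short sketch that builds on the variables already defined above:

# Sketch (assumes the setup above): `layers` holds 12 attention tensors,
# one per encoder layer, each shaped (batch_size, num_heads, seq_len, seq_len).
last_layer_attention = layers[-1]
print(last_layer_attention.shape)   # (2, 12, seq_len, seq_len) for this batch

# `hidden` holds 13 tensors: the embedding output plus one per layer,
# each shaped (batch_size, seq_len, hidden_size).
last_hidden_state = hidden[-1]
print(last_hidden_state.shape)      # (2, seq_len, 768)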
        prediction_scores, seq_relationship_score = self.cls(
            sequence_output, pooled_output
        )

        mean_pooled_output = torch.mean(sequence_output, dim=1)
        mean_pooled_output = self.dropout(mean_pooled_output)
        logits = self.classifier(mean_pooled_output)

        outputs = (prediction_scores, seq_relationship_score, logits)
        return outputs
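The class whose forward tail appears above is only partially visible here. A minimal hedged reconstruction, assuming it subclasses BertForPreTraining and adds a dropout plus a linear head over the mean-pooled sequence output; the constructor signature, the dropout rate, and the forward arguments are assumptions, not the original author's code:

# Hedged reconstruction sketch of the truncated class above.
import torch
import torch.nn as nn
from pytorch_transformers import BertForPreTraining


class BertPretrain(BertForPreTraining):
    def __init__(self, config, num_targets):
        super(BertPretrain, self).__init__(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_targets)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        # sequence_output: (batch, seq_len, hidden); pooled_output: (batch, hidden)
        sequence_output, pooled_output = self.bert(
            input_ids, token_type_ids=token_type_ids,
            attention_mask=attention_mask)[:2]
        # MLM and NSP heads inherited from BertForPreTraining
        prediction_scores, seq_relationship_score = self.cls(
            sequence_output, pooled_output)
        # Extra classification head over the mean-pooled sequence output
        mean_pooled_output = self.dropout(torch.mean(sequence_output, dim=1))
        logits = self.classifier(mean_pooled_output)
        return prediction_scores, seq_relationship_score, logits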


config = BertConfig(str(PATH_TO_CKPT_CONFIG / "config.json"))
model = BertPretrain(config, len(TARGETS))

# Prepare extended bert embedding
orig_bert = BertForPreTraining.from_pretrained("bert-base-cased")
orig_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

state_dict = orig_bert.state_dict()
del state_dict["cls.predictions.decoder.weight"], state_dict["cls.predictions.bias"]

orig_embedding = state_dict["bert.embeddings.word_embeddings.weight"]

extra_tokens = list(tokenizer.vocab.keys())[len(orig_tokenizer.vocab) :]
new_tokens_as_orig_indices = [[i] for i in range(len(orig_tokenizer.vocab))] + [
    orig_tokenizer.encode(t, add_special_tokens=False) for t in extra_tokens
]

new_embedding = torch.zeros(len(new_tokens_as_orig_indices), orig_embedding.shape[-1])
new_embedding.normal_(mean=0.0, std=0.02)
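The snippet stops after allocating the new embedding matrix. A plausible continuation, kept as a hedged sketch rather than the original author's code: each new row is set to the mean of the embeddings of its original sub-tokens, and the result is loaded into the model. This assumes the config.json used to build BertPretrain already declares the extended vocabulary size.

# Hedged continuation sketch, not the original code.
for new_idx, orig_indices in enumerate(new_tokens_as_orig_indices):
    if orig_indices:  # initialise each row from the mean of its sub-token embeddings
        new_embedding[new_idx] = orig_embedding[orig_indices].mean(dim=0)

state_dict["bert.embeddings.word_embeddings.weight"] = new_embedding
# strict=False: the extra classifier head and the deleted decoder weights have
# no pre-trained counterpart and stay at their fresh initialisation.
model.load_state_dict(state_dict, strict=False)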