Example #1
import json

from transformers import AutoTokenizer, BertConfig, BertModel


class BertExtractor:  # base class assumed for this excerpt; the original may subclass something else
    def __init__(
        self,
        pretrained_bert_model,
        language,
        name,
        prediction_type,
        output_hidden_states,
        output_attentions,
        attention_length_before=1,
        attention_length_after=1,
        config_path=None,
        max_length=512,
        number_of_sentence=1,
        number_of_sentence_before=0,
        number_of_sentence_after=0,
        seed=1111,
        hidden_dropout_prob=0.,
        attention_probs_dropout_prob=0.,
        stop_attention_at_sent_before=None,
        stop_attention_before_sent=0,
    ):
        super(BertExtractor, self).__init__()
        # Load pre-trained model tokenizer (vocabulary)
        # Crucially, do not do basic tokenization; PTB is tokenized. Just do wordpiece tokenization.
        if config_path is None:
            configuration = BertConfig()
            configuration.hidden_dropout_prob = hidden_dropout_prob
            configuration.attention_probs_dropout_prob = attention_probs_dropout_prob
            configuration.output_hidden_states = output_hidden_states
            configuration.output_attentions = output_attentions
            self.model = BertModel.from_pretrained(
                pretrained_bert_model, config=configuration)
        else:
            # A JSON config file was supplied: load the pretrained defaults here
            # and merge the file into `self.config` below.
            self.model = BertModel.from_pretrained(pretrained_bert_model)
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_bert_model)

        self.language = language
        self.attention_length_before = attention_length_before
        self.attention_length_after = attention_length_after
        self.pretrained_bert_model = pretrained_bert_model
        self.NUM_HIDDEN_LAYERS = self.model.config.num_hidden_layers
        self.FEATURE_COUNT = self.model.config.hidden_size
        self.NUM_ATTENTION_HEADS = self.model.config.num_attention_heads
        self.name = name
        self.config = {
            'max_length': max_length,
            'seed': seed,
            'number_of_sentence': number_of_sentence,
            'number_of_sentence_before': number_of_sentence_before,
            'number_of_sentence_after': number_of_sentence_after,
            'attention_length_before': attention_length_before,
            'attention_length_after': attention_length_after,
            'stop_attention_at_sent_before': stop_attention_at_sent_before,
            'stop_attention_before_sent': stop_attention_before_sent,
            'output_hidden_states': output_hidden_states,
            'output_attentions': output_attentions,
            'model_type': 'bert',
            'hidden_size': self.model.config.hidden_size,
            'hidden_act': self.model.config.hidden_act,
            'initializer_range': self.model.config.initializer_range,
            'vocab_size': self.model.config.vocab_size,
            'hidden_dropout_prob': self.model.config.hidden_dropout_prob,
            'num_attention_heads': self.model.config.num_attention_heads,
            'type_vocab_size': self.model.config.type_vocab_size,
            'max_position_embeddings': self.model.config.max_position_embeddings,
            'num_hidden_layers': self.model.config.num_hidden_layers,
            'intermediate_size': self.model.config.intermediate_size,
            'attention_probs_dropout_prob': self.model.config.attention_probs_dropout_prob,
        }
        if config_path is not None:
            with open(config_path, 'r') as f:
                self.config.update(json.load(f))

        self.prediction_type = prediction_type  # ['sentence', 'token-level']
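
For reference, a minimal instantiation sketch follows; the checkpoint name and
argument values are illustrative assumptions, not taken from the original code.

# Hypothetical usage sketch: values below are illustrative only.
extractor = BertExtractor(
    pretrained_bert_model='bert-base-cased',  # assumed checkpoint name
    language='english',
    name='bert-base-cased',
    prediction_type='sentence',
    output_hidden_states=True,
    output_attentions=False,
)
print(extractor.NUM_HIDDEN_LAYERS, extractor.FEATURE_COUNT)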
Example #2
import numpy as np
import poptorch
import torch
from transformers import BertTokenizer

# `BertConfig`, `parse_bert_args`, `get_options`, and `PipelinedBertWithLoss`
# are provided by the surrounding repository (their import paths are omitted here).


def test_ipu_cpu_match(recompute_checkpoint, embedding_serialization):
    """
    Test that the BERT model run on the IPU approximately matches the same
    model run on the CPU.
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Config
    args = """
    --config unit_test
    --lr-schedule constant
    --layers-per-ipu 0 3
    --vocab-size 30400
    --batch-size 10
    --batches-per-step 1
    --gradient-accumulation 10
    --enable-half-partials False
    --optimizer AdamW
    --learning-rate 0.001
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    config.hidden_dropout_prob = 0.0
    config.attention_probs_dropout_prob = 0.0
    config.recompute_checkpoint_every_layer = recompute_checkpoint
    config.embedding_serialization = embedding_serialization

    # Models and options
    opts = get_options(config)
    opts.anchorMode(poptorch.AnchorMode.Final)
    model_cpu = PipelinedBertWithLoss(config).train()
    model_ipu = PipelinedBertWithLoss(config).train()
    model_ipu.load_state_dict(model_cpu.state_dict())

    # Check that the copy was successful
    assert model_ipu is not model_cpu
    assert all((a == b).all() for a, b in
               zip(model_cpu.parameters(), model_ipu.parameters()))

    optimizer_cpu = torch.optim.AdamW(model_cpu.parameters(), lr=0.001)
    optimizer_ipu = poptorch.optim.AdamW(model_ipu.parameters(), lr=0.001, loss_scaling=1.0)
    poptorch_model = poptorch.trainingModel(model_ipu, opts, optimizer=optimizer_ipu)

    # Input
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    inputs = tokenizer("Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
                       "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
                       "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
                       "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute", return_tensors="pt")
    inputs['labels'] = torch.randint(0, config.vocab_size, [1, config.mask_tokens], dtype=torch.long)
    inputs['next_sentence_label'] = torch.randint(0, 1, [1], dtype=torch.long)
    inputs['masked_lm_positions'] = torch.randint(0, config.sequence_length, [1, config.mask_tokens], dtype=torch.long)

    batch_size = config.batch_size

    batch = (inputs['input_ids'].repeat(batch_size, 1),
             inputs['attention_mask'].repeat(batch_size, 1),
             inputs['token_type_ids'].repeat(batch_size, 1),
             inputs['masked_lm_positions'].repeat(batch_size, 1),
             inputs['labels'].repeat(batch_size, 1),
             inputs['next_sentence_label'].repeat(batch_size, 1))

    batch_cpu = (inputs['input_ids'].repeat(1, 1),
                 inputs['attention_mask'].repeat(1, 1),
                 inputs['token_type_ids'].repeat(1, 1),
                 inputs['masked_lm_positions'].repeat(1, 1),
                 inputs['labels'].repeat(1, 1),
                 inputs['next_sentence_label'].repeat(1, 1))

    # Training Loop
    for step in range(10):
        # Step the CPU model: run the single-sample batch `batch_size` times
        # and average the gradients before stepping the optimizer.
        optimizer_cpu.zero_grad()
        for _ in range(batch_size):
            cpu_output = model_cpu(*batch_cpu)
            cpu_loss = cpu_output[0]
            cpu_loss.div(batch_size).backward()
        optimizer_cpu.step()

        # Step IPU Model
        ipu_output = poptorch_model(*batch)
        ipu_loss = ipu_output[0]

        with torch.no_grad():
            print(f"CPU Loss: {cpu_loss}, IPU Loss: {ipu_loss}")
            # Check the losses are approximately equal
            assert np.allclose(cpu_loss.detach().numpy(), ipu_loss.detach().numpy(), atol=1e-6)
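
In a pytest suite, the two test arguments would normally be supplied by
parametrization; the decorator values below are an illustrative sketch, not the
original suite's settings.

import pytest

# Hypothetical parametrization for the test above; the value lists are assumptions.
@pytest.mark.parametrize("recompute_checkpoint", [False, True])
@pytest.mark.parametrize("embedding_serialization", [1, 5])
def test_ipu_cpu_match(recompute_checkpoint, embedding_serialization):
    ...  # body as defined above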