Example #1
    def sentenceTransformers(self, tokens, preprocess_obj, batch_size,
                             num_epoch):

        model = SentenceTransformer('distilbert-base-nli-mean-tokens')
        # _first_module() returns the underlying Transformer module, which exposes the tokenizer and the HF model.
        word_embedding_model = model._first_module()
        train_dataloader = DataLoader(self.train_examples,
                                      shuffle=True,
                                      batch_size=batch_size)
        train_loss = losses.CosineSimilarityLoss(model)
        print(tokens)
        # Register the domain-specific tokens and grow the embedding matrix to match the enlarged vocabulary.
        word_embedding_model.tokenizer.add_tokens(list(tokens),
                                                  special_tokens=True)
        word_embedding_model.auto_model.resize_token_embeddings(
            len(word_embedding_model.tokenizer))
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  epochs=num_epoch,
                  warmup_steps=100,
                  output_path=os.path.join(os.getcwd(),
                                           "bureau/models/" + "ST"))
Example #2
# Teacher model: the model we want to distill into a smaller student model
teacher_model_name = 'stsb-roberta-base-v2'
teacher_model = SentenceTransformer(teacher_model_name)

output_path = "output/model-distillation-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


use_layer_reduction = True

# There are two options to create a light and fast student model:
if use_layer_reduction:
    # 1) Create a smaller student model by using only some of the teacher layers
    student_model = SentenceTransformer(teacher_model_name)

    # Get the transformer model
    auto_model = student_model._first_module().auto_model

    # Which layers to keep from the teacher model. The kept layers are spread evenly across the teacher's depth.
    #layers_to_keep = [5]
    #layers_to_keep = [3, 7]
    #layers_to_keep = [3, 7, 11]
    layers_to_keep = [1, 4, 7, 10]          #Keep 4 layers from the teacher
    #layers_to_keep = [0, 2, 4, 6, 8, 10]
    #layers_to_keep = [0, 1, 3, 4, 6, 7, 9, 10]

    logging.info("Remove layers from student. Only keep these layers: {}".format(layers_to_keep))
    new_layers = torch.nn.ModuleList([layer_module for i, layer_module in enumerate(auto_model.encoder.layer) if i in layers_to_keep])
    auto_model.encoder.layer = new_layers
    auto_model.config.num_hidden_layers = len(layers_to_keep)
else:
    # 2) The other option is to train a small model like TinyBERT to imitate the teacher.
    student_model = SentenceTransformer('nreimers/TinyBERT_L-4_H-312_v2')  # any small pretrained checkpoint can serve as the student
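Whichever option is used, the student is then trained to reproduce the teacher's sentence embeddings. A minimal sketch of that training step, assuming student_model exists as built above and using a hypothetical train_sentences list; the labels passed to MSELoss are the teacher's vectors:

from torch.utils.data import DataLoader
from sentence_transformers import InputExample, losses

train_sentences = ["First example sentence.", "Another example sentence."]  # hypothetical corpus

# One sentence per InputExample; the label is the teacher embedding the student should reproduce.
train_data = [InputExample(texts=[sentence], label=teacher_model.encode(sentence))
              for sentence in train_sentences]

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=32)
train_loss = losses.MSELoss(model=student_model)

student_model.fit(train_objectives=[(train_dataloader, train_loss)],
                  epochs=1,
                  warmup_steps=100,
                  output_path=output_path)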
Example #3
    logging.info("Use identifier: {}".format(use_identifier))

    if use_identifier:
        # Prefixes that mark queries and documents so the encoder can tell them apart.
        qry_idt = "[QRY] "
        doc_idt = "[DOC] "

    if base_model.startswith('cnt_training'):
        model = SentenceTransformer(
            '../output/cnt_training_microsoft_MiniLM-L12-H384-L6-mined_hard_neg-mean-pooling-no_identifier-epoch10-batchsize100-2021-04-09_22-25-20'
        )
        model.max_seq_length = 300

        ############# Remove layers
        if False:
            auto_model = model._first_module().auto_model
            layers_to_keep = [0, 2, 4, 6, 8, 10]
            print("Reduce model to {} layers".format(len(layers_to_keep)))
            new_layers = torch.nn.ModuleList([
                layer_module
                for i, layer_module in enumerate(auto_model.encoder.layer)
                if i in layers_to_keep
            ])
            auto_model.encoder.layer = new_layers
            auto_model.config.num_hidden_layers = len(layers_to_keep)

    ###################
    else:
        word_embedding_model = models.Transformer(base_model,
                                                  max_seq_length=350)
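When use_identifier is set, the [QRY]/[DOC] strings also need to be registered with the tokenizer, and a pooling layer is still required to turn the bare Transformer into a full SentenceTransformer. A minimal sketch of those steps, continuing the models.Transformer branch and assuming mean pooling; the add_tokens/resize calls mirror Example #1 and the query/document strings are only illustrative:

from sentence_transformers import SentenceTransformer, models

# Register the identifiers as special tokens so they are not split into word pieces.
word_embedding_model.tokenizer.add_tokens(['[QRY]', '[DOC]'], special_tokens=True)
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

# Mean pooling over the token embeddings produces the fixed-size sentence vector.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode='mean')
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Queries and documents are prefixed with their identifier before encoding.
query_embedding = model.encode(qry_idt + "how to train a bi-encoder")
doc_embedding = model.encode(doc_idt + "Bi-encoders map queries and documents into one vector space.")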
Example #4
if __name__ == '__main__':
    device = 'cuda'
    # Pick the dataset root depending on which machine the script is running on.
    if os.path.isdir('/media/palm/BiggerData/dictionaries/'):
        root_data = '/media/palm/BiggerData/dictionaries/'
    elif os.path.isdir('/home/palm/PycharmProjects/cp/cp10-work/'):
        root_data = '/home/palm/PycharmProjects/cp'
    elif os.path.isdir('/home/palm/PycharmProjects/nlp/cp10-work'):
        root_data = '/home/palm/PycharmProjects/nlp/'
    else:
        raise ValueError('Well, something\'s wrong here')
    # The reference model stays frozen; it supplies the tokenizer and the pretrained embedding weights.
    eng_sm = SentenceTransformer(os.path.join(root_data, 'cp10-work'))
    eng_sm.requires_grad_(False)
    eng_sm.train(False)

    # Train on an independent copy of the frozen encoder's embedding layer.
    embeddings = copy.deepcopy(
        eng_sm._first_module().auto_model.embeddings).to(device)
    embeddings.requires_grad_(True)
    embeddings.train(True)
    dataset = SentenceTokenized(eng_sm.tokenizer,
                                'first',
                                language='eng',
                                true_only=True)

    model = AEPretrainedEmbedding(dataset.vocab_size, embeddings)
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=1e-6,
                                  betas=(0.7, 0.999))
    schedule = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                          patience=2,