# Assumes: from sentence_transformers import SentenceTransformer, losses
#          from torch.utils.data import DataLoader
#          import os
def sentenceTransformers(self, tokens, preprocess_obj, batch_size, num_epoch):
    model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    word_embedding_model = model._first_module()

    train_dataloader = DataLoader(self.train_examples, shuffle=True, batch_size=batch_size)
    train_loss = losses.CosineSimilarityLoss(model)

    # Register the domain-specific tokens and resize the embedding matrix to match
    print(tokens)
    word_embedding_model.tokenizer.add_tokens(list(tokens), special_tokens=True)
    word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=num_epoch,
              warmup_steps=100,
              output_path=os.path.join(os.getcwd(), "bureau/models/" + "ST"))
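# A minimal usage sketch (not from the original code): it assumes the method above
# lives on a trainer-style class whose self.train_examples is a list of
# sentence_transformers.InputExample pairs with similarity labels in [0, 1].
# The class name "STTrainer", the example sentences, and the "[INV]" token are
# illustrative only.
from sentence_transformers import InputExample

trainer = STTrainer()  # hypothetical enclosing class
trainer.train_examples = [
    InputExample(texts=["invoice total", "amount due"], label=0.9),
    InputExample(texts=["invoice total", "shipping address"], label=0.1),
]
trainer.sentenceTransformers(tokens={"[INV]"}, preprocess_obj=None,
                             batch_size=16, num_epoch=1)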
# Teacher Model: Model we want to distill to a smaller model
teacher_model_name = 'stsb-roberta-base-v2'
teacher_model = SentenceTransformer(teacher_model_name)

output_path = "output/model-distillation-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

use_layer_reduction = True

# There are two options to create a light and fast student model:
if use_layer_reduction:
    # 1) Create a smaller student model by using only some of the teacher layers
    student_model = SentenceTransformer(teacher_model_name)

    # Get the transformer model
    auto_model = student_model._first_module().auto_model

    # Which layers to keep from the teacher model. We equally spread the layers to keep over the original teacher
    # layers_to_keep = [5]
    # layers_to_keep = [3, 7]
    # layers_to_keep = [3, 7, 11]
    layers_to_keep = [1, 4, 7, 10]  # Keep 4 layers from the teacher
    # layers_to_keep = [0, 2, 4, 6, 8, 10]
    # layers_to_keep = [0, 1, 3, 4, 6, 7, 9, 10]

    logging.info("Remove layers from student. Only keep these layers: {}".format(layers_to_keep))
    new_layers = torch.nn.ModuleList([layer_module for i, layer_module in enumerate(auto_model.encoder.layer)
                                      if i in layers_to_keep])
    auto_model.encoder.layer = new_layers
    auto_model.config.num_hidden_layers = len(layers_to_keep)
else:
    # 2) The other option is to train a small model like TinyBERT to imitate the teacher.
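# Illustrative sanity check (not part of the original script, and assuming the
# layer-reduction branch above was taken): compare teacher and student embeddings
# before any distillation fine-tuning. In the full distillation setup the student
# would then be trained to reproduce the teacher's embeddings with an MSE
# objective. The example sentences are made up.
import torch.nn.functional as F

check_sentences = ["A man is playing a guitar.", "Someone plays an instrument."]
teacher_emb = teacher_model.encode(check_sentences, convert_to_tensor=True)
student_emb = student_model.encode(check_sentences, convert_to_tensor=True)
print(F.cosine_similarity(teacher_emb, student_emb, dim=-1))  # per-sentence agreement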
logging.info("Use identifier: {}".format(use_identifier)) if use_identifier: qry_idt = "[QRY] " doc_idt = "[DOC] " if base_model.startswith('cnt_training'): model = SentenceTransformer( '../output/cnt_training_microsoft_MiniLM-L12-H384-L6-mined_hard_neg-mean-pooling-no_identifier-epoch10-batchsize100-2021-04-09_22-25-20' ) model.max_seq_length = 300 ############# Remove layers if False: auto_model = model._first_module().auto_model layers_to_keep = [0, 2, 4, 6, 8, 10] print("Reduce model to {} layers".format(len(layers_to_keep))) new_layers = torch.nn.ModuleList([ layer_module for i, layer_module in enumerate(auto_model.encoder.layer) if i in layers_to_keep ]) auto_model.encoder.layer = new_layers auto_model.config.num_hidden_layers = len(layers_to_keep) ################### else: word_embedding_model = models.Transformer(base_model, max_seq_length=350)
if __name__ == '__main__':
    device = 'cuda'
    if os.path.isdir('/media/palm/BiggerData/dictionaries/'):
        root_data = '/media/palm/BiggerData/dictionaries/'
    elif os.path.isdir('/home/palm/PycharmProjects/cp/cp10-work/'):
        root_data = '/home/palm/PycharmProjects/cp'
    elif os.path.isdir('/home/palm/PycharmProjects/nlp/cp10-work'):
        root_data = '/home/palm/PycharmProjects/nlp/'
    else:
        raise ValueError('Well, something\'s wrong here')

    # Load the pretrained sentence model and freeze it; only a copy of its
    # embedding layer is kept trainable.
    eng_sm = SentenceTransformer(os.path.join(root_data, 'cp10-work'))
    eng_sm.requires_grad_(False)
    eng_sm.train(False)
    embeddings = copy.deepcopy(eng_sm._first_module().auto_model.embeddings).to(device)
    embeddings.requires_grad_(True)
    embeddings.train(True)

    dataset = SentenceTokenized(eng_sm.tokenizer, 'first', language='eng', true_only=True)
    model = AEPretrainedEmbedding(dataset.vocab_size, embeddings)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6, betas=(0.7, 0.999))
    schedule = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)
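# A minimal training-loop sketch (assumptions only: that SentenceTokenized yields
# tensors of token ids and that AEPretrainedEmbedding returns a scalar
# reconstruction loss when called on them; neither is shown above, so treat this
# purely as an illustration of how the optimizer and ReduceLROnPlateau scheduler
# fit together).
from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=32, shuffle=True)
for epoch in range(10):
    epoch_loss = 0.0
    for token_ids in loader:
        optimizer.zero_grad()
        loss = model(token_ids.to(device))  # assumed to return a scalar loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # ReduceLROnPlateau steps on a monitored metric (here the mean epoch loss),
    # not after every batch.
    schedule.step(epoch_loss / len(loader))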