# Build one MSE evaluator and one translation-accuracy evaluator per parallel
# dev file (tab-separated "source<TAB>target" pairs, gzip-compressed).
for dev_file in dev_files:
    logging.info("Create evaluator for " + dev_file)
    src_sentences = []
    trg_sentences = []
    with gzip.open(dev_file, 'rt', encoding='utf8') as fIn:
        for line in fIn:
            splits = line.strip().split('\t')
            # Guard against malformed/short lines: the original indexed
            # splits[1] unconditionally, which raises IndexError for any
            # line with fewer than two tab-separated columns.
            if len(splits) >= 2 and splits[0] != "" and splits[1] != "":
                src_sentences.append(splits[0])
                trg_sentences.append(splits[1])

    # Mean Squared Error (MSE) measures the (euclidean) distance between
    # teacher and student embeddings
    dev_mse = evaluation.MSEEvaluator(src_sentences, trg_sentences,
                                      name=os.path.basename(dev_file),
                                      teacher_model=teacher_model,
                                      batch_size=inference_batch_size)
    evaluators.append(dev_mse)

    # TranslationEvaluator computes the embeddings for all parallel sentences.
    # It then checks if the embedding of source[i] is the closest to target[i]
    # out of all available target sentences
    dev_trans_acc = evaluation.TranslationEvaluator(src_sentences, trg_sentences,
                                                    name=os.path.basename(dev_file),
                                                    batch_size=inference_batch_size)
    evaluators.append(dev_trans_acc)


##### Read cross-lingual Semantic Textual Similarity (STS) data ####
# Union of all languages that appear on either side of the parallel data.
all_languages = list(set(list(source_languages) + list(target_languages)))
sts_data = {}
# Build the dev-time evaluators, then launch training.
# Evaluator 1: English-German cross-lingual STS benchmark (Spearman/Pearson
# correlation of embedding similarity vs. gold scores).
logging.info("Read STS2017.en-de dataset")
evaluators = []
sts_reader = readers.STSDataReader('../datasets/', s1_col_idx=0, s2_col_idx=1, score_col_idx=2)

dev_data = SentencesDataset(examples=sts_reader.get_examples('STS2017.en-de.txt.gz'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(dev_dataloader, name='STS2017.en-de')
evaluators.append(evaluator_sts)

# Use XLNI.en-de dataset with MSE evaluation
# Evaluator 2: MSE between teacher and student embeddings on parallel XNLI data.
logging.info("Read XNLI.en-de dataset")
xnli_reader = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
xnli_reader.load_data('../datasets/xnli-en-de.txt.gz')

xnli_dataloader = DataLoader(xnli_reader, shuffle=False, batch_size=train_batch_size)
xnli_mse = evaluation.MSEEvaluator(xnli_dataloader, name='xnli-en-de')
evaluators.append(xnli_mse)


# Train the model
# main_score_function picks scores[-1], i.e. the last-appended evaluator
# (the XNLI MSE), as the score used for best-model selection.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[-1]),
          epochs=20,
          evaluation_steps=1000,
          warmup_steps=10000,
          scheduler='warmupconstant',
          output_path=output_path,
          save_best_model=True,
          optimizer_params= {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}
          )
# Baseline: run the STS dev evaluator on the teacher itself, so the student's
# later scores can be compared against it.
logging.info("Teacher Performance with {} dimensions:".format(teacher_model.get_sentence_embedding_dimension()))
dev_evaluator_sts(teacher_model)


# We train the student_model such that it creates sentence embeddings similar to the embeddings from the teacher_model
# For this, we need a large set of sentences. These sentences are embedded using the teacher model,
# and the student tries to mimic these embeddings. It is the same approach as used in: https://arxiv.org/abs/2004.09813
train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=False)
# Each dataset entry is a single-sentence "parallel" pair [[sent]]: the student
# learns to reproduce the teacher embedding of the same sentence.
train_data.add_dataset([[sent] for sent in train_sentences_nli], max_sentence_length=256)
train_data.add_dataset([[sent] for sent in train_sentences_wikipedia], max_sentence_length=256)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)


# We create an evaluator, that measure the Mean Squared Error (MSE) between the teacher and the student embeddings
# Source and target sentences are deliberately identical here: this measures
# how closely the student matches the teacher on the same inputs.
dev_sentences = dev_sentences_nli + dev_sentences_wikipedia
dev_evaluator_mse = evaluation.MSEEvaluator(dev_sentences, dev_sentences, teacher_model=teacher_model)

# Train the student model to imitate the teacher
student_model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluation.SequentialEvaluator([dev_evaluator_sts, dev_evaluator_mse]),
                  epochs=1,
                  warmup_steps=1000,
                  evaluation_steps=5000,
                  output_path=output_path,
                  save_best_model=True,
                  optimizer_params={'lr': 1e-4, 'eps': 1e-6, 'correct_bias': False},
                  use_amp=True)
# NOTE(review): this chunk begins mid-statement (the first line is the tail of
# a data-loading call opened before this view) and the final student.fit(...)
# call is truncated — keep both edges as-is.
                    max_sentences=None, max_sentence_length=train_conf.getint('MaxSentLen'))
train_dataloader = DataLoader(train_data, batch_size=train_conf.getint('BatchSize'))
#train_loss = CosineSimilarityLoss(model=student_model)
train_loss = losses.MSELoss(model=student)

logging.info('Assembling evaluator')
# Dev set: headerless TSV; quoting=3 is csv.QUOTE_NONE, so quote characters in
# the sentences are treated literally. Column 0 = source, column 1 = target —
# presumably a parallel sentence file; verify against the dataset config.
df = pd.read_csv(data_path / datasets['DevSet'], sep='\t', header=None, quoting=3)
dev_mse_evaluator = evaluation.MSEEvaluator(
    df.iloc[:, 0], df.iloc[:, 1], name='Dev-MSE-evaluator',
    teacher_model=teacher, batch_size=eval_conf.getint('BatchSize'))

logging.info('Fitting..')
# Timestamped run directory. Fixed UTC+2 offset (no DST handling) — presumably
# the author's local timezone; confirm before relying on the timestamps.
dt = datetime.datetime.now(tz=datetime.timezone(datetime.timedelta(
    hours=2))).strftime("%Y-%m-%d-%H:%M:%S")
output_path = output_path / dt
# All hyperparameters come from the parsed config sections (train_conf /
# eval_conf / config['SCHEDULER']). Call is truncated in this view.
student.fit(train_objectives=[(train_dataloader, train_loss)],
            evaluator=dev_mse_evaluator,
            epochs=train_conf.getint('Epochs'),
            steps_per_epoch=train_conf.getint('Steps'),
            scheduler=config['SCHEDULER']['Scheduler'],
            warmup_steps=train_conf.getint('WarmUp'),
            evaluation_steps=eval_conf.getint('EvalSteps'),