modules=[word_embedding_model, pooling_model])


###### Read Parallel Sentences Dataset ######
train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model,
                                      batch_size=inference_batch_size, use_embedding_cache=True)
for train_file in train_files:
    train_data.load_data(train_file, max_sentences=max_sentences_per_language,
                         max_sentence_length=train_max_sentence_length)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)


#### Evaluate cross-lingual performance on different tasks #####
evaluators = []  # evaluators is a list of evaluator classes that we call periodically during training

for dev_file in dev_files:
    logging.info("Create evaluator for " + dev_file)
    src_sentences = []
    trg_sentences = []
    with gzip.open(dev_file, 'rt', encoding='utf8') as fIn:
        for line in fIn:
            splits = line.strip().split('\t')
            if splits[0] != "" and splits[1] != "":
                src_sentences.append(splits[0])
                trg_sentences.append(splits[1])
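With the parallel dev sentences loaded, each dev file is typically wrapped in evaluators that run periodically during distillation. The following is a minimal sketch of that pattern using sentence-transformers' MSEEvaluator (distance between the teacher's source-sentence embeddings and the student's target-sentence embeddings) and TranslationEvaluator (nearest-neighbour translation accuracy in the student's embedding space); the two constructions would sit inside the for dev_file loop above, and apart from the imports every name comes from the snippet itself.

import os
from sentence_transformers import evaluation

# Placed inside the "for dev_file in dev_files" loop, after src/trg sentences are read.
# MSE between teacher embeddings of the source sentences and student embeddings
# of the target sentences -- lower is better.
dev_mse = evaluation.MSEEvaluator(src_sentences, trg_sentences,
                                  name=os.path.basename(dev_file),
                                  teacher_model=teacher_model,
                                  batch_size=inference_batch_size)
evaluators.append(dev_mse)

# Translation accuracy: for each source sentence, is its aligned target sentence
# the nearest neighbour in the student's embedding space?
dev_trans_acc = evaluation.TranslationEvaluator(src_sentences, trg_sentences,
                                                name=os.path.basename(dev_file),
                                                batch_size=inference_batch_size)
evaluators.append(dev_trans_acc)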
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# logging.info("Loading previously trained student-teacher model")
# model = SentenceTransformer('models/hindi-sxlmr-stmodel')

output_path = 'models/se-asian-sbert'

logging.info("Create dataset reader")

###### Read Dataset ######
train_file_path = 'train_southeast_asian_parallel_corpus.txt'
train_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
train_data.load_data(train_file_path)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model)

###### Load dev sets ######
# Evaluate with Spearman rank correlation on Hindi claim-pair data
logging.info("Read dev dataset")
evaluators = []
claim_pair_reader = ClaimPairDataReader()
dev_data = SentencesDataset(examples=claim_pair_reader.get_examples(split='train', language='hi'), model=model)
# dev_file_path = 'test_southeast_asian_parallel_corpus.txt'
# dev_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
# dev_data.load_data(dev_file_path)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(dev_dataloader, name='SE Asian Test Data')
evaluators.append(evaluator_sts)
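For completeness, the following is a minimal sketch of how the pieces above are typically tied together to launch the distillation run: the evaluator list is combined with a SequentialEvaluator and passed to model.fit() alongside the MSE training objective. The names num_epochs, num_warmup_steps and num_evaluation_steps are placeholders assumed to be defined elsewhere in the script; everything else comes from the code above.

import numpy as np
from sentence_transformers import evaluation

# Assumed placeholders: num_epochs, num_warmup_steps, num_evaluation_steps.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluation.SequentialEvaluator(evaluators,
                                                   main_score_function=lambda scores: np.mean(scores)),
          epochs=num_epochs,
          warmup_steps=num_warmup_steps,
          evaluation_steps=num_evaluation_steps,
          output_path=output_path,
          save_best_model=True)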