def test_LabelAccuracyEvaluator(self):
    model = SentenceTransformer('paraphrase-distilroberta-base-v1')
    nli_dataset_path = 'datasets/AllNLI.tsv.gz'
    if not os.path.exists(nli_dataset_path):
        util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path)

    label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
    dev_samples = []
    with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if row['split'] == 'train':
                label_id = label2int[row['label']]
                dev_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=label_id))
                if len(dev_samples) >= 100:
                    break

    train_loss = losses.SoftmaxLoss(model=model,
                                    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                    num_labels=len(label2int))

    dev_dataloader = DataLoader(dev_samples, shuffle=False, batch_size=16)
    evaluator = evaluation.LabelAccuracyEvaluator(dev_dataloader, softmax_model=train_loss)
    acc = evaluator(model)
    assert acc > 0.2
class TransformerSentencesEmbedding:
    def __init__(self):
        self.model = SentenceTransformer('bert-base-nli-mean-tokens')
        self.embeddings_dim = self.model.get_sentence_embedding_dimension()

    def sentences_encode(self, sentences):
        return self.model.encode(sentences)
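# Hypothetical usage sketch (not part of the original snippet): encode a couple of
# sentences and check that each returned vector matches the reported dimensionality.
# The example sentences are illustrative only.
encoder = TransformerSentencesEmbedding()
vectors = encoder.sentences_encode(["A man is eating food.", "A man is riding a horse."])
assert len(vectors[0]) == encoder.embeddings_dim  # 768 for 'bert-base-nli-mean-tokens'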
class SentenceTransformerRecSys(KeyedVectorRecSys):
    model_name_or_path = None
    batch_size = 12
    language_model = None

    def train(self, texts: List):
        from sentence_transformers import SentenceTransformer

        # load sentence transformer model
        if not self.language_model:
            logger.info(f'Loading Sentence Transformer: {self.model_name_or_path}')
            self.language_model = SentenceTransformer(self.model_name_or_path)

        # reset doc vector model
        self.model = KeyedVectors(vector_size=self.language_model.get_sentence_embedding_dimension())

        # encode
        sentence_embeddings = self.language_model.encode(texts,
                                                         batch_size=self.batch_size,
                                                         show_progress_bar=self.print_progress)

        # save into keyed vector
        for idx, vec in enumerate(sentence_embeddings):
            self.model.add([str(self.idx2doc_id[idx])], [vec])

        return self.model
def __init__(self, model_names):
    self.emb_dim = 0
    # model_names is a list of sentence-transformer model names to load and combine
    for i, arg in enumerate(model_names):
        sentence_model = SentenceTransformer(arg)
        self.emb_dim += sentence_model.get_sentence_embedding_dimension()
        new_model_att = {"model_" + str(i): sentence_model}
        self.__dict__.update(new_model_att)
class SentenceTransformerDocumentEmbeddings(DocumentEmbeddings):
    def __init__(
        self,
        model: str = "bert-base-nli-mean-tokens",
        batch_size: int = 1,
        convert_to_numpy: bool = False,
    ):
        """
        :param model: string name of a model from the SentenceTransformer class
        :param batch_size: number of sentences to process in one batch
        :param convert_to_numpy: whether encode() returns a numpy array or a PyTorch tensor
        """
        super().__init__()

        try:
            from sentence_transformers import SentenceTransformer
        except ModuleNotFoundError:
            log.warning("-" * 100)
            log.warning('ATTENTION! The library "sentence-transformers" is not installed!')
            log.warning('To use Sentence Transformers, please first install with "pip install sentence-transformers"')
            log.warning("-" * 100)

        self.model = SentenceTransformer(model)
        self.name = 'sentence-transformers-' + str(model)
        self.batch_size = batch_size
        self.convert_to_numpy = convert_to_numpy
        self.static_embeddings = True

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
        sentence_batches = [sentences[i * self.batch_size:(i + 1) * self.batch_size]
                            for i in range((len(sentences) + self.batch_size - 1) // self.batch_size)]

        for batch in sentence_batches:
            self._add_embeddings_to_sentences(batch)

        return sentences

    def _add_embeddings_to_sentences(self, sentences: List[Sentence]):
        # convert to plain strings, embedded in a list for the encode function
        sentences_plain_text = [sentence.to_plain_string() for sentence in sentences]

        embeddings = self.model.encode(sentences_plain_text, convert_to_numpy=self.convert_to_numpy)
        for sentence, embedding in zip(sentences, embeddings):
            sentence.set_embedding(self.name, embedding)

    @property
    def embedding_length(self) -> int:
        """Returns the length of the embedding vector."""
        return self.model.get_sentence_embedding_dimension()
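# Hypothetical usage sketch (not part of the original snippet), assuming flair's
# Sentence class and the embed() method inherited from the DocumentEmbeddings base.
from flair.data import Sentence

embedder = SentenceTransformerDocumentEmbeddings('bert-base-nli-mean-tokens')
doc = Sentence("Berlin is the capital of Germany.")
embedder.embed([doc])
print(doc.get_embedding().shape)  # length equals embedder.embedding_length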
def test_simple_sentence_transformers_from_disk(self):
    model = SentenceTransformer(
        self.env['datasets_dir'] + '/sentence_transformers/bert-base-nli-mean-tokens')

    # sentence_embeddings = model.encode(self.texts)
    #
    # for sentence, embedding in zip(self.texts, sentence_embeddings):
    #     print("Sentence:", sentence)
    #     print("Embedding:", embedding)
    #     print("")

    self.assertEqual(768, model.get_sentence_embedding_dimension())
def __init__(self,
             name_model: str = "roberta-base-nli-stsb-mean-tokens",
             device: str = None,
             multiple_sentences: bool = False) -> None:
    self.name_model = name_model
    self.device = device
    self.multiple_sentences = multiple_sentences

    # Load the model
    if self.name_model in MODELS:
        model = SentenceTransformer(self.name_model, device=self.device)
        self.embding_dim = model.get_sentence_embedding_dimension()
        self.model = model
    else:
        raise ValueError(f"Unknown sentence-transformers model: {self.name_model!r}")
def train_sbert(model_name, model_save_path):
    batch_size = 16
    nli_reader, sts_reader = load_dataset()
    train_num_labels = nli_reader.get_num_labels()

    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read AllNLI train dataset")
    train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    train_loss = losses.SoftmaxLoss(model=model,
                                    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                    num_labels=train_num_labels)

    logging.info("Read STSbenchmark dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    # Configure the training
    num_epochs = 1
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)

    model = SentenceTransformer(model_save_path)
    test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
    model.evaluate(evaluator)
def build_model(num_labels):
    model_name = 'bert-base-uncased'

    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    train_loss = new_softmax_loss.SoftmaxLoss(model=model,
                                              sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                              num_labels=num_labels,
                                              num_vectors=3)
    return model, train_loss
class CachedSentenceTransformer:
    def __init__(self, model_name: str):
        super().__init__()
        self._model = SentenceTransformer(model_name)
        self._cache = Cache(cache_directory / model_name)

    def featurize(self, sentences: List[str]) -> np.ndarray:
        result = []
        for sentence in sentences:
            if sentence in self._cache:
                vec = self._cache[sentence]
            else:
                vec = self._model.encode(sentence).squeeze()
                self._cache[sentence] = vec
            result.append(vec)
        return np.array(result)

    def get_dimension(self) -> int:
        return self._model.get_sentence_embedding_dimension()
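# Hypothetical usage sketch (not part of the original snippet): repeated calls with
# the same sentences are served from the cache instead of being re-encoded.
# Assumes `cache_directory` is a pathlib.Path defined elsewhere in the module.
featurizer = CachedSentenceTransformer('paraphrase-distilroberta-base-v1')
matrix = featurizer.featurize(["first sentence", "second sentence"])
assert matrix.shape == (2, featurizer.get_dimension())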
def __init__(
    self,
    path_embedding_sentences: Union[str, Path],
    path_cuases: Union[str, Path],
    path_embedding_causes: Union[str, Path],
    by_augmentation: int = 3,
    sentences_embedding_model: str = "roberta-base-nli-stsb-mean-tokens"
):
    self.path_embedding_sentences = path_embedding_sentences
    self.path_embedding_causes = path_embedding_causes
    self.path_cuases = path_cuases
    self.by_augmentation = by_augmentation
    self.sentences_embedding_model = sentences_embedding_model

    # Load the model
    if self.sentences_embedding_model in MODELS:
        model = SentenceTransformer(self.sentences_embedding_model)
        self.embding_dim = model.get_sentence_embedding_dimension()
        self.model = model
    else:
        raise ValueError(f"Unknown sentence embedding model: {self.sentences_embedding_model!r}")
class SentenceVectorizer(BaseVectorizer):
    """Vectorize text by using sentence transformers
    https://github.com/UKPLab/sentence-transformers
    """

    CONF_KEY_TRAINED_MODEL_PATH = "model_path"
    CONF_KEY_TRANSFORMER_MODEL_NAME = "transformer_model_name"

    def __init__(self, model_path: str, transformer_model_name: str):
        # Reference:
        # https://github.com/UKPLab/sentence-transformers/blob/e0aa596a0397a41ba69f75c1124318f0cb1dceca/sentence_transformers/models/Transformer.py
        self.model = SentenceTransformer(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)
        self.model.tokenizer = self.tokenizer
        self.word_embedding_dim = self.model.get_sentence_embedding_dimension()

    @classmethod
    def create(cls: "SentenceVectorizer", config: Dict[str, Any]):
        return cls(
            config[cls.CONF_KEY_TRAINED_MODEL_PATH],
            config[cls.CONF_KEY_TRANSFORMER_MODEL_NAME],
        )

    def encode(self, sentences: List[str], padding: bool = True):
        if len(sentences) == 0:
            return None
        return self.model.tokenizer(sentences, return_tensors="pt", padding=padding)

    def decode(self, encode_result):
        return self.model.tokenizer.convert_ids_to_tokens(
            encode_result.input_ids.flatten().tolist())

    def vectorize(self, sentences):
        vectors = self.model.encode(sentences)
        return [vectors[i][:].tolist() for i in range(len(sentences))]
if os.path.isfile(labels_file):
    os.remove(os.path.join(curr_dir, "prediction_labels.csv"))
if os.path.isfile(pred_file):
    os.remove(os.path.join(curr_dir, "prediction_results.csv"))

# Model path
model_save_path = curr_dir
batch_size = 24

agb_reader = TestAGBReader('datasets/og-test')
train_num_labels = agb_reader.get_num_labels()

model = SentenceTransformer(model_save_path, device="cpu")
train_loss = losses.SoftmaxLoss(model=model,
                                sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                num_labels=train_num_labels)
train_loss.classifier = torch.load(os.path.join(model_save_path, "2_Softmax/pytorch_model.bin"))

print("test")
test_dir = "/data/daumiller/sentence-transformers/examples/datasets/og-test"
for fn in sorted(os.listdir(test_dir)):
    examples = agb_reader.get_examples(fn)
    if not examples:
        continue

    # Hack to avoid problems with docs almost as long as batch size
    if len(examples) == batch_size + 1:
        batch_size_used = batch_size - 3
    else:
        batch_size_used = batch_size

    test_data = SentencesDataset(examples=examples, model=model, shorten=True)
word_embedding_model = models.BERT('bert-base-uncased')

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(model=model,
                                sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                num_labels=train_num_labels)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training
num_epochs = 1
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
for row in reader:
    if row['split'] == 'dev':
        label_id = label2int[row['label']]
        acc_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=label_id))

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
acc_dataloader = DataLoader(acc_samples, shuffle=True, batch_size=train_batch_size)

print("sent embed:", model.get_sentence_embedding_dimension())

# train_loss = losses.BatchSemiHardTripletLoss(model=model)
train_loss = losses.SoftmaxLoss(model=model,
                                sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                num_labels=len(label2int))

# Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['split'] == 'dev':
            score = float(
    return avg_abs_emb


######################################################
####################### CORPUS #######################
######################################################

# SciSpacy model to tokenize text
print("-------- Loading scispacy en_core_sci_sm model --------")
nlp = en_core_sci_sm.load(disable=['ner', 'tagger'])
nlp.max_length = 2000000

# Sentence Transformer model
logger.info("-------- Loading SentenceTransformer model --------")
embedder = SentenceTransformer(options.model)
dim = embedder.get_sentence_embedding_dimension()

# Corpus
logger.info("-------- Building corpus --------")
df_docs.title = df_docs.title.fillna("")
df_docs.abstract = df_docs.abstract.fillna("")
df_docs.fulltext = df_docs.fulltext.fillna("")

corpus_list = []
name_corpus_list = []
if options.fulltext:
    fulltext_corpus = df_docs.fulltext.to_list()
    corpus_list.append(fulltext_corpus)
    name_corpus_list.append("fulltext")
if options.abstract:
    abstract_corpus = df_docs.abstract.to_list()
def extract():
    for id_model, m in enumerate(args.model_name):
        print('****************************************************************')
        print('EXTRACTION MODEL: %s' % m)

        if args.text_output_split[id_model]:
            # create directories for split
            if not os.path.exists('../data/{0}/original/{1}_{2}'.format(
                    args.dataset, args.input_file, m.lower())):
                os.makedirs('../data/{0}/original/{1}_{2}'.format(
                    args.dataset, args.input_file, m.lower()))

        # model setting
        text_model = SentenceTransformer(args.model_name[id_model])

        # dataset setting
        data = read_csv('../data/{0}/original/{1}.tsv'.format(args.dataset, args.input_file), sep='\t')
        print('Loaded dataset from %s' % descriptions_path.format(args.dataset))

        # text features
        text_features = np.empty(shape=[len(data), text_model.get_sentence_embedding_dimension()])

        # features extraction
        print('Starting extraction...\n')
        start = time.time()
        for index, row in data.iterrows():
            # text features extraction
            text_features[index] = text_model.encode(sentences=str(row[args.column]))

            if (index + 1) % args.print_each == 0:
                sys.stdout.write('\r%d/%d samples completed' % (index + 1, len(data)))
                sys.stdout.flush()
        end = time.time()
        print('\n\nFeature extraction completed in %f seconds.' % (end - start))

        if args.normalize:
            text_features = text_features / np.max(np.abs(text_features))

        if args.text_output_split[id_model]:
            for d in range(len(data)):
                save_np(npy=text_features[d],
                        filename='../data/{0}/original/{1}_{2}'.format(
                            args.dataset, args.input_file, m.lower()) + str(d) + '.npy')
            print('Saved text features numpy to ==> %s' % text_features_dir.format(args.dataset, m.lower()))
        else:
            save_np(npy=text_features,
                    filename='../data/{0}/original/{1}_{2}.npy'.format(
                        args.dataset, args.input_file, m.lower()))
            print('Saved text features numpy to ==> %s' % '../data/{0}/original/{1}_{2}.npy'.format(
                args.dataset, args.input_file, m.lower()))
dev_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['split'] == 'dev':
            score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
            dev_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score))

dev_evaluator_sts = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

logging.info("Teacher Performance:")
dev_evaluator_sts(teacher_model)

# Student model has fewer dimensions. Compute PCA for the teacher to reduce the dimensions
if student_model.get_sentence_embedding_dimension() < teacher_model.get_sentence_embedding_dimension():
    logging.info("Student model has fewer dimensions than the teacher. Compute PCA for down projection")
    pca_sentences = train_sentences_nli[0:20000] + train_sentences_wikipedia[0:20000]
    pca_embeddings = teacher_model.encode(pca_sentences, convert_to_numpy=True)
    pca = PCA(n_components=student_model.get_sentence_embedding_dimension())
    pca.fit(pca_embeddings)

    # Add Dense layer to teacher that projects the embeddings down to the student embedding size
    dense = models.Dense(in_features=teacher_model.get_sentence_embedding_dimension(),
                         out_features=student_model.get_sentence_embedding_dimension(),
                         bias=False,
                         activation_function=torch.nn.Identity())
    dense.linear.weight = torch.nn.Parameter(torch.tensor(pca.components_))
    teacher_model.add_module('dense', dense)

    logging.info("Teacher Performance with {} dimensions:".format(teacher_model.get_sentence_embedding_dimension()))
    dev_evaluator_sts(teacher_model)
            sentence = line_source.strip()
            train_sent.append(sentence)

            if min_sent_len <= len(line_target.strip()) <= max_sent_len:
                sentence = line_target.strip()
                train_sent.append(sentence)

            if len(train_sent) >= num_train_sent:
                break

    print("Encode training embeddings for PCA")
    train_matrix = model.encode(train_sent, show_progress_bar=True, convert_to_numpy=True)
    pca = PCA(n_components=pca_dimensions)
    pca.fit(train_matrix)

    dense = models.Dense(in_features=model.get_sentence_embedding_dimension(),
                         out_features=pca_dimensions,
                         bias=False,
                         activation_function=torch.nn.Identity())
    dense.linear.weight = torch.nn.Parameter(torch.tensor(pca.components_))
    model.add_module('dense', dense)

print("Read source file")
source_sentences = set()
with file_open(source_file) as fIn:
    for line in tqdm.tqdm(fIn):
        line = line.strip()
        if len(line) >= min_sent_len and len(line) <= max_sent_len:
            source_sentences.add(line)

print("Read target file")
target_sentences = set()
with file_open(target_file) as fIn:
def train_nli():
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    # You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
    # model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
    model_name = 'pretrained_model/bert-base-uncased'

    # Read the dataset
    train_batch_size = 6
    nli_reader = NLIDataReader('./examples/datasets/AllNLI')
    sts_reader = STSBenchmarkDataReader('./examples/datasets/stsbenchmark')
    train_num_labels = nli_reader.get_num_labels()
    model_save_path = 'output/training_nli_' + model_name.replace("/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read AllNLI train dataset")
    train_dataset = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.SoftmaxLoss(model=model,
                                    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                    num_labels=train_num_labels)

    logging.info("Read STSbenchmark dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
    evaluator = LabelAccuracyEvaluator(dev_dataloader,
                                       softmax_model=Softmax_label(model=model,
                                                                   sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                                                   num_labels=train_num_labels))

    # Configure the training
    num_epochs = 1
    warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=100,
              warmup_steps=warmup_steps,
              output_path=model_save_path)

    ##############################################################################
    #
    # Load the stored model and evaluate its performance on STS benchmark dataset
    #
    ##############################################################################

    # model = SentenceTransformer(model_save_path)
    test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size)
    # evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
    model.evaluate(evaluator)
nli_sentences = list(nli_sentences)
random.shuffle(nli_sentences)

# To determine the PCA matrix, we need some example sentence embeddings.
# Here, we compute the embeddings for 20k random sentences from the AllNLI dataset
pca_train_sentences = nli_sentences[0:20000]
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)

# Compute PCA on the train embeddings matrix
pca = PCA(n_components=new_dimension)
pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)

# We add a dense layer to the model, so that it directly produces embeddings with the new size
dense = models.Dense(in_features=model.get_sentence_embedding_dimension(),
                     out_features=new_dimension,
                     bias=False,
                     activation_function=torch.nn.Identity())
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module('dense', dense)

# Evaluate the model with the reduced embedding size
logger.info("Model with {} dimensions:".format(new_dimension))
stsb_evaluator(model)

# If you like, you can store the model on disc by uncommenting the following line
# model.save('models/bert-base-nli-stsb-mean-tokens-128dim')

# You can then load the adapted model that produces 128 dimensional embeddings like this:
# model = SentenceTransformer('models/bert-base-nli-stsb-mean-tokens-128dim')
# ANN: Faster, but the recall will be lower
use_ann_search = True

# Number of clusters for ANN. The optimal number depends on the dataset size
ann_num_clusters = 32768

# How many clusters to explore during search. Higher number = better recall, but slower
ann_num_cluster_probe = 5

# To save memory, we can use PCA to reduce the dimensionality from 768 to, for example, 128 dimensions.
# The encoded embeddings will hence require 6 times less memory. However, we observe a small drop in performance.
use_pca = False
pca_dimensions = 128

# We store the embeddings on disc, so that they can later be loaded from disc
source_embedding_file = '{}_{}_{}.emb'.format(model_name, os.path.basename(source_file),
                                              pca_dimensions if use_pca else model.get_sentence_embedding_dimension())
target_embedding_file = '{}_{}_{}.emb'.format(model_name, os.path.basename(target_file),
                                              pca_dimensions if use_pca else model.get_sentence_embedding_dimension())

# Use PCA to reduce the dimensionality of the sentence embedding model
if use_pca:
    # We use a smaller number of training sentences to learn the PCA
    train_sent = []
    num_train_sent = 20000

    with open(source_file, encoding='utf8') as fSource, open(target_file, encoding='utf8') as fTarget:
        for line_source, line_target in zip(fSource, fTarget):
            id, sentence = line_source.strip().split("\t", maxsplit=1)
            train_sent.append(sentence)
            id, sentence = line_target.strip().split("\t", maxsplit=1)
for row in reader:
    if row['split'] == 'dev':
        score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
        dev_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score))

dev_evaluator_sts = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

logging.info("Teacher Performance:")
dev_evaluator_sts(teacher_model)

# Student model has fewer dimensions. Compute PCA for the teacher to reduce the dimensions
if student_model.get_sentence_embedding_dimension() < teacher_model.get_sentence_embedding_dimension():
    logging.info("Student model has fewer dimensions than the teacher. Compute PCA for down projection")
    pca_sentences = train_sentences_nli[0:25000]
    pca_embeddings = teacher_model.encode(pca_sentences, convert_to_numpy=True)
    pca = PCA(n_components=student_model.get_sentence_embedding_dimension())
    pca.fit(pca_embeddings)

    # Add Dense layer to teacher that projects the embeddings down to the student embedding size
    dense = models.Dense(in_features=teacher_model.get_sentence_embedding_dimension(),
                         out_features=student_model.get_sentence_embedding_dimension(),
                         bias=False,
                         activation_function=torch.nn.Identity())
    dense.linear.weight = torch.nn.Parameter(torch.tensor(pca.components_))
class SentenceBERTForRD(nn.Module):
    def __init__(self,
                 pretrained_name,
                 out_dim,
                 *sbert_args,
                 freeze_sbert=True,
                 criterion=None,
                 **sbert_kwargs):
        '''
        To use this model, you will need to first run "pip install sentence-transformers".
        Should be used in conjunction with the WantWordsDataset class, i.e.:
            >>> model = SentenceBERTForRD(...)
            >>> dataset = WantWordsDataset(definitions, embeddings, model.tokenizer)

        pretrained_name: Name of the pretrained SentenceBERT variant to be used
        out_dim: Size of the output vocabulary
        freeze_sbert: Can optionally freeze the SentenceBERT model and train only the output MLP
        criterion: (optional) Must be one of CrossEntropyLoss, MSELoss, and CosineSimilarity
        '''
        super(SentenceBERTForRD, self).__init__()
        self.sbert = SentenceTransformer(pretrained_name, *sbert_args, **sbert_kwargs)
        self.pretrained_name = pretrained_name
        self.freeze_sbert = freeze_sbert
        if freeze_sbert:
            for param in self.sbert.parameters():
                param.requires_grad = False

        hidden_dim = self.sbert.get_sentence_embedding_dimension()

        # Simple MLP decoder --> modeled off of the BERT MLM head
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, out_dim),
        )

        self.criterion = criterion
        self.classification = None
        if criterion is not None:
            if isinstance(criterion, nn.CrossEntropyLoss):
                self.classification = True
            elif isinstance(criterion, (nn.MSELoss, nn.CosineSimilarity)):
                self.classification = False
            else:
                raise Exception("Criterion must be one of CrossEntropyLoss, MSELoss, or CosineSimilarity")

        # init weights of the linear layers
        for layer in self.decoder.modules():
            if isinstance(layer, nn.Linear):
                nn.init.normal_(layer.weight, mean=0.0, std=0.02)
                nn.init.zeros_(layer.bias)

    def unfreeze(self):
        for param in self.sbert.parameters():
            param.requires_grad = True

    def forward(self, input_ids, attention_mask, ground_truth=None):
        # embed: (batch, hidden_dim)
        embed = self.sbert({
            'input_ids': input_ids,
            'attention_mask': attention_mask
        })['sentence_embedding']

        # out: (batch, out_dim)
        # distribution over the output vocabulary
        out = self.decoder(embed)

        if self.criterion is not None and ground_truth is not None:
            loss = self.criterion(out, ground_truth)
            return loss, out

        return out
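# Hypothetical usage sketch (not part of the original snippet): a single forward pass
# through the frozen SentenceBERT encoder and the MLP decoder. The model name, the
# 10000-word output vocabulary, and the use of sbert.tokenizer are illustrative assumptions.
rd_model = SentenceBERTForRD('bert-base-nli-mean-tokens', out_dim=10000)
batch = rd_model.sbert.tokenizer(["a place where planes take off and land"],
                                 return_tensors='pt', padding=True)
logits = rd_model(batch['input_ids'], batch['attention_mask'])  # shape: (1, 10000)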
def build_vectors(st_output_path: str,
                  hf_dataset: str,
                  aspect: str,
                  fold: Union[int, str],
                  include_all_docs: bool = False,
                  override: bool = False):
    """
    :param st_output_path: Path to Sentence Transformer model
    :param hf_dataset: Huggingface dataset path or name
    :param aspect:
    :param fold:
    :param include_all_docs: Generate also vectors for samples from training data
    :param override:
    :return:
    """
    max_token_length = 336  # see pwc_token_stats.ipynb
    nlp_cache_dir = './data/nlp_cache'

    out_fn = 'pwc_id2vec__all_docs.w2v.txt' if include_all_docs else 'pwc_id2vec.w2v.txt'
    out_fp = os.path.join(st_output_path, out_fn)

    if not os.path.exists(st_output_path):
        logger.error(f'Sentence Transformer directory does not exist: {st_output_path}')
        return

    if os.path.exists(out_fp) and not override:
        logger.error(f'Output path exists already and override is disabled: {out_fp}')
        return

    # Inference for best model
    best_model = SentenceTransformer(st_output_path)
    best_model.get_sentence_embedding_dimension()

    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')

    test_sds = DocumentPairSentencesDataset(docs_ds, test_ds, best_model)

    if include_all_docs:
        # use all document ids
        input_paper_ids = set(docs_ds['paper_id'])
        logger.info(f'All documents in corpus: {len(input_paper_ids):,}')
    else:
        # generate vectors from unique test documents only
        input_paper_ids = set(test_ds['from_paper_id']).union(set(test_ds['to_paper_id']))

    with open(out_fp, 'w') as f:
        # header
        f.write(f'{len(input_paper_ids)} {best_model.get_sentence_embedding_dimension()}\n')

        # body
        for paper_id in tqdm(input_paper_ids, desc='Inference'):
            vec = [str(v) for v in best_model.encode(test_sds.get_text_from_doc(paper_id),
                                                     show_progress_bar=False)]
            assert len(vec) == best_model.get_sentence_embedding_dimension()

            vec_str = ' '.join(vec)
            line = f'{paper_id} {vec_str}\n'
            f.write(line)
            # break

    logger.info(f'Encoded {len(input_paper_ids):,} into {out_fp}')