def test_on_the_fly(self):
    """Check scores obtained when the SBERT-to-w2v map is fit inside the test.

    The map is computed with ``OLS_with_l2_regularization`` from the cached
    SBERT vocabulary embeddings and the w2v vocabulary embeddings, then used
    as the single linear map in ``predict_and_score``. Both resulting metrics
    are compared against previously recorded reference values.
    """
    # This test is only meant to run when the necessary files already exist.
    self._assert_files_exist()

    data = load_or_cache_data(DATADIR, DATASET_NAME)

    # Word2vec side: vectors for the VOCAB_SIZE words selected by the helper.
    word_vectors = load_word_vector_model(small=True, cache_dir=W2VDIR)
    w2v_matrix, _vocab = get_topk_w2v_vectors(word_vectors, k=VOCAB_SIZE)
    w2v_matrix = to_tensor(w2v_matrix)

    # SBERT side: read the embeddings stored under the "embeddings" key of
    # the cached file.
    sbert_path = fewshot_filename(
        W2VDIR, f"sbert_embeddings_for_{VOCAB_SIZE}_words.pt"
    )
    sbert_matrix = torch_load(sbert_path)["embeddings"]

    # Linear map of best fit from the SBERT embeddings to the w2v embeddings.
    zmap = OLS_with_l2_regularization(sbert_matrix, w2v_matrix)

    # Predict, then score twice: once via predict_and_score and once via
    # simple_topk_accuracy on the returned predictions.
    primary_score, predicted = predict_and_score(
        data, linear_maps=[zmap], return_predictions=True
    )
    topk_score = simple_topk_accuracy(data.labels, predicted)

    self.assertAlmostEqual(primary_score, 65.5657894736842)
    self.assertAlmostEqual(topk_score, 96.01315789473685)
def expand_labels(dataset: Dataset):
    """Attach a per-example label-embedding tensor to ``dataset``.

    Supervised (e.g. few-shot) training needs a label embedding for every
    example. Datasets usually have only a handful of distinct labels, so
    re-embedding the label text for each example would be wasteful. Instead,
    the embeddings of the distinct labels are read from the tail of
    ``dataset.embeddings`` (the last ``len(dataset.categories)`` entries) and
    looked up once per entry of ``dataset.labels``.

    The result is stored in ``dataset.label_embeddings``.
    ``dataset.embeddings`` itself is left unchanged, so it still ends with
    the label embeddings after this call.

    Args:
        dataset: object exposing ``categories``, ``embeddings`` and
            ``labels``. Each label is used as an index into the trailing
            label embeddings (presumably an integer class id -- confirm
            against the dataset loader).

    Returns:
        The same ``dataset`` object, mutated in place.
    """
    category_count = len(dataset.categories)
    per_category = to_list(dataset.embeddings[-category_count:])

    # One embedding per example, in the same order as dataset.labels.
    expanded = []
    for label in dataset.labels:
        expanded.append(per_category[label])
    dataset.label_embeddings = to_tensor(expanded)

    # NOTE: trimming the label rows off dataset.embeddings is intentionally
    # not done here; callers that need example-only embeddings must slice.
    return dataset
# -- generate w2v embeddings for each word in V,
# -- generate SBERT embeddings for each word in V,
# -- perform linear regression between the SBERT and w2v embeddings
# The result will be a matrix, Zmap, which we can use to transform SBERT embeddings
# and then perform classification with cosine similarity as before

# Load the w2v embedding model
w2v_model = load_word_vector_model(small=True, cache_dir=W2VDIR)

# We found that using a vocabulary size of 20,000 words is good for most applications
VOCAB_SIZE = 20000
vocab_w2v_embeddings, vocab = get_topk_w2v_vectors(w2v_model, k=VOCAB_SIZE)
vocab_w2v_embeddings = to_tensor(vocab_w2v_embeddings)

# Passing 20k words through SBERT can be time-consuming, even with a GPU.
# Fortunately, we've already performed this step and include precomputed embeddings.
vocab_sbert_filename = fewshot_filename(
    W2VDIR, f"sbert_embeddings_for_{VOCAB_SIZE}_words.pt"
)
if os.path.exists(vocab_sbert_filename):
    # Reuse the cached embeddings stored under the "embeddings" key.
    cached_data = torch_load(vocab_sbert_filename)
    vocab_sbert_embeddings = cached_data["embeddings"]
else:
    # No cache on disk: embed the vocabulary now, passing the cache path as
    # output_filename to the embedding helper.
    model, tokenizer = load_transformer_model_and_tokenizer()
    vocab_sbert_embeddings = get_transformer_embeddings(
        vocab, model, tokenizer, output_filename=vocab_sbert_filename
    )

# Perform ordinary least-squares linear regression to learn Zmap