def load_linear_maps():
    zmap_basic = torch_load(
        fewshot_filename(DATADIR + "/maps", "Zmap_20k_w2v_words_alpha0.pt")
    )
    zmap_optimized = torch_load(
        fewshot_filename(DATADIR + "/maps", "Zmap_20k_w2v_words_alpha10_news.pt")
    )
    wmap = torch_load(
        fewshot_filename(DATADIR + "/maps", "Wmap_agnews_lr0.1_lam500_500expercat.pt")
    )
    MAPPINGS = {
        "Zmap (standard)": zmap_basic,
        "Zmap (optimized for AG News)": zmap_optimized,
        "Wmap (trained on 2000 AG News examples)": wmap,
    }
    return MAPPINGS
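# A minimal usage sketch (illustrative only): loop over the returned maps and score each
# one with the predict_and_score helper used elsewhere in this file. Whether a given map
# should be passed alone or chained with others depends on how it was trained, so treat
# this loop as a template rather than the canonical evaluation procedure.
def compare_linear_maps(dataset):
    for name, linear_map in load_linear_maps().items():
        score, _ = predict_and_score(
            dataset, linear_maps=[linear_map], return_predictions=True
        )
        print(f"{name}: {score:.2f}")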
def test_on_the_fly(self):
    # Test should only be run if the necessary files already exist.
    self._assert_files_exist()

    # Load dataset
    dataset = load_or_cache_data(DATADIR, DATASET_NAME)

    # Load w2v embeddings
    w2v_model = load_word_vector_model(small=True, cache_dir=W2VDIR)
    vocab_w2v_embeddings, vocab = get_topk_w2v_vectors(w2v_model, k=VOCAB_SIZE)
    vocab_w2v_embeddings = to_tensor(vocab_w2v_embeddings)

    # Load SBERT embeddings
    vocab_sbert_filename = fewshot_filename(
        W2VDIR, f"sbert_embeddings_for_{VOCAB_SIZE}_words.pt"
    )
    cached_data = torch_load(vocab_sbert_filename)
    vocab_sbert_embeddings = cached_data["embeddings"]

    # Calculate the linear map of best fit between the SBERT and w2v embedding spaces.
    Zmap = OLS_with_l2_regularization(
        vocab_sbert_embeddings, vocab_w2v_embeddings
    )

    # Predict and score
    score, predictions = predict_and_score(
        dataset, linear_maps=[Zmap], return_predictions=True
    )
    score3 = simple_topk_accuracy(dataset.labels, predictions)
    self.assertAlmostEqual(score, 65.5657894736842)
    self.assertAlmostEqual(score3, 96.01315789473685)
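# For reference, simple_topk_accuracy is expected to measure how often the true label
# appears among each example's top-ranked candidates, which is why score3 is much higher
# than the strict top-1 score above. A hypothetical re-implementation (the real signature
# and prediction structure may differ) could look like:
def _topk_accuracy_sketch(labels, predictions, k=3):
    # Each prediction is assumed to expose a `closest` ranking of candidate label ids,
    # ordered by similarity; this attribute name is an assumption for illustration.
    hits = sum(1 for label, pred in zip(labels, predictions) if label in pred.closest[:k])
    return 100 * hits / len(labels)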
VOCAB_SIZE = 20000
# We found that a vocabulary size of 20,000 words works well for most applications.
vocab_w2v_embeddings, vocab = get_topk_w2v_vectors(w2v_model, k=VOCAB_SIZE)
vocab_w2v_embeddings = to_tensor(vocab_w2v_embeddings)

# Passing 20k words through SBERT can be time-consuming, even with a GPU.
# Fortunately, we've already performed this step and include precomputed embeddings.
vocab_sbert_filename = fewshot_filename(
    W2VDIR, f"sbert_embeddings_for_{VOCAB_SIZE}_words.pt"
)
if os.path.exists(vocab_sbert_filename):
    cached_data = torch_load(vocab_sbert_filename)
    vocab_sbert_embeddings = cached_data["embeddings"]
else:
    model, tokenizer = load_transformer_model_and_tokenizer()
    vocab_sbert_embeddings = get_transformer_embeddings(
        vocab, model, tokenizer, output_filename=vocab_sbert_filename
    )

# Perform ordinary least-squares regression (with l2 regularization) to learn Zmap
Zmap = OLS_with_l2_regularization(vocab_sbert_embeddings, vocab_w2v_embeddings)

score, predictions = predict_and_score(
    dataset, linear_maps=[Zmap], return_predictions=True
)
score3 = simple_topk_accuracy(dataset.labels, predictions)
print(f"Score using projection matrix with top {VOCAB_SIZE} w2v words: {score}")
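# For intuition, the Zmap learned above is a regularized least-squares fit from the SBERT
# space to the w2v space. Below is a self-contained sketch of that closed-form solution;
# the function name, the `alpha` parameter, and the exact regularization used inside
# OLS_with_l2_regularization are assumptions for illustration, not the library's code.
import torch

def ridge_fit_sketch(X, Y, alpha=0.0):
    # Solve (X^T X + alpha * I) Z = X^T Y for Z, so that X @ Z approximates Y.
    # X: (n_words, sbert_dim), Y: (n_words, w2v_dim), returned Z: (sbert_dim, w2v_dim).
    d = X.shape[1]
    A = X.T @ X + alpha * torch.eye(d)
    B = X.T @ Y
    return torch.linalg.solve(A, B)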