Example #1
def load_linear_maps():
    # Load pretrained linear maps: two Zmaps (SBERT -> w2v) and a Wmap trained on labeled AG News examples.
    zmap_basic = torch_load(fewshot_filename(DATADIR + "/maps", "Zmap_20k_w2v_words_alpha0.pt"))
    zmap_optimized = torch_load(fewshot_filename(DATADIR + "/maps", "Zmap_20k_w2v_words_alpha10_news.pt"))
    wmap = torch_load(fewshot_filename(DATADIR + "/maps", "Wmap_agnews_lr0.1_lam500_500expercat.pt"))

    MAPPINGS = {
        'Zmap (standard)': zmap_basic,
        'Zmap (optimized for AG News)': zmap_optimized,
        'Wmap (trained on 2000 AG News examples)': wmap,
    }
    return MAPPINGS
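
The dictionary returned above can be looped over to compare the three mappings on the same dataset. The sketch below is illustrative only: it assumes a `dataset` object is already loaded (e.g. via `load_or_cache_data`, as in the next example) and that each map can be passed on its own through `linear_maps`; in practice a Wmap is typically applied in combination with a Zmap.

mappings = load_linear_maps()
for name, linear_map in mappings.items():
    # Hypothetical usage: score the dataset with a single linear map applied.
    score, _ = predict_and_score(dataset, linear_maps=[linear_map], return_predictions=True)
    print(f"{name}: {score:.2f}")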
Example #2
    def test_on_the_fly(self):
        # Test should only be run if the necessary files already exist.
        self._assert_files_exist()

        # Load dataset
        dataset = load_or_cache_data(DATADIR, DATASET_NAME)

        # Load w2v embeddings
        w2v_model = load_word_vector_model(small=True, cache_dir=W2VDIR)
        vocab_w2v_embeddings, vocab = get_topk_w2v_vectors(w2v_model, k=VOCAB_SIZE)
        vocab_w2v_embeddings = to_tensor(vocab_w2v_embeddings)

        # Load SBERT embeddings
        vocab_sbert_filename = fewshot_filename(
            W2VDIR, f"sbert_embeddings_for_{VOCAB_SIZE}_words.pt"
        )
        cached_data = torch_load(vocab_sbert_filename)
        vocab_sbert_embeddings = cached_data["embeddings"]

        # Fit the linear map of best fit between the SBERT and w2v embedding spaces.
        Zmap = OLS_with_l2_regularization(
            vocab_sbert_embeddings, vocab_w2v_embeddings
        )

        # Predict and score
        score, predictions = predict_and_score(dataset, linear_maps=[Zmap], return_predictions=True)
        score3 = simple_topk_accuracy(dataset.labels, predictions)

        self.assertAlmostEqual(score, 65.5657894736842)
        self.assertAlmostEqual(score3, 96.01315789473685)
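
For intuition about what `OLS_with_l2_regularization` computes: it fits a linear map W that best sends the SBERT vocabulary embeddings onto their w2v counterparts, i.e. ordinary least squares with an optional L2 (ridge) penalty. The snippet below is a minimal, self-contained sketch of that closed-form solution in PyTorch; it is not the library's implementation, and `alpha` is an assumed name for the regularization strength.

import torch

def ridge_fit(X, Y, alpha=0.0):
    # Solve (X^T X + alpha * I) W = X^T Y, the closed form of
    # argmin_W ||X W - Y||^2 + alpha * ||W||^2.
    d = X.shape[1]
    A = X.T @ X + alpha * torch.eye(d, dtype=X.dtype)
    B = X.T @ Y
    return torch.linalg.solve(A, B)

# e.g. Zmap_sketch = ridge_fit(vocab_sbert_embeddings, vocab_w2v_embeddings, alpha=0.0)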
Example #3
import os

# We found that a vocabulary size of 20,000 words works well for most applications.
VOCAB_SIZE = 20000

vocab_w2v_embeddings, vocab = get_topk_w2v_vectors(w2v_model, k=VOCAB_SIZE)
vocab_w2v_embeddings = to_tensor(vocab_w2v_embeddings)

# Passing 20k words through SBERT can be time-consuming, even with a GPU.
# Fortunately, we've already performed this step and include precomputed embeddings.
vocab_sbert_filename = fewshot_filename(
    W2VDIR, f"sbert_embeddings_for_{VOCAB_SIZE}_words.pt")

if os.path.exists(vocab_sbert_filename):
    cached_data = torch_load(vocab_sbert_filename)
    vocab_sbert_embeddings = cached_data["embeddings"]
else:
    model, tokenizer = load_transformer_model_and_tokenizer()
    vocab_sbert_embeddings = get_transformer_embeddings(
        vocab, model, tokenizer, output_filename=vocab_sbert_filename)

# Perform least-squares linear regression with L2 regularization to learn Zmap
Zmap = OLS_with_l2_regularization(vocab_sbert_embeddings, vocab_w2v_embeddings)

score, predictions = predict_and_score(dataset,
                                       linear_maps=[Zmap],
                                       return_predictions=True)
score3 = simple_topk_accuracy(dataset.labels, predictions)
print(
    f"Score using projection matrix with top {VOCAB_SIZE} w2v words: {score}")