Example #1
    def test_on_the_fly(self):
        # Test should only be run if the necessary files already exist.
        self._assert_files_exist()

        # Load dataset
        dataset = load_or_cache_data(DATADIR, DATASET_NAME)

        # Load w2v embeddings
        w2v_model = load_word_vector_model(small=True, cache_dir=W2VDIR)
        vocab_w2v_embeddings, vocab = get_topk_w2v_vectors(w2v_model, k=VOCAB_SIZE)
        vocab_w2v_embeddings = to_tensor(vocab_w2v_embeddings)

        # Load SBERT embeddings
        vocab_sbert_filename = fewshot_filename(
            W2VDIR, f"sbert_embeddings_for_{VOCAB_SIZE}_words.pt"
        )
        cached_data = torch_load(vocab_sbert_filename)
        vocab_sbert_embeddings = cached_data["embeddings"]

        # Calculate the linear map of best fit between the SBERT and w2v embedding spaces.
        Zmap = OLS_with_l2_regularization(
            vocab_sbert_embeddings, vocab_w2v_embeddings
        )

        # Predict and score
        score, predictions = predict_and_score(
            dataset, linear_maps=[Zmap], return_predictions=True
        )
        score3 = simple_topk_accuracy(dataset.labels, predictions)

        self.assertAlmostEqual(score, 65.5657894736842)
        self.assertAlmostEqual(score3, 96.01315789473685)
Example #2
def expand_labels(dataset: Dataset):
    """ 
    When performing supervised learning (e.g. few-shot), we will need a label embedding for 
    each example in the dataset. Most datasets only have a handful of labels (4-10).
    Passing these repeatedly through SBERT for each example is slow, repetitive and
    unnecessarily expensive. 

    Instead we'll restructure the dataset attributes. Originally instantiated, each label 
    has already been passed through SBERT and is stored in dataset.embeddings 
    as the last N items in the list. These are used to build out a full label embedding tensor.
    Additionally, dataset.embeddings is repurposed to contain ONLY example embeddings 
    rather than example AND label embeddings
    """

    num_labels = len(dataset.categories)
    label_embeddings = to_list(dataset.embeddings[-num_labels:])

    dataset.label_embeddings = to_tensor(
        [label_embeddings[label] for label in dataset.labels])
    # Repurpose dataset.embeddings to hold only the example embeddings.
    dataset.embeddings = dataset.embeddings[:-num_labels]
    return dataset
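
# Usage sketch (hypothetical; assumes the DATADIR / DATASET_NAME constants and the
# load_or_cache_data helper shown in the test above):
#
#   dataset = load_or_cache_data(DATADIR, DATASET_NAME)
#   dataset = expand_labels(dataset)
#   # dataset.label_embeddings now holds one label embedding per example, and
#   # dataset.embeddings holds only the example embeddings.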
Example #3
# To learn Zmap we:
# -- generate w2v embeddings for each word in a vocabulary V,
# -- generate SBERT embeddings for each word in V,
# -- perform linear regression between the SBERT and w2v embeddings.
# The result will be a matrix, Zmap, which we can use to transform SBERT embeddings
# and then perform classification with cosine similarity as before.
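
# Aside (illustrative sketch only, not the library's implementation): the
# regression step is performed below by OLS_with_l2_regularization, whose
# internals aren't shown in this snippet. A ridge-regularized least-squares fit
# of W minimizing ||X @ W - Y||^2 + alpha * ||W||^2 has the closed form
# W = (X^T X + alpha * I)^-1 (X^T Y). A minimal, hypothetical helper:
import torch

def _ridge_fit_sketch(X, Y, alpha=1.0):
    """Closed-form ridge regression mapping rows of X onto rows of Y."""
    d = X.shape[1]
    A = X.T @ X + alpha * torch.eye(d, dtype=X.dtype)
    return torch.linalg.solve(A, X.T @ Y)

# With X = the SBERT vocab embeddings and Y = the w2v vocab embeddings, the
# returned matrix plays the role of Zmap: new SBERT vectors are projected into
# w2v space by right-multiplying with it.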

# Load the w2v embedding model
w2v_model = load_word_vector_model(small=True, cache_dir=W2VDIR)
VOCAB_SIZE = 20000

# We found that using a vocabulary size of 20,000 words is good for most applications
vocab_w2v_embeddings, vocab = get_topk_w2v_vectors(w2v_model, k=VOCAB_SIZE)
vocab_w2v_embeddings = to_tensor(vocab_w2v_embeddings)

# Passing 20k words through SBERT can be time-consuming, even with a GPU.
# Fortunately, we've already performed this step and include precomputed embeddings.
vocab_sbert_filename = fewshot_filename(
    W2VDIR, f"sbert_embeddings_for_{VOCAB_SIZE}_words.pt")

if os.path.exists(vocab_sbert_filename):
    cached_data = torch_load(vocab_sbert_filename)
    vocab_sbert_embeddings = cached_data["embeddings"]
else:
    model, tokenizer = load_transformer_model_and_tokenizer()
    vocab_sbert_embeddings = get_transformer_embeddings(
        vocab, model, tokenizer, output_filename=vocab_sbert_filename)

# Perform ordinary least-squares linear regression (with l2 regularization) to learn Zmap
Zmap = OLS_with_l2_regularization(
    vocab_sbert_embeddings, vocab_w2v_embeddings)
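
# Usage sketch (hypothetical): with Zmap learned, classification proceeds as in
# test_on_the_fly above, assuming a `dataset` loaded earlier, e.g. via
# load_or_cache_data(DATADIR, DATASET_NAME):
#
#   score, predictions = predict_and_score(
#       dataset, linear_maps=[Zmap], return_predictions=True)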