def discovery(embedding, vocab, chord_a, chord_b, chord_c, num_output=10):
    """Rank vocabulary tokens against the vector ``chord_b - chord_a + chord_c``.

    Every entry of the vocabulary's index-to-token mapping is scored,
    including the three query chords themselves.

    Args:
        embedding: Module exposing a 2-D ``weight`` tensor with one row per
            vocabulary index.
        vocab: Vocabulary providing ``get_token_index`` and
            ``get_index_to_token_vocabulary``.
        chord_a: Token whose vector is subtracted.
        chord_b: Token whose vector is the starting point.
        chord_c: Token whose vector is added.
        num_output: Number of best-scoring tokens to return.

    Returns:
        A list of ``(token, cosine_similarity)`` pairs ordered from most to
        least similar, as produced by ``Counter.most_common``.
    """
    weights = embedding.weight
    a_id = vocab.get_token_index(chord_a)
    b_id = vocab.get_token_index(chord_b)
    c_id = vocab.get_token_index(chord_c)

    index_to_token = vocab.get_index_to_token_vocabulary()
    indices = list(index_to_token)

    # This is pure inference: no_grad stops autograd from recording a graph
    # on the embedding parameter for every similarity that is computed.
    with torch.no_grad():
        target = weights[b_id] - weights[a_id] + weights[c_id]
        rows = weights.index_select(
            0, torch.tensor(indices, dtype=torch.long, device=weights.device)
        )
        # A single batched call scores every row at once instead of issuing
        # one kernel launch and one .item() sync per vocabulary entry.
        cosine = CosineSimilarity(dim=1)
        scores = cosine(target.unsqueeze(0).expand_as(rows), rows).tolist()

    sims = Counter()
    for index, score in zip(indices, scores):
        sims[index_to_token[index]] = score
    return sims.most_common(num_output)


vocab = Vocabulary().from_files("data/vocabulary")
EMBEDDING_DIM = 128
token_embedding = Embedding(
    num_embeddings=vocab.get_vocab_size("tokens"),
    embedding_dim=EMBEDDING_DIM,
)
# map_location="cpu" lets the checkpoint load regardless of the device it was
# saved from.
token_embedding.load_state_dict(
    torch.load("saved_models/word2vec.th", map_location="cpu")
)
print(discovery(token_embedding, vocab, "C", "G", "G"))
except Exception as e: print(e) print(label) marker_list = [".", ".", ".", "v", "p", "D", "x", "*", "s"] return color_list[key_number], marker_list[form_index] vocab = Vocabulary().from_files("data/vocabulary") EMBEDDING_DIM = 128 token_embedding = Embedding(num_embeddings=vocab.get_vocab_size("tokens"), embedding_dim=EMBEDDING_DIM) token_embedding.load_state_dict(torch.load("saved_models/word2vec.th")) token_ids = torch.tensor([x for x in range(2, vocab.get_vocab_size())], dtype=torch.long) if torch.cuda.is_available(): cuda_device = 0 token_embedding = token_embedding.cuda(cuda_device) token_ids = token_ids.cuda(cuda_device) else: cuda_device = -1 token_embedding.eval() with torch.no_grad(): embeddings = token_embedding(token_ids).cpu().numpy()