def put_embeddings(self, rSubmission: RedditSubmission):
    vecs = Magnitude('word2vec/light/GoogleNews-vectors-negative300')
    rSubmission.post_title_embedding = np.mean(
        vecs.query(rSubmission.post_title.split()), axis=0)
    if len(rSubmission.post_text) > 0:
        rSubmission.post_text_embedding = np.mean(
            vecs.query(rSubmission.post_text.split()), axis=0)
    return rSubmission
def put_embeddings(self, rSubmission: RedditSubmission):
    vecs = Magnitude('word2vec/light/GoogleNews-vectors-negative300')
    # vecs = Magnitude('http://magnitude.plasticity.ai/word2vec/light/GoogleNews-vectors-negative300.magnitude')
    rSubmission.post_title_embedding = np.mean(
        vecs.query(rSubmission.post_title.split()), axis=0)
    if len(rSubmission.post_text) > 0:
        rSubmission.post_text_embedding = np.mean(
            vecs.query(rSubmission.post_text.split()), axis=0)
    logger.info('Embedded submission: %s', rSubmission.post_title)
    return rSubmission
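# A minimal standalone sketch of the same mean-pooling idea used above,
# without the Reddit-specific class. PATH_TO_VECTORS is a placeholder for any
# local .magnitude file; the function name is illustrative, not from a snippet.
import numpy as np
from pymagnitude import Magnitude

PATH_TO_VECTORS = 'vectors/GoogleNews-vectors-negative300.magnitude'  # assumed path


def mean_pooled_embedding(text: str) -> np.ndarray:
    vecs = Magnitude(PATH_TO_VECTORS)
    # query() on a list of tokens returns a (num_tokens, dim) matrix;
    # averaging over axis 0 yields one fixed-size sentence vector.
    return np.mean(vecs.query(text.split()), axis=0)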
def create_vocab_tensors(input_vocab_index):
    """Creates a matrix of the GloVe embeddings for terms contained in the
    model to improve runtime. Also used in ESIM."""
    print('Creating vocabulary tensors...')
    # Define GloVe model from Magnitude package
    model = Magnitude(config.glove_magnitude_path)
    np.random.seed(config.SEED)
    # Randomly initialize matrix
    vocab_tensors = np.random.normal(
        0, 1, (input_vocab_index.n_words, model.dim)).astype('float32')
    vocab_words = list(input_vocab_index.word2index.keys())
    unk_words = []
    # Get vector for each word in vocabulary if in model
    for idx, word in enumerate(vocab_words):
        if word in model:
            vocab_tensors[idx] = model.query(word)
        else:
            unk_words.append(word)
    # Override special tokens
    special_tokens = ['SOS', 'EOS', 'UNK']
    vocab_tensors[:len(special_tokens), :] = np.random.uniform(
        -0.1, 0.1, (len(special_tokens), model.dim)).astype('float32')
    print('Tensor vocabulary complete.')
    print('  Total vocabulary size {}, {} UNK words ({:.2f}%)'.format(
        len(vocab_words), len(unk_words),
        (len(unk_words) / len(vocab_words)) * 100))
    return torch.tensor(vocab_tensors, dtype=torch.float64), unk_words
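# Hedged usage sketch: the tensor returned above can seed a PyTorch embedding
# layer. `input_vocab_index` is whatever vocabulary object the function
# expects; the cast to float32 matches nn.Embedding's default dtype.
import torch.nn as nn

weights, unk_words = create_vocab_tensors(input_vocab_index)
embedding_layer = nn.Embedding.from_pretrained(weights.float(), freeze=False)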
class GloveEncoder():
    """Encodes an input sentence as a mean or max pooled sentence embedding
    given the individual word embeddings"""

    def __init__(self, pooling='mean'):
        self.name = 'GloveEncoder'
        self.trainable_model = False
        self.pooling = pooling
        self.model = Magnitude(config.glove_magnitude_path)
        self.hidden_size = self.model.dim

    def sentence_embedding(self, input_text):
        words_in_model = [
            word for word in input_text.split() if word in self.model
        ]
        sentence_embedding = np.zeros((len(words_in_model), self.model.dim))
        sentence_embedding.fill(np.nan)
        for idx, token in enumerate(words_in_model):
            sentence_embedding[idx] = self.model.query(token)
        if self.pooling == 'max':
            sentence_embedding = np.max(sentence_embedding, axis=0)
        else:
            sentence_embedding = np.mean(sentence_embedding, axis=0)
        return torch.tensor(sentence_embedding.reshape(1, 1, -1),
                            device=DEVICE)
class MagnitudeFactory(EmbeddingFactory):
    def __init__(self, embedding_type: EmbeddingType):
        super().__init__(embedding_type)
        cache_dir = Path(fs.get_project_root_dir()) / ".magnitude"
        fs.mkdir_if_not_exists(cache_dir)
        embed_file = self._embedding_type.url[
            self._embedding_type.url.rfind("/") + 1:]
        compressed_file = Path(cache_dir) / embed_file
        if not compressed_file.exists():
            logger.info(
                '  Downloading magnitude file ("{}")...'.format(embed_file))
            wget.download(self._embedding_type.url, compressed_file)
        self._embed_file = compressed_file
        logger.info('  Loading Magnitude module...')
        self._magnitude_vecs = Magnitude(self._embed_file)

    def build(self, vocab_list: List[str],
              **kwargs) -> (List[str], List[str], Dict[str, Dict[str, Any]]):
        oov, iov = [], []
        vec_dict = {}
        for w in vocab_list:
            is_oov = w not in self._magnitude_vecs
            vec = self._magnitude_vecs.query(w)
            vec_dict[w] = {"vec": vec, "trainable": is_oov}
            if is_oov:
                oov.append(w)
            else:
                iov.append(w)
        return oov, iov, vec_dict
class MagnitudeFactory(EmbeddingFactory):
    def __init__(self, embedding_type: EmbeddingType):
        super().__init__(embedding_type)
        cache_dir = Path(fs.get_project_root_dir()) / ".magnitude"
        fs.mkdir_if_not_exists(cache_dir)
        embed_file = self._embedding_type.url[
            self._embedding_type.url.rfind("/") + 1:]
        compressed_file = Path(cache_dir) / embed_file
        if not compressed_file.exists():
            logger.info(
                '  Downloading magnitude file ("{}")...'.format(embed_file))
            wget.download(self._embedding_type.url, compressed_file)
        self._embed_file = compressed_file
        logger.info('  Loading Magnitude module...')
        self._magnitude_vecs = Magnitude(self._embed_file)

    def build(self, vocab_list: List[str], h5_file: Path,
              **kwargs) -> (List[str], List[str]):
        oov, iov = [], []
        with h5py.File(h5_file, mode="w") as vec_h5:
            for w in vocab_list:
                is_oov = w not in self._magnitude_vecs
                vec = self._magnitude_vecs.query(w)
                vec_h5.create_dataset("{key}/vec".format(key=w), data=vec)
                vec_h5.create_dataset("{key}/trainable".format(key=w),
                                      data=1 if is_oov else 0)
                if is_oov:
                    oov.append(w)
                else:
                    iov.append(w)
        return oov, iov
def glove_via_magnitude(topn=500,
                        min_similarity=None,
                        filename='glove.6B.100d.magnitude',
                        lang='en_US'):
    from pymagnitude import Magnitude

    v = Magnitude(os.path.join(TOPDIR, filename))
    training_set = list()
    units = set()
    for unit_list in classifier.ambiguous_units():
        for unit in unit_list[1]:
            units.add(unit)
    for unit in units:
        print('Processing {}...'.format(unit.name))
        name = unit.name
        surfaces = {unit.name}  # a set containing the name, not its characters
        if isinstance(unit, classes.Unit):
            surfaces.update(unit.surfaces)
            surfaces.update(unit.symbols)
        for surface in surfaces:
            neighbours = v.most_similar(v.query(surface),
                                        topn=topn,
                                        min_similarity=min_similarity)
            training_set.append({
                'unit': name,
                'text': ' '.join(neighbour[0] for neighbour in neighbours)
            })
    print('Done')
    with language.topdir(lang).joinpath('train/similars.json').open(
            'w', encoding='utf-8') as file:
        json.dump(training_set, file, sort_keys=True, indent=4)
def get_word_vector(word):
    global model
    if model is None:
        # import fasttext
        # if os.environ.get('LANGUAGE', 'en').lower() == 'en':
        #     print('Loading English word vectors')
        #     model = fasttext.load_model('data/cc.en.300.bin')
        # else:
        #     print('Loading Vietnamese word vectors')
        #     model = fasttext.load_model('data/cc.vi.300.bin')
        # return model.get_word_vector(word.replace(' ', '_'))
        from pymagnitude import Magnitude
        if os.environ.get('LANGUAGE', 'en').lower() == 'en':
            print('Loading English word vectors')
            model = Magnitude('data/cc.en.300.magnitude',
                              language='en',
                              lazy_loading=20000)
        else:
            print('Loading Vietnamese word vectors')
            model = Magnitude('data/cc.vi.300.magnitude',
                              language='vi',
                              lazy_loading=20000)
        print('Loading completed')
    return model.query(word)
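# Hedged usage note for the loader above: the model choice is read from the
# LANGUAGE environment variable on the first call, then the cached global
# `model` is reused, so per-word lookups stay cheap afterwards. The variable
# assignment below is an assumption about how the caller configures it.
os.environ['LANGUAGE'] = 'en'          # assumed to be set before the first call
print(get_word_vector('cat').shape)    # (300,) for the cc.*.300 vectors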
class PyMagnitudeField(Field):
    def __init__(self,
                 magnitude_vector_filepath,
                 sequential=True,
                 lower=True,
                 tokenize=(lambda s: s.split()),
                 include_lengths=True,
                 batch_first=True,
                 **kwargs):
        # the Magnitude vectors numericalize batches directly, so a torchtext
        # vocab must never be built for this field
        kwargs['use_vocab'] = False
        self.vectors = Magnitude(magnitude_vector_filepath)
        super(PyMagnitudeField, self).__init__(sequential=sequential,
                                               lower=lower,
                                               tokenize=tokenize,
                                               include_lengths=include_lengths,
                                               batch_first=batch_first,
                                               **kwargs)

    def build_vocab(self, *args, **kwargs):
        pass

    def process(self, batch, device, train):
        if self.include_lengths:
            batch = (batch, [len(x) for x in batch])
        return self.numericalize(batch, device=device, train=train)

    def numericalize(self, arr, device=torch.device('cpu'), train=True):
        if self.include_lengths and not isinstance(arr, tuple):
            raise ValueError('Field has include_lengths set to True, but '
                             'input data is not a tuple of '
                             '(data batch, batch lengths).')
        if isinstance(arr, tuple):
            arr, lengths = arr
            lengths = torch.LongTensor(lengths)
        arr = torch.from_numpy(self.vectors.query(arr))
        if self.sequential and not self.batch_first:
            arr.t_()
        if device.type == 'cpu':
            if self.sequential:
                arr = arr.contiguous()
        else:
            arr = arr.cuda(device)
            if self.include_lengths:
                lengths = lengths.cuda(device)
        arr.requires_grad = False
        if self.include_lengths:
            return arr, lengths
        return arr
def build_vectors_file(vec_model: Magnitude,
                       words: List[str],
                       out_file="ENT.vec"):
    print(f"Building {out_file}")
    with open(out_file, "w") as out:
        out.write(f"{len(words)} {vec_model.dim}\n")
        for e in words:
            v = vec_model.query(e)
            str_vec = " ".join(map(str, v))  # ndarray to str
            line = f"{e} {str_vec}\n"
            out.write(line)
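# Hedged usage sketch: the file written above follows the word2vec text format
# (a "count dim" header line, then one "token v1 v2 ..." row per word), so it
# can be read back with gensim, for example:
from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format("ENT.vec")  # path from the default argument
print(kv["some_entity"][:5])                       # hypothetical key from `words`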
def test_embedtext_creation():
    extractor_cfg = {
        "_name": "embedtext",
        "index": "anserini",
        "tokenizer": "anserini",
        "embeddings": "glove6b",
        "zerounk": True,
        "calcidf": True,
        "maxqlen": MAXQLEN,
        "maxdoclen": MAXDOCLEN,
    }
    extractor = EmbedText(extractor_cfg)
    benchmark = DummyBenchmark({"_fold": "s1", "rundocsonly": False})
    collection = DummyCollection({"_name": "dummy"})
    index_cfg = {"_name": "anserini", "indexstops": False, "stemmer": "porter"}
    index = AnseriniIndex(index_cfg)
    index.modules["collection"] = collection
    tok_cfg = {"_name": "anserini", "keepstops": True, "stemmer": "none"}
    tokenizer = AnseriniTokenizer(tok_cfg)
    extractor.modules["index"] = index
    extractor.modules["tokenizer"] = tokenizer
    qids = list(benchmark.qrels.keys())  # ["301"]
    qid = qids[0]
    docids = list(benchmark.qrels[qid].keys())
    extractor.create(qids, docids, benchmark.topics[benchmark.query_type])
    expected_vocabs = [
        "lessdummy", "dummy", "doc", "hello", "greetings", "world", "from",
        "outer", "space", "<pad>"
    ]
    expected_stoi = {s: i for i, s in enumerate(expected_vocabs)}
    assert set(extractor.stoi.keys()) == set(expected_stoi.keys())
    emb_path = "glove/light/glove.6B.300d"
    fullemb = Magnitude(MagnitudeUtils.download_model(emb_path))
    assert extractor.embeddings.shape == (len(expected_vocabs), fullemb.dim)
    for i in range(extractor.embeddings.shape[0]):
        if i == extractor.pad:
            assert extractor.embeddings[i].sum() < 1e-5
            continue
        s = extractor.itos[i]
        assert (extractor.embeddings[i] - fullemb.query(s)).sum() < 1e-5
    return extractor
def read_magnitude_vectors(magnitude_filepath,
                           vocab_filepath,
                           vocab_size,
                           dim,
                           special_tokens=[UNK]):
    """Read word vectors from *.magnitude

    Args:
        magnitude_filepath (str): magnitude file path
        vocab_filepath (str): vocabulary file path
        vocab_size (int): Maximum vocab size (including special tokens)
        dim (int): Dimension of the word vectors to load
        special_tokens (list[str])

    Return:
        words (list[str]): list of length vocab_size
        embeddings (np.array): (vocab_size, dim)
    """
    logging.info('Loading word vectors from %s', magnitude_filepath)
    words = [x for x in special_tokens]
    word_set = set()
    with open(vocab_filepath, 'r', encoding='utf8') as fin:
        for line in fin:
            word = line.strip().split("\t")[0]
            if word in word_set:
                logging.warning(
                    "token must be unique. non-unique token='{}'".format(word))
            elif len(word) > 0:
                word_set.add(word)
                words.append(word)
            if len(words) == vocab_size:
                break
    magnitude = Magnitude(magnitude_filepath,
                          case_insensitive=True,
                          normalized=True)
    vectors = magnitude.query(words[len(special_tokens):])
    # special vectors for UNK
    special_vectors = np.random.normal(size=(len(special_tokens), dim))
    special_vectors /= np.linalg.norm(special_vectors,
                                      ord=2,
                                      axis=1,
                                      keepdims=True)
    # Concatenate
    vectors = np.vstack([special_vectors, vectors]).astype('float32')
    assert vectors.shape[0] == len(words)
    assert vectors.shape[1] == dim
    logging.info('Loaded %d word vectors; shape = %s', len(words),
                 str(vectors.shape))
    return words, vectors
def _build_matrix(self, tokenizer):
    vector = Magnitude('vectors/glove.6B.50d.magnitude')
    GLOVE_VECTOR_DIMENSION = 50
    MAX_NUM_WORDS = 300
    word_index = tokenizer.word_index
    num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
    embedding_matrix = np.zeros((num_words, GLOVE_VECTOR_DIMENSION))
    for word, i in tqdm(word_index.items()):
        if i > MAX_NUM_WORDS:
            continue
        # Magnitude never returns None: out-of-vocabulary words get a
        # deterministic random vector, so every row here is filled.
        embedding_matrix[i] = vector.query(word)
    return embedding_matrix
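# Hedged usage sketch: the matrix built above is shaped for a Keras Embedding
# layer; `tokenizer` is assumed to be a fitted keras Tokenizer and
# `embedding_matrix` the return value of _build_matrix().
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            trainable=False)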
class WordVectorFeature(FeatureExtractor):
    def __init__(self, vectors_path: str, scaling: float = 1.0) -> None:
        self.vectors = Magnitude(vectors_path, normalized=False)
        self.scaling = scaling

    def extract(
        self,
        token: str,
        current_idx: int,
        relative_idx: int,
        tokens: Sequence[str],
        features: Dict[str, float],
    ) -> None:
        vector = self.vectors.query(token)
        keys = ["v" + str(i) for i in range(len(vector))]
        values = vector * self.scaling
        features.update(zip(keys, values))
def avg_feature_vector(sentence, num_features=300):
    """
    Generates Word2Vec embeddings for a text.

    :param sentence: text to generate embeddings for.
    :param num_features: feature vector length
    :return: embeddings feature vector
    """
    vectors = Magnitude('models/GoogleNews-vectors-negative300.magnitude')
    words = sentence.split()
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        # sum the word vectors; the average is taken below
        feature_vec = np.add(feature_vec, vectors.query(word))
        n_words += 1
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec
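# Hedged usage sketch: two texts can be compared by the cosine similarity of
# their averaged vectors; scipy is an assumption of this example, not of the
# snippet above.
from scipy.spatial.distance import cosine

v1 = avg_feature_vector('the cat sat on the mat')
v2 = avg_feature_vector('a kitten rests on a rug')
print(1 - cosine(v1, v2))  # higher means more similar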
class WordVectorFeature(FeatureExtractor):
    def __init__(self, vectors_path: str, scaling: float = 1.0) -> None:
        self.scaling = scaling
        self.vectors = Magnitude(vectors_path, normalized=False)
        self.keys = ['v' + repr(i) for i in range(self.vectors.dim)]

    def extract(
        self,
        token: str,
        current_idx: int,
        relative_idx: int,
        tokens: Sequence[str],
        features: Dict[str, float],
        idx: int,
    ) -> None:
        if relative_idx == 0:
            v = self.scaling * self.vectors.query(token)
            features.update(zip(self.keys, v))
def get_pretrained_word_embed(self,
                              vocab,
                              magnitude='GoogleNews-vectors-negative300.magnitude',
                              verbose=1):
    """
    Loads word embeddings from magnitude into pytorch.

    Returns
    -------
    pretrained_word_embed: torch.Tensor of size (vocab x word embedding size)
    unknown_words: list of unknown words that are not in the prefit embeddings
    """
    magnitude = Magnitude(magnitude)
    pretrained_word_embed = [
        torch.from_numpy(magnitude.query(word)).type(
            torch.cuda.FloatTensor).view(1, -1) for word in vocab
    ]
    pretrained_word_embed = torch.cat(pretrained_word_embed, dim=0)
    self.unknown_words = [word for word in vocab if word not in magnitude]
    n_unknown_words = len(self.unknown_words)
    if verbose:
        print(f"Unknown words: {n_unknown_words} / {len(vocab)}")
    return pretrained_word_embed
class WordVectorFeature(FeatureExtractor):
    def __init__(self, vectors_path: str, scaling: float = 1.0) -> None:
        self.mag = Magnitude(vectors_path, normalized=False)
        self.scaling = scaling

    def extract(
        self,
        token: str,
        current_idx: int,
        relative_idx: int,
        tokens: Sequence[str],
        features: Dict[str, float],
    ) -> None:
        # let's just write the suggested lines right here, and see how much they hook up
        if relative_idx == 0:
            values = self.mag.query(token) * self.scaling  # Scaling matters lol
            # print(token, current_idx, tokens, values)
            keys = ["v" + str(i) for i in range(len(values))]
            features.update(zip(keys, values))
class WordVectorFeature(FeatureExtractor):
    def __init__(self, vectors_path: str, scaling: float = 1.0) -> None:
        self.magnitude_word_vector = Magnitude(vectors_path, normalized=False)
        self.scale = scaling

    def extract(
        self,
        token: str,
        current_idx: int,
        relative_idx: int,
        tokens: Sequence[str],
        features: Dict[str, float],
    ) -> None:
        if relative_idx == 0:
            word_vector = self.magnitude_word_vector.query(token)
            scaled_word_vector = word_vector * self.scale
            keys = []
            for i in range(len(scaled_word_vector)):
                keys.append("v" + str(i))
            features.update(zip(keys, scaled_word_vector))
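# Hedged driver for the WordVectorFeature variants directly above: each call
# fills a features dict with "v0".."v{dim-1}" entries for the token at
# relative offset 0. The vectors path is a placeholder.
feature = WordVectorFeature('vectors/wiki-news-300d-1M-subword.magnitude',
                            scaling=0.5)
tokens = ['Paris', 'is', 'nice']
features = {}
feature.extract(tokens[0], 0, 0, tokens, features)
print(len(features))  # equals the vector dimension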
def preprocess(file_name, m, word2vec_magnitude_file):
    raw_data = open(file_name)
    X_pickle_file = open(file_name[:-4] + "_X.p", 'wb')
    y_pickle_file = open(file_name[:-4] + "_y.p", 'wb')
    comments = []
    y = []
    for line in raw_data:
        y.append(int(line.split('\t')[0]))
        line = line.split('\t')[1]
        line = re.sub(r'\d+', '', line)
        tokens = word_tokenize(line)
        comments.append(tokens)
    # max_len = len(max(comments, key=len))  # denote n
    max_len = 100
    # matrix m x n x d, m = sample, n = max_len of sentence, d = word vector size
    # TODO: If time allows, train on our data. For now, we just use Google's pretrained model
    vector_size = 300
    # model = gensim.models.Word2Vec(comments, size=vector_size, window=8, min_count=1, workers=10)
    # model.train(comments, total_examples=len(comments), epochs=10)
    # TODO: Use FastText to handle unseen words, also Magnitude!
    model = Magnitude(word2vec_magnitude_file)
    X = zeros([m, max_len, vector_size])
    # start = time.time()
    for comment_idx, sentence in enumerate(comments):
        for sentence_idx, word in enumerate(sentence):
            if sentence_idx == max_len - 1:
                break
            X[comment_idx][sentence_idx] = model.query(word)
        # if comment_idx % 250 == 0:
        #     print(comment_idx)
        #     print(time_since(start))
    pickle.dump(X, X_pickle_file)
    pickle.dump(y, y_pickle_file)
    X_pickle_file.close()
    y_pickle_file.close()
class MagnitudeTest(unittest.TestCase):
    MAGNITUDE_PATH = ""
    MAGNITUDE_SUBWORD_PATH = ""
    MAGNITUDE_APPROX_PATH = ""

    def setUp(self):
        self.vectors = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                 case_insensitive=True,
                                 eager=True)
        self.vectors_cs = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                    case_insensitive=False,
                                    eager=False)
        self.vectors_sw = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                    case_insensitive=True,
                                    eager=False)
        self.vectors_approx = Magnitude(MagnitudeTest.MAGNITUDE_APPROX_PATH,
                                        case_insensitive=True,
                                        eager=False)
        self.tmp_vectors = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                     case_insensitive=True,
                                     eager=False)
        self.concat_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                  case_insensitive=True,
                                  eager=False)
        self.concat_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                  case_insensitive=True,
                                  eager=False)
        self.concat = Magnitude(self.concat_1, self.concat_2)
        self.vectors_feat = FeaturizerMagnitude(100, case_insensitive=True)
        self.v = {
            'padding': self.tmp_vectors._padding_vector(),
            'I': self.tmp_vectors.query("I"),
            'saw': self.tmp_vectors.query("saw"),
            'a': self.tmp_vectors.query("a"),
            'cat': self.tmp_vectors.query("cat"),
            'He': self.tmp_vectors.query("He"),
            'went': self.tmp_vectors.query("went"),
            'to': self.tmp_vectors.query("to"),
            'the': self.tmp_vectors.query("the"),
            'mall': self.tmp_vectors.query("mall"),
            'blah123': self.tmp_vectors.query("blah123")
        }

    def tearDown(self):
        self.vectors.close()
        self.vectors_cs.close()
        self.vectors_sw.close()
        self.tmp_vectors.close()
        self.concat_1.close()
        self.concat_2.close()
        del self.concat
        self.vectors_feat.close()
        gc.collect()

    def test_length(self):
        self.assertEqual(len(self.vectors), 3000000)

    def test_dim(self):
        self.assertEqual(self.vectors.dim, 300)

    def test_index(self):
        self.assertTrue(isinstance(self.vectors[0][0], unicode))
        self.assertTrue(isinstance(self.vectors[0][1], np.ndarray))
        self.assertTrue(isinstance(self.vectors.index(0)[0], unicode))
        self.assertTrue(isinstance(self.vectors.index(0)[1], np.ndarray))
        self.assertTrue(
            isinstance(self.vectors.index(0, return_vector=False), unicode))

    def test_slice(self):
        sliced = self.vectors[0:5]
        self.assertEqual(len(sliced), 5)
        self.assertEqual(sliced[0][0], self.vectors[0][0])
        self.assertTrue(isclose(sliced[0][1], self.vectors[0][1]).all())

    def test_case_insensitive(self):
        some_keys_are_not_lower = False
        for i, (k, _) in enumerate(self.vectors):
            if i > 1000:
                break
            some_keys_are_not_lower = (some_keys_are_not_lower
                                       or k.lower() != k)
        self.assertTrue(some_keys_are_not_lower)
        self.assertTrue("QuEEn" in self.vectors)
        self.assertTrue("QUEEN" in self.vectors)
        self.assertTrue("queen" in self.vectors)
        self.assertTrue(
            isclose(self.vectors.query("Queen"),
                    self.vectors.query("QuEEn")).all())
        self.assertEqual(
            self.vectors.most_similar("I", return_similarities=False)[0],
            'myself')
        self.assertEqual(
            self.vectors.most_similar("i", return_similarities=False)[0],
            'ive')
        self.assertTrue(self.vectors.similarity("a", "A") > .9)

    def test_case_sensitive(self):
        some_keys_are_not_lower = False
        for i, (k, _) in enumerate(self.vectors_cs):
            if i > 1000:
                break
            some_keys_are_not_lower = (some_keys_are_not_lower
                                       or k.lower() != k)
        self.assertTrue(some_keys_are_not_lower)
        self.assertTrue("QuEEn" not in self.vectors_cs)
        self.assertTrue("QUEEN" in self.vectors_cs)
        self.assertTrue("queen" in self.vectors_cs)
        self.assertTrue(not isclose(self.vectors_cs.query("Queen"),
                                    self.vectors_cs.query("QuEEn")).all())
        self.assertEqual(
            self.vectors_cs.most_similar("I", return_similarities=False)[0],
            'myself')
        self.assertEqual(
            self.vectors_cs.most_similar("i", return_similarities=False)[0],
            'ive')
        self.assertTrue(self.vectors_cs.similarity("a", "A") > .9)

    def test_iter_case_insensitive(self):
        for _ in range(2):
            for i, (k, v) in enumerate(self.vectors):
                if i > 1000:
                    break
                k2, v2 = self.vectors[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_iter_case_sensitive(self):
        for _ in range(2):
            for i, (k, v) in enumerate(self.vectors_cs):
                if i > 1000:
                    break
                k2, v2 = self.vectors_cs[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_index_case_insensitive(self):
        for _ in range(2):
            viter = iter(self.vectors)
            for i in range(len(self.vectors)):
                if i > 1000:
                    break
                k, v = next(viter)
                k2, v2 = self.vectors[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_index_case_sensitive(self):
        for _ in range(2):
            viter = iter(self.vectors_cs)
            for i in range(len(self.vectors_cs)):
                if i > 1000:
                    break
                k, v = next(viter)
                k2, v2 = self.vectors_cs[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_bounds(self):
        length = len(self.vectors)
        self.assertTrue(isinstance(self.vectors[length - 1][0], unicode))
        self.assertTrue(isinstance(self.vectors[length - 1][1], np.ndarray))

    @unittest.expectedFailure
    def test_out_of_bounds(self):
        length = len(self.vectors)
        self.assertTrue(isinstance(self.vectors[length][0], unicode))
        self.assertTrue(isinstance(self.vectors[length][1], np.ndarray))

    def test_contains(self):
        self.assertTrue("cat" in self.vectors)

    def test_contains_false(self):
        self.assertTrue("blah123" not in self.vectors)

    def test_special_characters(self):
        self.assertTrue("Wilkes-Barre/Scranton" in self.vectors)
        self.assertTrue("out-of-vocabulary" not in self.vectors)
        self.assertTrue('quotation"s' not in self.vectors)
        self.assertTrue("quotation's" not in self.vectors)
        self.assertTrue("colon;s" not in self.vectors)
        self.assertTrue("sh**" not in self.vectors)
        self.assertTrue("'s" not in self.vectors_cs)
        self.assertTrue('"s' not in self.vectors)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("Wilkes-Barre/Scranton").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("out-of-vocabulary").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query('quotation"s').shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("quotation's").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("colon;s").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("sh**").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors_cs.query("'s").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query('"s').shape)

    def test_oov_dim(self):
        self.assertEqual(
            self.vectors.query("*<<<<").shape,
            self.vectors.query("cat").shape)

    def test_oov_subword_dim(self):
        self.assertEqual(
            self.vectors_sw.query("*<<<<").shape,
            self.vectors_sw.query("cat").shape)

    def test_oov_dim_placeholders(self):
        self.vectors_placeholders = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                              placeholders=5,
                                              case_insensitive=True,
                                              eager=False)
        self.assertEqual(
            self.vectors_placeholders.query("*<<<<").shape,
            self.vectors_placeholders.query("cat").shape)
        self.assertTrue(
            isclose(
                self.vectors.query("*<<<<")[0],
                self.vectors_placeholders.query("*<<<<")[0]))
        self.vectors_placeholders.close()

    def test_oov_subword_dim_placeholders(self):
        self.vectors_placeholders = Magnitude(
            MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
            placeholders=5,
            case_insensitive=True,
            eager=False)
        self.assertEqual(
            self.vectors_placeholders.query("*<<<<").shape,
            self.vectors_placeholders.query("cat").shape)
        self.assertTrue(
            isclose(
                self.vectors.query("*<<<<")[0],
                self.vectors_placeholders.query("*<<<<")[0]))
        self.vectors_placeholders.close()

    def test_oov_unit_norm(self):
        self.assertTrue(
            isclose(np.linalg.norm(self.vectors.query("*<<<<<")), 1.0))

    def test_oov_subword_unit_norm(self):
        self.assertTrue(
            isclose(np.linalg.norm(self.vectors_sw.query("*<<<<<")), 1.0))

    def test_ngram_oov_closeness(self):
        self.assertTrue(self.vectors.similarity("uberx", "uberxl") > .7)
        self.assertTrue(self.vectors.similarity("uberx", "veryrandom") < .7)
        self.assertTrue(
            self.vectors.similarity("veryrandom", "veryrandom") > .7)

    def test_ngram_oov_subword_closeness(self):
        self.assertTrue(self.vectors_sw.similarity("uberx", "uberxl") > .7)
        self.assertTrue(self.vectors_sw.similarity("uberx", "uber") > .7)
        self.assertTrue(self.vectors_sw.similarity("uberxl", "uber") > .7)
        self.assertTrue(
            self.vectors_sw.similarity("discriminatoryy", "discriminatory") >
            .7)
        self.assertTrue(
            self.vectors_sw.similarity("discriminatoryy", "discriminnatory") >
            .8)
        self.assertTrue(self.vectors_sw.similarity("uberx", "veryrandom") < .7)
        self.assertTrue(
            self.vectors_sw.similarity("veryrandom", "veryrandom") > .7)
        self.assertTrue(self.vectors_sw.similarity("hiiiiiiiii", "hi") > .7)
        self.assertTrue(self.vectors_sw.similarity("heeeeeeeey", "hey") > .7)
        self.assertTrue(self.vectors_sw.similarity("heyyyyyyyyyy", "hey") > .7)
        self.assertTrue(self.vectors_sw.similarity("faaaaaate", "fate") > .65)

    def test_oov_values(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<")[0], -0.0372075283555))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<<")[0], -0.0201727917272))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<<<")[0], -0.0475993225776))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<<<<")[0], 0.0129938352266))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<")[0], -0.0372075283555))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<<")[0], -0.0201727917272))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<<<")[0], -0.0475993225776))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<<<<")[0], 0.0129938352266))
        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_oov_subword_values(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.assertTrue(
            isclose(
                self.vectors_oov_1.query("discriminatoryy")[0],
                -0.0573252095591))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("uberx")[0], 0.0952671681336))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("misssipi")[0], 0.0577835297955))
        self.assertTrue(
            isclose(
                self.vectors_oov_2.query("discriminatoryy")[0],
                -0.0573252095591))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("uberx")[0], 0.0952671681336))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("misssipi")[0], 0.0577835297955))
        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_oov_stability(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        for i in range(5):
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<"),
                        self.vectors_oov_2.query("*<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<"),
                        self.vectors_oov_2.query("*<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<"),
                        self.vectors_oov_2.query("*<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<"),
                        self.vectors_oov_2.query("*<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<"),
                        self.vectors_oov_2.query("*<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<<")).all())
        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_ngram_oov_stability(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=True,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=True,
                                       eager=False)
        for i in range(5):
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<"),
                        self.vectors_oov_2.query("*<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<"),
                        self.vectors_oov_2.query("*<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<"),
                        self.vectors_oov_2.query("*<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<"),
                        self.vectors_oov_2.query("*<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<"),
                        self.vectors_oov_2.query("*<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<<")).all())
        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_ngram_oov_subword_stability(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       eager=False)
        for i in range(5):
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<"),
                        self.vectors_oov_2.query("*<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<"),
                        self.vectors_oov_2.query("*<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<"),
                        self.vectors_oov_2.query("*<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<"),
                        self.vectors_oov_2.query("*<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<"),
                        self.vectors_oov_2.query("*<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<<")).all())
        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_placeholders(self):
        self.vectors_placeholders = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                              case_insensitive=True,
                                              placeholders=5,
                                              eager=False)
        self.assertEqual(self.vectors_placeholders.query("cat").shape,
                         (305, ))
        self.assertEqual(
            self.vectors_placeholders.query("cat")[0],
            self.vectors.query("cat")[0])
        self.vectors_placeholders.close()

    def test_numpy(self):
        self.assertTrue(isinstance(self.vectors.query("cat"), np.ndarray))

    def test_list(self):
        self.vectors_list = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                      case_insensitive=True,
                                      use_numpy=False,
                                      eager=False)
        self.assertTrue(isinstance(self.vectors_list.query("cat"), list))
        self.vectors_list.close()

    def test_repeated_single(self):
        q = "cat"
        result = self.vectors.query(q)
        result_2 = self.vectors.query(q)
        self.assertTrue(isclose(result, result_2).all())

    def test_repeated_multiple(self):
        q = ["I", "saw", "a", "cat"]
        result = self.vectors.query(q)
        result_2 = self.vectors.query(q)
        self.assertTrue(isclose(result, result_2).all())
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q)
        result_2 = self.vectors.query(q)
        self.assertTrue(isclose(result, result_2).all())

    def test_multiple(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q)
        self.assertEqual(result.shape, (2, 5, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[0][3], self.v['cat']).all())
        self.assertTrue(isclose(result[0][4], self.v['padding']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        self.assertTrue(isclose(result[1][3], self.v['the']).all())
        self.assertTrue(isclose(result[1][4], self.v['mall']).all())
        return result

    def test_pad_to_length_right_truncate_none(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=6)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[0][3], self.v['cat']).all())
        self.assertTrue(isclose(result[0][4], self.v['padding']).all())
        self.assertTrue(isclose(result[0][5], self.v['padding']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        self.assertTrue(isclose(result[1][3], self.v['the']).all())
        self.assertTrue(isclose(result[1][4], self.v['mall']).all())
        self.assertTrue(isclose(result[1][5], self.v['padding']).all())
        return result

    def test_pad_to_length_truncate_none(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=6)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[0][3], self.v['cat']).all())
        self.assertTrue(isclose(result[0][4], self.v['padding']).all())
        self.assertTrue(isclose(result[0][5], self.v['padding']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        self.assertTrue(isclose(result[1][3], self.v['the']).all())
        self.assertTrue(isclose(result[1][4], self.v['mall']).all())
        self.assertTrue(isclose(result[1][5], self.v['padding']).all())
        return result

    def test_pad_to_length_left_truncate_none(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=6, pad_left=True)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['padding']).all())
        self.assertTrue(isclose(result[0][1], self.v['padding']).all())
        self.assertTrue(isclose(result[0][2], self.v['I']).all())
        self.assertTrue(isclose(result[0][3], self.v['saw']).all())
        self.assertTrue(isclose(result[0][4], self.v['a']).all())
        self.assertTrue(isclose(result[0][5], self.v['cat']).all())
        self.assertTrue(isclose(result[1][0], self.v['padding']).all())
        self.assertTrue(isclose(result[1][1], self.v['He']).all())
        self.assertTrue(isclose(result[1][2], self.v['went']).all())
        self.assertTrue(isclose(result[1][3], self.v['to']).all())
        self.assertTrue(isclose(result[1][4], self.v['the']).all())
        self.assertTrue(isclose(result[1][5], self.v['mall']).all())
        return result

    def test_pad_to_length_truncate_right(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=3)
        self.assertEqual(result.shape, (2, 3, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        return result

    def test_pad_to_length_truncate_left(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=3, truncate_left=True)
        self.assertEqual(result.shape, (2, 3, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['saw']).all())
        self.assertTrue(isclose(result[0][1], self.v['a']).all())
        self.assertTrue(isclose(result[0][2], self.v['cat']).all())
        self.assertTrue(isclose(result[1][0], self.v['to']).all())
        self.assertTrue(isclose(result[1][1], self.v['the']).all())
        self.assertTrue(isclose(result[1][2], self.v['mall']).all())
        return result

    def test_list_multiple(self):
        self.vectors_list = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                      case_insensitive=True,
                                      use_numpy=False,
                                      eager=False)
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        self.assertTrue(isinstance(self.vectors_list.query(q[0]), list))
        self.assertTrue(
            isclose(self.vectors.query(q[0]),
                    asarray(self.vectors_list.query(q[0]))).all())
        self.assertTrue(isinstance(self.vectors_list.query(q), list))
        self.assertTrue(
            isclose(self.vectors.query(q),
                    asarray(self.vectors_list.query(q))).all())
        self.vectors_list.close()

    def test_concat(self):
        q = "cat"
        result = self.concat.query(q)
        self.assertEqual(result.shape, (self.vectors.dim * 2, ))
        self.assertTrue(isclose(result[0:300], self.v['cat']).all())
        self.assertTrue(isclose(result[300:600], self.v['cat']).all())

    def test_concat_multiple(self):
        q = ["I", "saw"]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][300:600], self.v['I']).all())
        self.assertTrue(isclose(result[1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[1][300:600], self.v['saw']).all())

    def test_concat_multiple_2(self):
        q = [["I", "saw"], ["He", "went"]]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][0][300:600], self.v['I']).all())
        self.assertTrue(isclose(result[0][1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[0][1][300:600], self.v['saw']).all())
        self.assertTrue(isclose(result[1][0][0:300], self.v['He']).all())
        self.assertTrue(isclose(result[1][0][300:600], self.v['He']).all())
        self.assertTrue(isclose(result[1][1][0:300], self.v['went']).all())
        self.assertTrue(isclose(result[1][1][300:600], self.v['went']).all())

    def test_concat_specific(self):
        q = ("cat", "mall")
        result = self.concat.query(q)
        self.assertEqual(result.shape, (self.vectors.dim * 2, ))
        self.assertTrue(isclose(result[0:300], self.v['cat']).all())
        self.assertTrue(isclose(result[300:600], self.v['mall']).all())

    def test_concat_multiple_specific(self):
        q = [("I", "He"), ("saw", "went")]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][300:600], self.v['He']).all())
        self.assertTrue(isclose(result[1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[1][300:600], self.v['went']).all())

    def test_concat_multiple_2_specific(self):
        q = [[("I", "He"), ("saw", "went")], [("He", "I"), ("went", "saw")]]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][0][300:600], self.v['He']).all())
        self.assertTrue(isclose(result[0][1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[0][1][300:600], self.v['went']).all())
        self.assertTrue(isclose(result[1][0][0:300], self.v['He']).all())
        self.assertTrue(isclose(result[1][0][300:600], self.v['I']).all())
        self.assertTrue(isclose(result[1][1][0:300], self.v['went']).all())
        self.assertTrue(isclose(result[1][1][300:600], self.v['saw']).all())

    def test_distance(self):
        self.assertTrue(
            isclose(self.vectors.distance("cat", "dog"), 0.69145405))

    def test_distance_multiple(self):
        self.assertTrue(
            isclose(self.vectors.distance("cat", ["cats", "dog"]),
                    [0.61654216, 0.69145405]).all())

    def test_similarity(self):
        self.assertTrue(
            isclose(self.vectors.similarity("cat", "dog"),
                    0.7609457089782209))

    def test_similarity_multiple(self):
        self.assertTrue(
            isclose(self.vectors.similarity("cat", ["cats", "dog"]),
                    [0.8099378824686305, 0.7609457089782209]).all())

    def test_most_similar_to_given(self):
        self.assertEqual(
            self.vectors.most_similar_to_given(
                "cat", ["dog", "television", "laptop"]), "dog")
        self.assertEqual(
            self.vectors.most_similar_to_given(
                "cat", ["television", "dog", "laptop"]), "dog")
        self.assertEqual(
            self.vectors.most_similar_to_given(
                "cat", ["television", "laptop", "dog"]), "dog")

    def test_doesnt_match(self):
        self.assertEqual(
            self.vectors.doesnt_match(
                ["breakfast", "cereal", "lunch", "dinner"]), "cereal")
        self.assertEqual(
            self.vectors.doesnt_match(
                ["breakfast", "lunch", "cereal", "dinner"]), "cereal")
        self.assertEqual(
            self.vectors.doesnt_match(
                ["breakfast", "lunch", "dinner", "cereal"]), "cereal")

    def test_most_similar_case_insensitive(self):
        keys = [s[0] for s in self.vectors.most_similar("queen", topn=5)]
        similarities = [
            s[1] for s in self.vectors.most_similar("queen", topn=5)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar(self):
        keys = [s[0] for s in self.vectors_cs.most_similar("queen")]
        similarities = [s[1] for s in self.vectors_cs.most_similar("queen")]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587, 0.6163408160209656,
                        0.6060680150985718, 0.5923796892166138,
                        0.5908075571060181, 0.5637184381484985
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens',
            u'princess',
            u'king',
            u'monarch',
            u'very_pampered_McElhatton',
            u'Queen',
            u'NYC_anglophiles_aflutter',
            u'Queen_Consort',
            u'princesses',
            u'royal',
        ])

    def test_most_similar_no_similarities(self):
        keys = self.vectors_cs.most_similar("queen",
                                            return_similarities=False)
        self.assertEqual(keys, [
            u'queens',
            u'princess',
            u'king',
            u'monarch',
            u'very_pampered_McElhatton',
            u'Queen',
            u'NYC_anglophiles_aflutter',
            u'Queen_Consort',
            u'princesses',
            u'royal',
        ])

    def test_most_similar_top_5(self):
        keys = [s[0] for s in self.vectors_cs.most_similar("queen", topn=5)]
        similarities = [
            s[1] for s in self.vectors_cs.most_similar("queen", topn=5)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar_min_similarity(self):
        keys = [
            s[0]
            for s in self.vectors_cs.most_similar("queen", min_similarity=.63)
        ]
        similarities = [
            s[1]
            for s in self.vectors_cs.most_similar("queen", min_similarity=.63)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar_analogy(self):
        keys = [
            s[0] for s in self.vectors_cs.most_similar(
                positive=["king", "woman"], negative=["man"])
        ]
        similarities = [
            s[1] for s in self.vectors_cs.most_similar(
                positive=["king", "woman"], negative=["man"])
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7118192315101624, 0.6189674139022827,
                        0.5902431011199951, 0.549946129322052,
                        0.5377321243286133, 0.5236844420433044,
                        0.5235944986343384, 0.518113374710083,
                        0.5098593831062317, 0.5087411403656006
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queen', u'monarch', u'princess', u'crown_prince', u'prince',
            u'kings', u'Queen_Consort', u'queens', u'sultan', u'monarchy'
        ])

    def test_most_similar_cosmul_analogy(self):
        keys = [
            s[0] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"], negative=["man"])
        ]
        similarities = [
            s[1] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"], negative=["man"])
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.9314123392105103, 0.858533501625061,
                        0.8476565480232239, 0.8150269985198975,
                        0.809981644153595, 0.8089977502822876,
                        0.8027306795120239, 0.801961362361908,
                        0.8009798526763916, 0.7958389520645142
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queen', u'monarch', u'princess', u'Queen_Consort', u'queens',
            u'crown_prince', u'royal_palace', u'monarchy', u'prince',
            u'empress'
        ])

    def test_most_similar_cosmul_min_similarity_analogy(self):
        keys = [
            s[0] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"],
                negative=["man"],
                min_similarity=.81)
        ]
        similarities = [
            s[1] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"],
                negative=["man"],
                min_similarity=.81)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.9314123392105103, 0.858533501625061,
                        0.8476565480232239, 0.8150269985198975
                    ]),
                    atol=.02).all())
        self.assertEqual(keys,
                         [u'queen', u'monarch', u'princess', u'Queen_Consort'])

    def test_closer_than(self):
        self.assertEqual(self.vectors.closer_than("cat", "dog"), ["cats"])

    def test_most_similar_approx(self):
        keys = [
            s[0]
            for s in self.vectors_approx.most_similar_approx("queen", topn=15)
        ]
        similarities = [
            s[1]
            for s in self.vectors_approx.most_similar_approx("queen", topn=15)
        ]
        self.assertEqual(len(keys), 15)
        self.assertTrue(similarities[0] > .7 and similarities[-1] > .5)

    @unittest.expectedFailure
    def test_most_similar_approx_failure(self):
        self.vectors.most_similar_approx("queen", topn=15)

    def test_most_similar_approx_low_effort(self):
        keys = [
            s[0] for s in self.vectors_approx.most_similar_approx(
                "queen", topn=15, effort=.1)
        ]
        self.assertEqual(len(keys), 15)
        self.assertEqual(keys[0], "princess")

    def test_most_similar_analogy_approx(self):
        keys = [
            s[0] for s in self.vectors_approx.most_similar_approx(
                positive=["king", "woman"], negative=["man"], topn=15)
        ]
        self.assertEqual(keys[0], "queen")

    def test_feat_length(self):
        self.vectors_feat_2 = FeaturizerMagnitude(1000, case_insensitive=True)
        self.assertEqual(self.vectors_feat.dim, 4)
        self.assertEqual(self.vectors_feat_2.dim, 5)
        self.vectors_feat_2.close()

    def test_feat_stability(self):
        self.vectors_feat_2 = FeaturizerMagnitude(100, case_insensitive=True)
        self.assertTrue(
            isclose(self.vectors_feat.query("VBG"),
                    self.vectors_feat_2.query("VBG")).all())
        self.assertTrue(
            isclose(self.vectors_feat.query("PRP"),
                    self.vectors_feat_2.query("PRP")).all())
        self.vectors_feat_2.close()

    def test_feat_values(self):
        self.assertTrue(
            isclose(self.vectors_feat.query("VBG")[0], 0.490634876828))
        self.assertTrue(
            isclose(self.vectors_feat.query("PRP")[0], 0.463890807802))
        self.assertTrue(isclose(
            self.vectors_feat.query(5)[0], -0.750681075834))
        self.assertTrue(
            isclose(self.vectors_feat.query(5)[-1], 1.46936807866e-38))
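# Hedged usage note for the test suite above: it expects real .magnitude
# files, so a minimal driver might set the class-level paths before invoking
# unittest. The file names below are placeholders for locally converted
# plain, subword, and approximate-most-similar builds.
if __name__ == '__main__':
    MagnitudeTest.MAGNITUDE_PATH = 'GoogleNews-vectors-negative300.magnitude'              # assumed local file
    MagnitudeTest.MAGNITUDE_SUBWORD_PATH = 'GoogleNews-vectors-negative300.sw.magnitude'   # assumed subword build
    MagnitudeTest.MAGNITUDE_APPROX_PATH = 'GoogleNews-vectors-negative300.approx.magnitude'  # assumed approx build
    unittest.main()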
from pymagnitude import Magnitude

vectors = Magnitude('GoogleNews-vectors-negative300.magnitude')

cat_vector = vectors.query('cat')
print(cat_vector)
print(vectors.similarity("cat", "dog"))
print(vectors.most_similar("cat", topn=100))


def similarity(word1, word2):
    return vectors.similarity(word1, word2)
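# Hedged extension of the example above: query() also accepts token lists and
# batches of lists, returning matrices and padded 3-D arrays (the shapes
# mirror those asserted in the test suite earlier in this section).
print(vectors.query(['I', 'saw', 'a', 'cat']).shape)      # (4, 300)
print(vectors.query([['I', 'saw'], ['a', 'cat']]).shape)  # (2, 2, 300)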
class SemanticSpace(object):
    def __init__(self, magnitude_path=None):
        """
        :param str magnitude_path: Path to a .pymagnitude embeddings file.
        """
        self.database = magnitude_path
        if self.database is not None:
            self.embeddings = Magnitude(self.database)

    def _embeddings(self, tokens):
        """
        Loads a subset of all embeddings into a DataFrame.

        :param set tokens: set of tokens to get embeddings for
        :return: DataFrame containing embeddings
        :rtype: DataFrame
        """
        tokens = list(set(tokens))
        vectors = [self.embeddings.query(token) for token in tokens]
        df = DataFrame(data=vectors, index=tokens)
        return df

    def generate2d(self, tokens, method='umap'):
        """
        Creates 2d-coordinates for a list of tokens.

        :param list tokens: list of tokens to generate coordinates for
        :param str method: umap / tsne
        :return: pandas.DataFrame with x and y coordinates
        :rtype: pandas.DataFrame
        """
        # load vectors
        embeddings = self._embeddings(tokens)
        # if no vectors are loaded
        if embeddings.empty:
            return DataFrame()
        # just in case
        embeddings = embeddings.dropna()
        # set up transformer
        if method == 'tsne':
            transformer = TSNE(n_components=2,
                               metric='euclidean',
                               perplexity=10.,
                               verbose=0)
        elif method == 'umap':
            transformer = UMAP()
        else:
            raise NotImplementedError('transformation "%s" not supported' %
                                      method)
        # generate 2d coordinates as data frame
        coordinates = DataFrame(data=transformer.fit_transform(embeddings),
                                index=embeddings.index,
                                columns=['x', 'y'])
        coordinates.index.name = 'item'
        # save coordinates
        self.coordinates = coordinates
        return coordinates

    def add(self, item, cutoff=.2):
        """
        Calculates new coordinates for one embedding, based on cosine similarity.

        :param str item: token to add
        :param float cutoff: cut-off value for cosine similarity
        :return: pandas.Series with ['tsne_x', 'tsne_y', 'user_x', 'user_y']
        :rtype: pandas.Series
        """
        # get embedding for the item
        item_embedding = self._embeddings([item])[0]
        # gather all similar embeddings
        similarities = []
        for base_embedding in self.coordinates.values:
            similarity = 1 - cosine(item_embedding, base_embedding)
            if similarity >= cutoff:
                similarities.append(similarity)
            else:
                similarities.append(0)
        global_similarity_index = sum(similarities)
        if global_similarity_index == 0:
            # put in global center
            new_coordinates = self.coordinates.sum() / len(self.coordinates)
        else:
            # weighted average
            tmp_coordinates = self.coordinates.apply(
                lambda x: x * similarities)
            new_coordinates = tmp_coordinates.sum() / global_similarity_index
        # append new coordinates
        self.coordinates = self.coordinates.append(
            DataFrame(data={
                'x': new_coordinates['x'],
                'y': new_coordinates['y'],
            },
                      index=[item]))
        return new_coordinates

    def visualize(self, size, title='semantic map', path="/tmp/vis.html"):
        """
        :param Series size: pd.Series containing label sizes
        """
        output_file(path)
        print(self.coordinates.join(size).columns)
        source = ColumnDataSource(self.coordinates.join(size))
        p = figure(title=title)
        p.scatter(x='x', y='y', size=size.name, source=source)
        p.xaxis[0].axis_label = ''
        p.yaxis[0].axis_label = ''
        # coordinate labels = items
        labels = LabelSet(
            x='x',
            y='y',
            text='item',
            level='glyph',
            x_offset=5,
            # text_font_size=size.name,
            y_offset=5,
            source=source,
            render_mode='canvas')
        p.add_layout(labels)
        show(p)
        print(p)
class EmbeddingHolder:
    """
    A utility class to load a pipeline and cache it in memory
    """

    PAD = "<pad>"
    SUPPORTED_EMBEDDINGS = {
        "glove6b": "glove/light/glove.6B.300d",
        "glove6b.50d": "glove/light/glove.6B.50d",
        "w2vnews": "word2vec/light/GoogleNews-vectors-negative300",
        "fasttext": "fasttext/light/wiki-news-300d-1M-subword",
    }
    instances = {}

    def __init__(self, embedding_name):
        """
        If the _is_initialized class property is not set, build the benchmark
        and model (expensive). Else, do nothing.
        """
        self.embedding_name = embedding_name
        self.embedding = Magnitude(
            MagnitudeUtils.download_model(
                self.SUPPORTED_EMBEDDINGS[embedding_name],
                download_dir=os.environ.get("CAPREOLUS_CACHE",
                                            get_default_cache_dir())),
            lazy_loading=-1,
            blocking=True,
        )
        # string to integer. Associates an integer value with every token
        self.stoi = {self.PAD: 0}
        self.itos = {0: self.PAD}

    @classmethod
    def get_instance(cls, embedding_name):
        if not cls.instances.get(embedding_name):
            logger.debug("Caching embedding")
            cls.instances[embedding_name] = EmbeddingHolder(embedding_name)
        return cls.instances[embedding_name]

    def get_stoi(self):
        return self.stoi

    def get_itos(self):
        return self.itos

    def get_nvocab(self):
        # I have no idea what nvocab is. TODO: Figure this out - DSSM needs this
        return None

    def create_indexed_embedding_layer_from_tokens(self, tokens):
        """
        For each token in the list of tokens:
        1. index = Converts the token into an integer
        2. embedding_for_token = Gets the embedding for the token from self.embedding
        3. creates a tensor where tensor[index] = embedding_for_token

        Why do we need to do this? We cannot use the downloaded magnitude
        embedding directly in a pytorch network. We need to convert it into an
        indexed tensor, and that is what we're doing here.

        :param tokens: A list of tokens
        :return: A tensor of dimension (len(tokens), self.embedding.dim)
        """
        tokens_minus_padding = [token for token in tokens if token != self.PAD]
        # Removing duplicates. Works only on python 3.7. See https://stackoverflow.com/a/7961390/1841522
        tokens_minus_padding = list(dict.fromkeys(tokens_minus_padding))
        vectors = self.embedding.query(tokens_minus_padding)
        indexed_embedding = np.zeros((len(vectors) + 1, self.embedding.dim),
                                     dtype=np.float32)
        indexed_embedding[self.stoi[self.PAD]] = np.zeros(self.embedding.dim)
        for i in range(0, len(vectors)):
            # i + 1 because i starts from 0, and 0 is reserved for PAD
            self.stoi[tokens_minus_padding[i]] = i + 1
            self.itos[i + 1] = tokens_minus_padding[i]
            indexed_embedding[i + 1] = vectors[i]
        return indexed_embedding

    def get_index_array_from_tokens(self, tokens, maxlen):
        indices = [self.stoi.get(token, 0) for token in tokens]
        return np.array(padlist(indices, maxlen))
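# Hedged usage sketch of the cache above; the token list is illustrative and
# instantiation downloads the chosen embedding on first use.
holder = EmbeddingHolder.get_instance("glove6b")
indexed = holder.create_indexed_embedding_layer_from_tokens(["hello", "world"])
idx_array = holder.get_index_array_from_tokens(["hello", "world"], maxlen=4)
print(indexed.shape, idx_array)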
import argparse

import torch
from pymagnitude import Magnitude
from tqdm import tqdm

from util import load_pickle, save_pickle

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--vocab_path', type=str, required=True)
    parser.add_argument('--embedding_path', type=str, required=True)
    parser.add_argument('--output_path', type=str, required=True)
    args = parser.parse_args()

    vocab_path = args.vocab_path
    embedding_path = args.embedding_path
    output_path = args.output_path

    print("Loading vocab...")
    vocab = torch.load(vocab_path)
    word2id = vocab['src'].base_field.vocab.stoi
    id2word = vocab['src'].base_field.vocab.itos
    print("vocab size: {0}".format(len(word2id)))

    print("Loading magnitude...")
    word_vectors = Magnitude(embedding_path)
    dim = len(word_vectors.query(id2word[0]))

    print("Building vocab embedding...")
    vocab_embedding = torch.zeros((len(word2id), dim))
    for w, _id in tqdm(word2id.items()):
        vocab_embedding[_id] = torch.from_numpy(word_vectors.query(w))

    # save vocab embedding
    print("Saving vocab embedding...")
    torch.save(vocab_embedding, output_path)
class EmbeddingEngine: """ An interface to query pre-trained word vectors. """ ABBR_LIST = [ "C41H11O11", "PV", "OPV", "PV12", "CsOS", "CsKPSV", "CsPS", "CsHIOS", "OPV", "CsPSV", "CsOPV", "CsIOS", "BCsIS", "CsPrS", "CEsH", "KP307", "AsOV", "CEsS", "COsV", "CNoO", "BEsF", "I2P3", "KP115", "BCsIS", "C9705IS", "ISC0501", "B349S", "CISe", "CISSe", "CsIPS", "CEsP", "BCsF", "CsFOS", "BCY10", "C12P", "EsHP", "CsHP", "C2K8", "CsOP", "EsHS", "CsHS", "C3P", "C50I", "CEs", "CSm", "BF", "EsN", "BN50S", "AsCP", "CPo", "LiPb17", "CsS", "EsIS", "AsCU", "CCsHS", "CsHPU", "AsOS", "AsCI", "EsF", "FV448", "CNS", "CP5", "AsFP", "EsOP", "NS", "NS2", "EsI", "BH", "PPmV", "PSe", "AsN", "OPV5", "NSiW", "CsHHS" ] def __init__(self, embeddings_source=EMBEDDINGS, out_embeddings_source=OUT_EMBEDDINGS, formulas_source=FORMULAS, phraser_source=PHRASER): """ :param embeddings_source: can be instance of a Magnitude object or url or path to a serialized Magnitude object :param out_embeddings_source: can be instance of a Magnitude object or url or path to a serialized Magnitude object :param formulas_source: can be url or path to a JSON-serialized dict of formulae, if not supplied a default file is loaded """ # hidden layer embeddings (W) self.embeddings = Magnitude(embeddings_source, eager=False) # output layer embeddings (O) self.out_embeddings = Magnitude(out_embeddings_source) # load pre-trained formulas from embeddings with open(formulas_source, 'r') as f: self.formulas_with_abbreviations = load(f) self.dp = DataPreparation(local=False) self.es = ElasticConnection() self.formulas = { k: v for k, v in self.formulas_with_abbreviations.items() if k not in self.ABBR_LIST } self.formula_counts = { root_formula: sum(formulas.values()) for root_formula, formulas in self.formulas.items() } self.most_common_forms = { formula_group_name: (formula_group_name if formula_group_name in self.dp.ELEMENTS else max(formulae.items(), key=operator.itemgetter(1))[0]) for formula_group_name, formulae in self.formulas_with_abbreviations.items() } self.phraser = Phraser.load(phraser_source) def make_phrases(self, sentence, reps=2): """ generates phrases from a sentence of words :param sentence: a list of tokens :param reps: how many times to combine the words :return: """ while reps > 0: sentence = self.phraser[sentence] reps -= 1 return sentence def prepare_wordphrase(self, wp, im=False): """ Process a string into words and phrases according to existing embeddings :param wp: the string to process :param im: if True, will ignore missing words, otherwise will generate random vectors :return: a list of processed words and phrases """ processed_wp = self.make_phrases( self.dp.process_sentence(self.dp.text2sent(wp))[0]) if im: processed_wp = [ pwp for pwp in processed_wp if pwp in self.embeddings ] return processed_wp def get_embedding(self, wordphrases, ignore_missing=False, normalized=True): """ Gets the embedding for the given word :param wordphrases: a string or a list of strings to request embedding for :param ignore_missing: if true, will ignore missing words, otherwise will query them using pymagnitude defult out of dictionary handling :param normalized: if False, returns non-normalized embeddings (True by default) :return: an embedding matrix with each row corresponding to a single processed word or phrase taken from wordphrases, as well as the lists of processed wordphrases """ def get_single_embedding(wp, im=ignore_missing, norm=normalized): """ Returns a single embedding vector for the given string :param wp: a string to get a single 
    def get_embedding(self, wordphrases, ignore_missing=False,
                      normalized=True):
        """
        Gets the embedding(s) for the given word(s).
        :param wordphrases: a string or a list of strings to request an
            embedding for
        :param ignore_missing: if True, will ignore missing words,
            otherwise will query them using pymagnitude's default
            out-of-dictionary handling
        :param normalized: if False, returns non-normalized embeddings
            (True by default)
        :return: an embedding matrix with each row corresponding to a
            single processed word or phrase taken from wordphrases, as
            well as the lists of processed wordphrases
        """

        def get_single_embedding(wp, im=ignore_missing, norm=normalized):
            """
            Returns a single embedding vector for the given string.
            :param wp: a string to get a single embedding for
            :param im: boolean; if True, ignore missing words, otherwise
                return random vectors for them
            :param norm: if False, returns the non-normalized embedding
                (True by default)
            :return: a single embedding vector for the string (could be a
                composite embedding)
            """
            processed_wordphrase = self.prepare_wordphrase(wp, im)
            if len(processed_wordphrase) > 0:
                emb = np.mean(self.embeddings.query(
                    processed_wordphrase, normalized=norm), axis=0)
                if norm:
                    emb = emb / np.linalg.norm(emb)
                emb = emb.tolist()
            else:
                emb = [0] * self.embeddings.dim
            return emb, processed_wordphrase

        if not isinstance(wordphrases, list):
            wordphrases = [wordphrases]

        processed_wps = []
        embeddings = []
        try:
            for wordphrase in wordphrases:
                embedding, processed_wp = get_single_embedding(
                    wordphrase, im=ignore_missing)
                processed_wps.append(processed_wp)
                embeddings.append(embedding)
        except Exception as ex:
            warnings.warn(str(ex))
        return embeddings, processed_wps

    def close_words(self, positive, negative=None, top_k=8,
                    exclude_self=True, ignore_missing=True):
        """
        Returns a list of close words.
        :param positive: can be either a string or a list of strings
        :param negative: same as positive, but will be treated with a
            minus sign
        :param top_k: number of close words to return
        :param exclude_self: boolean; if True, the supplied word is
            excluded from the results
        :param ignore_missing: ignore words that are missing from the
            vocabulary
        :return: (words, scores, processed_positive, processed_negative)
        """
        if negative is None:
            negative = []
        elif not isinstance(negative, list):
            negative = [negative]
        processed_negative = []
        for n in negative:
            processed_negative += self.prepare_wordphrase(
                n, im=ignore_missing)

        if not isinstance(positive, list):
            positive = [positive]
        processed_positive = []
        for p in positive:
            processed_positive += self.prepare_wordphrase(
                p, im=ignore_missing)

        most_similar = self.embeddings.most_similar(
            processed_positive, negative=processed_negative, topn=top_k)
        if not exclude_self:
            most_similar = [(processed_positive, 1.0)
                            ] + most_similar[:top_k - 1]
        words, scores = map(list, zip(*most_similar))
        return (words, [float(s) for s in scores],
                processed_positive, processed_negative)

    def find_similar_materials(self, sentence, n_sentence=None, min_count=3,
                               use_output_emb=True, ignore_missing=True):
        """
        Finds the materials that best match the context of the sentence.
        :param sentence: a list of words
        :param n_sentence: a list of words for a negative context
        :param min_count: the minimum number of occurrences for a formula
            to be included
        :param use_output_emb: if True, use the output layer embeddings (O)
            instead of the hidden layer embeddings (W)
        :param ignore_missing: ignore words that are missing from the
            vocabulary
        :return: a list of (formula, similarity) tuples sorted by
            decreasing similarity, plus the processed positive and
            negative sentences
        """
        positive_embeddings, processed_sentence = \
            self.get_embedding(sentence, ignore_missing=ignore_missing)
        n_sentence = n_sentence or []
        negative_embeddings, processed_n_sentence = \
            self.get_embedding(n_sentence, ignore_missing=ignore_missing)

        emb = self.out_embeddings if use_output_emb else self.embeddings

        sum_embedding = np.sum(np.asarray(positive_embeddings), axis=0) - \
            np.sum(np.asarray(negative_embeddings), axis=0)
        sum_embedding = sum_embedding / np.linalg.norm(sum_embedding)

        # formulas common enough to be above the cut-off and that exist
        # in the embedding
        formulas = [
            f for f, count in self.formula_counts.items()
            if (count > min_count) and (f in self.embeddings)
        ]
        similarity_scores = np.dot(
            emb.query(formulas, normalized=True), sum_embedding)
        similarities = {
            f: float(similarity_scores[i]) for i, f in enumerate(formulas)
        }

        return (sorted(similarities.items(),
                       key=lambda x: x[1],
                       reverse=True),
                processed_sentence, processed_n_sentence)
    def most_common_form(self, formulas):
        """
        Returns the most common form of each formula, given a list of
        tuples [("normalized formula", score), ...].
        :param formulas: a list of (normalized formula, score) tuples
        :return: a list of common forms with counts,
            [("common form", score, counts in text), ...]
        """
        common_form_score_count = []
        for formula in formulas:
            if formula[0] in self.dp.ELEMENTS:
                most_common_form = formula[0]
            else:
                most_common_form = max(self.formulas[formula[0]].items(),
                                       key=operator.itemgetter(1))[0]
            common_form_score_count.append(
                (most_common_form, formula[1],
                 sum(self.formulas[formula[0]].values())))
        return common_form_score_count

    def filter_by_elements(self, formulas, plus_elems=None,
                           minus_elems=None, max_matches=50):
        """
        Filters formulas according to the following rule: a formula has to
        contain at least one of the plus_elems (if None, all formulas
        qualify), but it cannot contain any of the minus_elems. Elements
        that appear in both lists are ignored.
        :param formulas: a list of (formula, score) tuples
        :param plus_elems: the formula has to have at least one of these
        :param minus_elems: but cannot have any of these
        :param max_matches: maximum number of formulas to return
        :return: the filtered list of (formula, score) tuples
        """
        plus_elems = plus_elems or []
        minus_elems = minus_elems or []
        plus_elems, minus_elems = (set(plus_elems) - set(minus_elems),
                                   set(minus_elems) - set(plus_elems))

        def has_plus(comp, pe):
            if pe is None or len(pe) == 0:
                return True
            for elem in comp:
                if elem in pe:
                    return True
            return False

        def has_minus(comp, me):
            if me is None or len(me) == 0:
                return False
            for elem in comp:
                if elem in me:
                    return True
            return False

        matched = 0
        matched_formula = []
        for form in formulas:
            composition = self.dp.parser.parse_formula(form[0])
            if (has_plus(composition, plus_elems)
                    and not has_minus(composition, minus_elems)):
                matched_formula.append(form)
                matched += 1
                if matched >= max_matches:
                    return matched_formula
        return matched_formula

    def mentioned_with(self, material, words):
        """
        Returns True if the supplied material was mentioned together with
        any of the words in any of the abstracts. This is a very strict
        text search aimed at high precision: since the method is used for
        discovery, the strict matching may miss some co-mentions, but it
        avoids too many false positives.
        E.g. for material="CuTe" and words=["thermoelectric"], the text
        "CuTe2 is thermoelectric" will return True, since "CuTe" is
        matched with "CuTe2" by the text search. The word search is exact,
        so the keyword "thermo" would not match "thermoelectric".
        :param material: a material formula (does not have to be
            normalized)
        :param words: a list of processed words and phrases (words
            separated by "_") to search the text for co-occurrences
        :return: True if the material is mentioned with any of the words,
            False otherwise
        """
        norm_material = (self.dp.get_norm_formula(material)
                         if self.dp.is_simple_formula(material)
                         else material)

        # the different ways the material is written in the corpus
        variations = (self.formulas[norm_material]
                      if norm_material in self.formulas
                      else [norm_material])
        variations = "(" + " OR ".join(variations) + ")"
        targets = "(" + " OR ".join(words) + ")"
        query = "{} AND {}".format(targets, variations)

        return self.es.count_matches(query) > 0
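

# A minimal usage sketch of the query API above. This is illustrative
# only: it assumes the default EMBEDDINGS, OUT_EMBEDDINGS, FORMULAS and
# PHRASER artifacts are available locally, and that the vocabulary
# contains the example words.

engine = EmbeddingEngine()

# Embed a free-text query; the second return value shows how the input
# was tokenized into known words and phrases.
embs, processed = engine.get_embedding("thermoelectric material",
                                       ignore_missing=True)

# Neighborhood query: words closest to the (phrased) positive context.
words, scores, pos, neg = engine.close_words("thermoelectric", top_k=5)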
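
# A sketch of the discovery workflow the methods above support: rank
# formulas against a context, restrict them by composition, map them to
# their most common written forms, then check for explicit co-mentions.
# The context word and element lists here are hypothetical examples.

ranked, _, _ = engine.find_similar_materials(["thermoelectric"],
                                             min_count=5)
candidates = engine.filter_by_elements(ranked, plus_elems=["Te"],
                                       minus_elems=["Pb"],
                                       max_matches=10)
common_forms = engine.most_common_form(candidates)

# Keep only candidates never explicitly mentioned with the context word,
# i.e. potential not-yet-reported thermoelectrics.
novel = [c for c in common_forms
         if not engine.mentioned_with(c[0], ["thermoelectric"])]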