Example #1
def put_embeddings(self, rSubmission: RedditSubmission):
    vecs = Magnitude('word2vec/light/GoogleNews-vectors-negative300')

    rSubmission.post_title_embedding = np.mean(vecs.query(
        rSubmission.post_title.split()),
                                               axis=0)

    if len(rSubmission.post_text) > 0:
        rSubmission.post_text_embedding = np.mean(vecs.query(
            rSubmission.post_text.split()),
                                                  axis=0)

    return rSubmission
Example #2
def put_embeddings(self, rSubmission: RedditSubmission):
    vecs = Magnitude('word2vec/light/GoogleNews-vectors-negative300')
    # vecs = Magnitude('http://magnitude.plasticity.ai/word2vec/light/GoogleNews-vectors-negative300.magnitude')

    rSubmission.post_title_embedding = np.mean(vecs.query(
        rSubmission.post_title.split()),
                                               axis=0)

    if len(rSubmission.post_text) > 0:
        rSubmission.post_text_embedding = np.mean(vecs.query(
            rSubmission.post_text.split()),
                                                  axis=0)

    logger.info('Embedded submission: %s', rSubmission.post_title)

    return rSubmission
Example #3
def create_vocab_tensors(input_vocab_index):
    """Creates a matrix of the glove embeddings for terms contained in the model for improve runtime
        Also used in ESIM"""
    print('Creating vocabulary tensors...')
    # Define GloVe model from Magnitude package
    model = Magnitude(config.glove_magnitude_path)

    np.random.seed(config.SEED)
    # Randomly initialize matrix
    vocab_tensors = np.random.normal(
        0, 1, (input_vocab_index.n_words, model.dim)).astype('float32')

    vocab_words = list(input_vocab_index.word2index.keys())
    unk_words = []

    # Get vector for each word in vocabulary if in model
    for idx, word in enumerate(vocab_words):
        if word in model:
            vocab_tensors[idx] = model.query(word)
        else:
            unk_words.append(word)

    special_tokens = ['SOS', 'EOS', 'UNK']

    # Override special tokens with random values in [-0.1, 0.1]
    vocab_tensors[:len(special_tokens), :] = np.random.uniform(
        -0.1, 0.1, (len(special_tokens), model.dim)).astype('float32')

    print('Tensor vocabulary complete.')
    print('    Total vocabulary size {}, {} UNK words ({:.2f}%)'.format(
        len(vocab_words), len(unk_words),
        (len(unk_words) / len(vocab_words)) * 100))
    return torch.tensor(vocab_tensors, dtype=torch.float64), unk_words
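
A minimal usage sketch for create_vocab_tensors, assuming a small stand-in vocabulary object that exposes the two attributes the function reads (word2index and n_words); config.glove_magnitude_path and config.SEED are taken from the surrounding project.

class TinyVocabIndex:
    # Hypothetical stand-in for the project's vocabulary index.
    def __init__(self, extra_words):
        tokens = ['SOS', 'EOS', 'UNK'] + extra_words
        self.word2index = {w: i for i, w in enumerate(tokens)}
        self.n_words = len(self.word2index)

vocab = TinyVocabIndex(['cat', 'dog', 'xyzzy'])
vocab_tensors, unk_words = create_vocab_tensors(vocab)
print(vocab_tensors.shape)  # (n_words, model.dim), e.g. torch.Size([6, 300])
print(unk_words)            # words the GloVe .magnitude file does not contain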
Example #4
class GloveEncoder():
    """Encodes an input sentence as a mean or max pooled sentence embedding given the individual word embeddings"""
    def __init__(self, pooling='mean'):
        self.name = 'GloveEncoder'
        self.trainable_model = False
        self.pooling = pooling
        self.model = Magnitude(config.glove_magnitude_path)
        self.hidden_size = self.model.dim

    def sentence_embedding(self, input_text):
        words_in_model = [
            word for word in input_text.split() if word in self.model
        ]
        sentence_embedding = np.zeros((len(words_in_model), self.model.dim))
        sentence_embedding.fill(np.nan)

        for idx, token in enumerate(words_in_model):
            sentence_embedding[idx] = self.model.query(token)

        if self.pooling == 'max':
            sentence_embedding = np.max(sentence_embedding, axis=0)

        else:
            sentence_embedding = np.mean(sentence_embedding, axis=0)

        return torch.tensor(sentence_embedding.reshape(1, 1, -1),
                            device=DEVICE)
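
A minimal usage sketch for GloveEncoder; config.glove_magnitude_path and DEVICE are assumed to be defined elsewhere in the project.

encoder = GloveEncoder(pooling='mean')
sentence_vec = encoder.sentence_embedding('the cat sat on the mat')
print(sentence_vec.shape)  # torch.Size([1, 1, encoder.hidden_size])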
Example #5
class MagnitudeFactory(EmbeddingFactory):
    def __init__(self, embedding_type: EmbeddingType):
        super().__init__(embedding_type)

        cache_dir = Path(fs.get_project_root_dir()) / ".magnitude"
        fs.mkdir_if_not_exists(cache_dir)
        embed_file = self._embedding_type.url[
            self._embedding_type.url.rfind("/") + 1:]
        compressed_file = Path(cache_dir) / embed_file
        if not compressed_file.exists():
            logger.info(
                '  Downloading magnitude file ("{}")...'.format(embed_file))
            wget.download(self._embedding_type.url, compressed_file)

        self._embed_file = compressed_file
        logger.info('  Loading Magnitude module...')
        self._magnitude_vecs = Magnitude(self._embed_file)

    def build(self, vocab_list: List[str],
              **kwargs) -> (List[str], List[str], Dict[str, Dict[str, Any]]):
        oov, iov = [], []

        vec_dict = {}
        for w in vocab_list:
            is_oov = w not in self._magnitude_vecs
            vec = self._magnitude_vecs.query(w)
            vec_dict[w] = {"vec": vec, "trainable": is_oov}

            if is_oov:
                oov.append(w)
            else:
                iov.append(w)

        return oov, iov, vec_dict
Example #6
class MagnitudeFactory(EmbeddingFactory):
    def __init__(self, embedding_type: EmbeddingType):
        super().__init__(embedding_type)

        cache_dir = Path(fs.get_project_root_dir()) / ".magnitude"
        fs.mkdir_if_not_exists(cache_dir)
        embed_file = self._embedding_type.url[
            self._embedding_type.url.rfind("/") + 1:]
        compressed_file = Path(cache_dir) / embed_file
        if not compressed_file.exists():
            logger.info(
                '  Downloading magnitude file ("{}")...'.format(embed_file))
            wget.download(self._embedding_type.url, compressed_file)

        self._embed_file = compressed_file
        logger.info('  Loading Magnitude module...')
        self._magnitude_vecs = Magnitude(self._embed_file)

    def build(self, vocab_list: List[str], h5_file: Path,
              **kwargs) -> (List[str], List[str]):
        oov, iov = [], []
        with h5py.File(h5_file, mode="w") as vec_h5:
            for w in vocab_list:
                is_oov = w not in self._magnitude_vecs
                vec = self._magnitude_vecs.query(w)
                vec_h5.create_dataset("{key}/vec".format(key=w), data=vec)
                vec_h5.create_dataset("{key}/trainable".format(key=w),
                                      data=1 if is_oov else 0)

                if is_oov:
                    oov.append(w)
                else:
                    iov.append(w)

        return oov, iov
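
A sketch of reading the vectors back out of the HDF5 file written by build(); the '<word>/vec' and '<word>/trainable' dataset names follow the layout created above, and vectors.h5 is a placeholder for the h5_file path passed in.

import h5py

with h5py.File('vectors.h5', mode='r') as vec_h5:
    cat_vec = vec_h5['cat/vec'][()]                    # stored Magnitude vector for 'cat'
    cat_trainable = bool(vec_h5['cat/trainable'][()])  # 1 if 'cat' was out of vocabulary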
Example #7
def glove_via_magnitude(topn=500,
                        min_similarity=None,
                        filename='glove.6B.100d.magnitude',
                        lang='en_US'):

    from pymagnitude import Magnitude

    v = Magnitude(os.path.join(TOPDIR, filename))
    training_set = list()
    units = set()
    for unit_list in classifier.ambiguous_units():
        for unit in unit_list[1]:
            units.add(unit)
    for unit in units:
        print('Processing {}...'.format(unit.name))

        name = unit.name
        surfaces = set(unit.name)
        if isinstance(unit, classes.Unit):
            surfaces.update(unit.surfaces)
            surfaces.update(unit.symbols)
        for surface in surfaces:
            neighbours = v.most_similar(
                v.query(surface), topn=topn, min_similarity=min_similarity)
            training_set.append({
                'unit': name,
                'text': ' '.join(neighbour[0] for neighbour in neighbours)
            })
    print('Done')

    with language.topdir(lang).joinpath('train/similars.json').open(
            'w', encoding='utf-8') as file:
        json.dump(training_set, file, sort_keys=True, indent=4)
Example #8
def get_word_vector(word):
    global model
    if model is None:
        # import fasttext
        # if os.environ.get('LANGUAGE', 'en').lower() == 'en':
        #     print('Loading English word vectors')
        #     model = fasttext.load_model('data/cc.en.300.bin')
        # else:
        #     print('Loading Vietnamese word vectors')
        #     model = fasttext.load_model('data/cc.vi.300.bin')

        # return model.get_word_vector(word.replace(' ', '_'))

        from pymagnitude import Magnitude

        if os.environ.get('LANGUAGE', 'en').lower() == 'en':
            print('Loading English word vectors')
            model = Magnitude('data/cc.en.300.magnitude',
                              language='en',
                              lazy_loading=20000)
        else:
            print('Loading Vietnamese word vectors')
            model = Magnitude('data/cc.vi.300.magnitude',
                              language='vi',
                              lazy_loading=20000)

        print('Loading completed')

    return model.query(word)
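
A minimal usage sketch, assuming the module initializes the global model to None and that the data/cc.en.300.magnitude file from the snippet exists; the LANGUAGE environment variable selects the English or Vietnamese vectors.

import os

os.environ['LANGUAGE'] = 'en'
vec = get_word_vector('hello')
print(vec.shape)  # (300,) for the cc.en.300 vectors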
Example #9
class PyMagnitudeField(Field):
    def __init__(self,
                 magnitude_vector_filepath,
                 sequential=True,
                 lower=True,
                 tokenize=(lambda s: s.split()),
                 include_lengths=True,
                 batch_first=True,
                 **kwargs):

        if kwargs.get('use_vocab'):
            kwargs['use_vocab'] = False
        self.vectors = Magnitude(magnitude_vector_filepath)
        super(PyMagnitudeField, self).__init__(sequential=sequential,
                                               lower=lower,
                                               tokenize=tokenize,
                                               include_lengths=include_lengths,
                                               batch_first=batch_first,
                                               **kwargs)

    def build_vocab(self, *args, **kwargs):
        pass

    def process(self, batch, device, train):
        if self.include_lengths:
            batch = (batch, [len(x) for x in batch])
        return self.numericalize(batch, device=device, train=train)

    def numericalize(self, arr, device=torch.device('cpu'), train=True):

        if self.include_lengths and not isinstance(arr, tuple):
            raise ValueError('Field has include_lengths set to True, but '
                             'input data is not a tuple of '
                             '(data batch, batch lengths).')

        if isinstance(arr, tuple):
            arr, lengths = arr
            lengths = torch.LongTensor(lengths)

        arr = torch.from_numpy(self.vectors.query(arr))
        if self.sequential and not self.batch_first:
            arr.t_()

        if device.type == 'cpu':
            if self.sequential:
                arr = arr.contiguous()
        else:
            arr = arr.cuda(device)
            if self.include_lengths:
                lengths = lengths.cuda(device)

        arr.requires_grad = False
        if self.include_lengths:
            return arr, lengths
        return arr
Example #10
def build_vectors_file(vec_model: Magnitude,
                       words: List[str],
                       out_file="ENT.vec"):
    print(f"Building {out_file}")
    with open(out_file, "w") as out:
        out.write(f"{len(words)} {vec_model.dim}\n")
        for e in words:
            v = vec_model.query(e)
            str_vec = " ".join(map(str, v))  # ndarray to str
            line = f"{e} {str_vec}\n"
            out.write(line)
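
A minimal usage sketch; the .magnitude path is a placeholder. The output file starts with a '<count> <dim>' header line, followed by one word and its space-separated vector per line.

vec_model = Magnitude('glove.6B.100d.magnitude')
build_vectors_file(vec_model, ['paris', 'london', 'tokyo'], out_file='ENT.vec')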
Example #11
def test_embedtext_creation():
    extractor_cfg = {
        "_name": "embedtext",
        "index": "anserini",
        "tokenizer": "anserini",
        "embeddings": "glove6b",
        "zerounk": True,
        "calcidf": True,
        "maxqlen": MAXQLEN,
        "maxdoclen": MAXDOCLEN,
    }
    extractor = EmbedText(extractor_cfg)

    benchmark = DummyBenchmark({"_fold": "s1", "rundocsonly": False})
    collection = DummyCollection({"_name": "dummy"})

    index_cfg = {"_name": "anserini", "indexstops": False, "stemmer": "porter"}
    index = AnseriniIndex(index_cfg)
    index.modules["collection"] = collection

    tok_cfg = {"_name": "anserini", "keepstops": True, "stemmer": "none"}
    tokenizer = AnseriniTokenizer(tok_cfg)

    extractor.modules["index"] = index
    extractor.modules["tokenizer"] = tokenizer

    qids = list(benchmark.qrels.keys())  # ["301"]
    qid = qids[0]
    docids = list(benchmark.qrels[qid].keys())

    extractor.create(qids, docids, benchmark.topics[benchmark.query_type])

    expected_vocabs = [
        "lessdummy", "dummy", "doc", "hello", "greetings", "world", "from",
        "outer", "space", "<pad>"
    ]
    expected_stoi = {s: i for i, s in enumerate(expected_vocabs)}

    assert set(extractor.stoi.keys()) == set(expected_stoi.keys())

    emb_path = "glove/light/glove.6B.300d"
    fullemb = Magnitude(MagnitudeUtils.download_model(emb_path))
    assert extractor.embeddings.shape == (len(expected_vocabs), fullemb.dim)

    for i in range(extractor.embeddings.shape[0]):
        if i == extractor.pad:
            assert extractor.embeddings[i].sum() < 1e-5
            continue
        s = extractor.itos[i]
        assert (extractor.embeddings[i] - fullemb.query(s)).sum() < 1e-5
    return extractor
Example #12
def read_magnitude_vectors(magnitude_filepath,
                           vocab_filepath,
                           vocab_size,
                           dim,
                           special_tokens=[UNK]):
    """Read word vectors from *.magnitude

    Args:
        magnitude_filepath (str): magnitude file path
        vocab_filepath (str): vocabulary file path
        vocab_size (int): Maximum vocab size (including special tokens)
        dim (int): Dimension of the word vectors to load
        special_tokens (list[str])

    Return:
        words (list[str]): list of length vocab_size
        embeddings (np.array): (vocab_size, dim)
    """
    logging.info('Loading word vectors from %s', magnitude_filepath)
    words = [x for x in special_tokens]
    word_set = set()
    with open(vocab_filepath, 'r', encoding='utf8') as fin:
        for line in fin:
            word = line.strip().split("\t")[0]
            if word in word_set:
                logging.warning(
                    "token must be unique. non-unique token='{}'".format(word))
            elif len(word) > 0:
                word_set.add(word)
                words.append(word)
                if len(words) == vocab_size:
                    break
    magnitude = Magnitude(magnitude_filepath,
                          case_insensitive=True,
                          normalized=True)
    vectors = magnitude.query(words[len(special_tokens):])
    # special vectors for UNK
    special_vectors = np.random.normal(size=(len(special_tokens), dim))
    special_vectors /= np.linalg.norm(special_vectors,
                                      ord=2,
                                      axis=1,
                                      keepdims=True)
    # Concatenate
    vectors = np.vstack([special_vectors, vectors]).astype('float32')
    assert vectors.shape[0] == len(words)
    assert vectors.shape[1] == dim
    logging.info('Loaded %d word vectors; shape = %s', len(words),
                 str(vectors.shape))
    return words, vectors
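
A minimal usage sketch; the file paths are placeholders, and '<unk>' stands in for whatever unknown-word token the project defines as UNK.

words, embeddings = read_magnitude_vectors(
    magnitude_filepath='glove.6B.100d.magnitude',
    vocab_filepath='vocab.tsv',
    vocab_size=50000,
    dim=100,
    special_tokens=['<unk>'])
assert embeddings.shape == (len(words), 100)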
Example #13
    def _build_matrix(self, tokenizer):
        vector = Magnitude('vectors/glove.6B.50d.magnitude')
        GLOVE_VECTOR_DIMENSION = 50
        MAX_NUM_WORDS = 300
        word_index = tokenizer.word_index
        num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
        embedding_matrix = np.zeros((num_words, GLOVE_VECTOR_DIMENSION))
        for word, i in tqdm(word_index.items()):
            if i > MAX_NUM_WORDS:
                continue
            # Magnitude returns a vector even for out-of-vocabulary words,
            # so every kept row of the matrix gets filled.
            embedding_matrix[i] = vector.query(word)
        return embedding_matrix
Example #14
class WordVectorFeature(FeatureExtractor):
    def __init__(self, vectors_path: str, scaling: float = 1.0) -> None:
        self.vectors = Magnitude(vectors_path, normalized=False)
        self.scaling = scaling

    def extract(
        self,
        token: str,
        current_idx: int,
        relative_idx: int,
        tokens: Sequence[str],
        features: Dict[str, float],
    ) -> None:
        vector = self.vectors.query(token)
        keys = ["v" + str(i) for i in range(len(vector))]
        values = vector * self.scaling
        features.update(zip(keys, values))
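
A minimal usage sketch; the .magnitude path is a placeholder. After extract(), features holds one 'v<i>' entry per embedding dimension, scaled by the given factor.

feature_extractor = WordVectorFeature('glove.6B.50d.magnitude', scaling=0.5)
features = {}
feature_extractor.extract('cat', current_idx=0, relative_idx=0,
                          tokens=['cat', 'sat'], features=features)
print(len(features))  # equals the Magnitude file's dimensionality, e.g. 50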
Example #15
def avg_feature_vector(sentence, num_features=300):
    """
    Generates a Word2Vec embedding for a text by averaging its word vectors.

    :param sentence: text to generate embeddings for.
    :param num_features: feature vector length
    :return: embeddings feature vector
    """
    vectors = Magnitude('models/GoogleNews-vectors-negative300.magnitude')
    words = sentence.split()
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        # Accumulate word vectors, then divide by the word count below.
        feature_vec = np.add(feature_vec, vectors.query(word))
        n_words += 1
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec
Example #16
class WordVectorFeature(FeatureExtractor):
    def __init__(self, vectors_path: str, scaling: float = 1.0) -> None:
        self.scaling = scaling
        self.vectors = Magnitude(vectors_path, normalized=False)
        self.keys = ['v'+repr(i) for i in range(self.vectors.dim)]

    def extract(
        self,
        token: str,
        current_idx: int,
        relative_idx: int,
        tokens: Sequence[str],
        features: Dict[str, float],
        idx: int,
    ) -> None:
        if relative_idx == 0:
            v = self.scaling * self.vectors.query(token)
            features.update(zip(self.keys,v))
Example #17
    def get_pretrained_word_embed(self, vocab, magnitude='GoogleNews-vectors-negative300.magnitude', verbose=1):
        """
        loads word embeddings from magnitude into pytorch

        Return:
        -----
            pretrained_word_embed: torch.Tensor of size (Vocab x word embedding size)
            unknown_words: list of unknown words that are not in the pretrained embeddings
        """
        magnitude = Magnitude(magnitude)
        
        pretrained_word_embed = [torch.from_numpy(magnitude.query(word)).type(torch.cuda.FloatTensor).view(1,-1)
                                for word in vocab]
        pretrained_word_embed = torch.cat(pretrained_word_embed, dim=0)
        
        self.unknown_words = [word for word in vocab if word not in magnitude]
        n_unknown_words = len(self.unknown_words)
        if verbose: print(f"Unknown words: {n_unknown_words} / {len(vocab)}")
        
        return pretrained_word_embed
Example #18
class WordVectorFeature(FeatureExtractor):
    def __init__(self, vectors_path: str, scaling: float = 1.0) -> None:

        self.mag = Magnitude(vectors_path, normalized=False)
        self.scaling = scaling

    def extract(
        self,
        token: str,
        current_idx: int,
        relative_idx: int,
        tokens: Sequence[str],
        features: Dict[str, float],
    ) -> None:
        # Only emit word-vector features for the current token.
        if relative_idx == 0:
            # Scale the queried vector before adding it to the feature dict.
            values = self.mag.query(token) * self.scaling
            keys = ["v" + str(i) for i in range(len(values))]
            features.update(zip(keys, values))
Example #19
class WordVectorFeature(FeatureExtractor):
    def __init__(self, vectors_path: str, scaling: float = 1.0) -> None:
        self.magnitude_word_vector = Magnitude(vectors_path, normalized=False)
        self.scale = scaling

    def extract(
        self,
        token: str,
        current_idx: int,
        relative_idx: int,
        tokens: Sequence[str],
        features: Dict[str, float],
    ) -> None:
        if relative_idx == 0:
            word_vector = self.magnitude_word_vector.query(token)
            scaled_word_vector = word_vector * self.scale
            keys = []
            for i in range(len(scaled_word_vector)):
                keys.append("v" + str(i))
            features.update(zip(keys, scaled_word_vector))
        return features
Example #20
def preprocess(file_name, m, word2vec_magnitude_file):
    raw_data = open(file_name)
    X_pickle_file = open(file_name[:-4] + "_X.p", 'wb')
    y_pickle_file = open(file_name[:-4] + "_y.p", 'wb')
    comments = []
    y = []

    for line in raw_data:
        y.append(int(line.split('\t')[0]))
        line = line.split('\t')[1]
        line = re.sub(r'\d+', '', line)
        tokens = word_tokenize(line)
        comments.append(tokens)

    #max_len = len(max(comments,key=len)) #denote n
    max_len = 100
    # matrix m x n x d, m = sample, n = max_len of sentence, d = word vector size
    # TODO: If time allows, train on our data. For now, we just use Google's pretrained model
    vector_size = 300
    #model = gensim.models.Word2Vec(comments, size=vector_size, window=8, min_count=1, workers=10)
    #model.train(comments, total_examples=len(comments), epochs=10)
    #TODO: Use FastText to handle unseen words, also Magnitude!
    model = Magnitude(word2vec_magnitude_file)
    X = zeros([m, max_len, vector_size])
    #start = time.time()
    for comment_idx, sentence in enumerate(comments):
        for sentence_idx, word in enumerate(sentence):
            if (sentence_idx == max_len - 1): break
            X[comment_idx][sentence_idx] = model.query(word)
        #if(comment_idx % 250 == 0):
        #       print(comment_idx)
        #      print(time_since(start))
    pickle.dump(X, X_pickle_file)
    pickle.dump(y, y_pickle_file)
    X_pickle_file.close()
    y_pickle_file.close()
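
A minimal usage sketch; the paths are placeholders. The input file is expected to contain one '<label>\t<text>' line per comment, and the function pickles the (m, 100, 300) word-vector array and the label list alongside it.

preprocess('data/comments.tsv', m=1000,
           word2vec_magnitude_file='GoogleNews-vectors-negative300.magnitude')
# writes data/comments_X.p and data/comments_y.p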
Example #21
class MagnitudeTest(unittest.TestCase):
    MAGNITUDE_PATH = ""
    MAGNITUDE_SUBWORD_PATH = ""
    MAGNITUDE_APPROX_PATH = ""

    def setUp(self):
        self.vectors = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                 case_insensitive=True,
                                 eager=True)
        self.vectors_cs = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                    case_insensitive=False,
                                    eager=False)
        self.vectors_sw = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                    case_insensitive=True,
                                    eager=False)
        self.vectors_approx = Magnitude(MagnitudeTest.MAGNITUDE_APPROX_PATH,
                                        case_insensitive=True,
                                        eager=False)
        self.tmp_vectors = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                     case_insensitive=True,
                                     eager=False)
        self.concat_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                  case_insensitive=True,
                                  eager=False)
        self.concat_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                  case_insensitive=True,
                                  eager=False)
        self.concat = Magnitude(self.concat_1, self.concat_2)
        self.vectors_feat = FeaturizerMagnitude(100, case_insensitive=True)
        self.v = {
            'padding': self.tmp_vectors._padding_vector(),
            'I': self.tmp_vectors.query("I"),
            'saw': self.tmp_vectors.query("saw"),
            'a': self.tmp_vectors.query("a"),
            'cat': self.tmp_vectors.query("cat"),
            'He': self.tmp_vectors.query("He"),
            'went': self.tmp_vectors.query("went"),
            'to': self.tmp_vectors.query("to"),
            'the': self.tmp_vectors.query("the"),
            'mall': self.tmp_vectors.query("mall"),
            'blah123': self.tmp_vectors.query("blah123")
        }

    def tearDown(self):
        self.vectors.close()
        self.vectors_cs.close()
        self.vectors_sw.close()
        self.tmp_vectors.close()
        self.concat_1.close()
        self.concat_2.close()
        del self.concat
        self.vectors_feat.close()
        gc.collect()

    def test_length(self):
        self.assertEqual(len(self.vectors), 3000000)

    def test_dim(self):
        self.assertEqual(self.vectors.dim, 300)

    def test_index(self):
        self.assertTrue(isinstance(self.vectors[0][0], unicode))
        self.assertTrue(isinstance(self.vectors[0][1], np.ndarray))
        self.assertTrue(isinstance(self.vectors.index(0)[0], unicode))
        self.assertTrue(isinstance(self.vectors.index(0)[1], np.ndarray))
        self.assertTrue(
            isinstance(self.vectors.index(0, return_vector=False), unicode))

    def test_slice(self):
        sliced = self.vectors[0:5]
        self.assertEqual(len(sliced), 5)
        self.assertEqual(sliced[0][0], self.vectors[0][0])
        self.assertTrue(isclose(sliced[0][1], self.vectors[0][1]).all())

    def test_case_insensitive(self):
        some_keys_are_not_lower = False
        for i, (k, _) in enumerate(self.vectors):
            if i > 1000:
                break
            some_keys_are_not_lower = (some_keys_are_not_lower
                                       or k.lower() != k)
        self.assertTrue(some_keys_are_not_lower)
        self.assertTrue("QuEEn" in self.vectors)
        self.assertTrue("QUEEN" in self.vectors)
        self.assertTrue("queen" in self.vectors)
        self.assertTrue(
            isclose(self.vectors.query("Queen"),
                    self.vectors.query("QuEEn")).all())
        self.assertEqual(
            self.vectors.most_similar("I", return_similarities=False)[0],
            'myself')
        self.assertEqual(
            self.vectors.most_similar("i", return_similarities=False)[0],
            'ive')
        self.assertTrue(self.vectors.similarity("a", "A") > .9)

    def test_case_sensitive(self):
        some_keys_are_not_lower = False
        for i, (k, _) in enumerate(self.vectors_cs):
            if i > 1000:
                break
            some_keys_are_not_lower = (some_keys_are_not_lower
                                       or k.lower() != k)
        self.assertTrue(some_keys_are_not_lower)
        self.assertTrue("QuEEn" not in self.vectors_cs)
        self.assertTrue("QUEEN" in self.vectors_cs)
        self.assertTrue("queen" in self.vectors_cs)
        self.assertTrue(not isclose(self.vectors_cs.query("Queen"),
                                    self.vectors_cs.query("QuEEn")).all())
        self.assertEqual(
            self.vectors_cs.most_similar("I", return_similarities=False)[0],
            'myself')
        self.assertEqual(
            self.vectors_cs.most_similar("i", return_similarities=False)[0],
            'ive')
        self.assertTrue(self.vectors_cs.similarity("a", "A") > .9)

    def test_iter_case_insensitive(self):
        for _ in range(2):
            for i, (k, v) in enumerate(self.vectors):
                if i > 1000:
                    break
                k2, v2 = self.vectors[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_iter_case_sensitive(self):
        for _ in range(2):
            for i, (k, v) in enumerate(self.vectors_cs):
                if i > 1000:
                    break
                k2, v2 = self.vectors_cs[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_index_case_insensitive(self):
        for _ in range(2):
            viter = iter(self.vectors)
            for i in range(len(self.vectors)):
                if i > 1000:
                    break
                k, v = next(viter)
                k2, v2 = self.vectors[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_index_case_sensitive(self):
        for _ in range(2):
            viter = iter(self.vectors_cs)
            for i in range(len(self.vectors_cs)):
                if i > 1000:
                    break
                k, v = next(viter)
                k2, v2 = self.vectors_cs[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_bounds(self):
        length = len(self.vectors)
        self.assertTrue(isinstance(self.vectors[length - 1][0], unicode))
        self.assertTrue(isinstance(self.vectors[length - 1][1], np.ndarray))

    @unittest.expectedFailure
    def test_out_of_bounds(self):
        length = len(self.vectors)
        self.assertTrue(isinstance(self.vectors[length][0], unicode))
        self.assertTrue(isinstance(self.vectors[length][1], np.ndarray))

    def test_contains(self):
        self.assertTrue("cat" in self.vectors)

    def test_contains_false(self):
        self.assertTrue("blah123" not in self.vectors)

    def test_special_characters(self):
        self.assertTrue("Wilkes-Barre/Scranton" in self.vectors)
        self.assertTrue("out-of-vocabulary" not in self.vectors)
        self.assertTrue('quotation"s' not in self.vectors)
        self.assertTrue("quotation's" not in self.vectors)
        self.assertTrue("colon;s" not in self.vectors)
        self.assertTrue("sh**" not in self.vectors)
        self.assertTrue("'s" not in self.vectors_cs)
        self.assertTrue('"s' not in self.vectors)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("Wilkes-Barre/Scranton").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("out-of-vocabulary").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query('quotation"s').shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("quotation's").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("colon;s").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("sh**").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors_cs.query("'s").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query('"s').shape)

    def test_oov_dim(self):
        self.assertEqual(
            self.vectors.query("*<<<<").shape,
            self.vectors.query("cat").shape)

    def test_oov_subword_dim(self):
        self.assertEqual(
            self.vectors_sw.query("*<<<<").shape,
            self.vectors_sw.query("cat").shape)

    def test_oov_dim_placeholders(self):
        self.vectors_placeholders = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                              placeholders=5,
                                              case_insensitive=True,
                                              eager=False)
        self.assertEqual(
            self.vectors_placeholders.query("*<<<<").shape,
            self.vectors_placeholders.query("cat").shape)
        self.assertTrue(
            isclose(
                self.vectors.query("*<<<<")[0],
                self.vectors_placeholders.query("*<<<<")[0]))
        self.vectors_placeholders.close()

    def test_oov_subword_dim_placeholders(self):
        self.vectors_placeholders = Magnitude(
            MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
            placeholders=5,
            case_insensitive=True,
            eager=False)
        self.assertEqual(
            self.vectors_placeholders.query("*<<<<").shape,
            self.vectors_placeholders.query("cat").shape)
        self.assertTrue(
            isclose(
                self.vectors.query("*<<<<")[0],
                self.vectors_placeholders.query("*<<<<")[0]))
        self.vectors_placeholders.close()

    def test_oov_unit_norm(self):
        self.assertTrue(
            isclose(np.linalg.norm(self.vectors.query("*<<<<<")), 1.0))

    def test_oov_subword_unit_norm(self):
        self.assertTrue(
            isclose(np.linalg.norm(self.vectors_sw.query("*<<<<<")), 1.0))

    def test_ngram_oov_closeness(self):
        self.assertTrue(self.vectors.similarity("uberx", "uberxl") > .7)
        self.assertTrue(self.vectors.similarity("uberx", "veryrandom") < .7)
        self.assertTrue(
            self.vectors.similarity("veryrandom", "veryrandom") > .7)

    def test_ngram_oov_subword_closeness(self):
        self.assertTrue(self.vectors_sw.similarity("uberx", "uberxl") > .7)
        self.assertTrue(self.vectors_sw.similarity("uberx", "uber") > .7)
        self.assertTrue(self.vectors_sw.similarity("uberxl", "uber") > .7)
        self.assertTrue(
            self.vectors_sw.similarity("discriminatoryy", "discriminatory") >
            .7)
        self.assertTrue(
            self.vectors_sw.similarity("discriminatoryy", "discriminnatory") >
            .8)
        self.assertTrue(self.vectors_sw.similarity("uberx", "veryrandom") < .7)
        self.assertTrue(
            self.vectors_sw.similarity("veryrandom", "veryrandom") > .7)
        self.assertTrue(self.vectors_sw.similarity("hiiiiiiiii", "hi") > .7)
        self.assertTrue(self.vectors_sw.similarity("heeeeeeeey", "hey") > .7)
        self.assertTrue(self.vectors_sw.similarity("heyyyyyyyyyy", "hey") > .7)
        self.assertTrue(self.vectors_sw.similarity("faaaaaate", "fate") > .65)

    def test_oov_values(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)

        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<")[0], -0.0372075283555))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<<")[0], -0.0201727917272))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<<<")[0], -0.0475993225776))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<<<<")[0], 0.0129938352266))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<")[0], -0.0372075283555))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<<")[0], -0.0201727917272))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<<<")[0], -0.0475993225776))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<<<<")[0], 0.0129938352266))

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_oov_subword_values(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)

        self.assertTrue(
            isclose(
                self.vectors_oov_1.query("discriminatoryy")[0],
                -0.0573252095591))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("uberx")[0], 0.0952671681336))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("misssipi")[0], 0.0577835297955))
        self.assertTrue(
            isclose(
                self.vectors_oov_2.query("discriminatoryy")[0],
                -0.0573252095591))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("uberx")[0], 0.0952671681336))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("misssipi")[0], 0.0577835297955))

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_oov_stability(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)

        for i in range(5):
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<"),
                        self.vectors_oov_2.query("*<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<"),
                        self.vectors_oov_2.query("*<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<"),
                        self.vectors_oov_2.query("*<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<"),
                        self.vectors_oov_2.query("*<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<"),
                        self.vectors_oov_2.query("*<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<<")).all())

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_ngram_oov_stability(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=True,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=True,
                                       eager=False)

        for i in range(5):
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<"),
                        self.vectors_oov_2.query("*<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<"),
                        self.vectors_oov_2.query("*<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<"),
                        self.vectors_oov_2.query("*<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<"),
                        self.vectors_oov_2.query("*<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<"),
                        self.vectors_oov_2.query("*<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<<")).all())

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_ngram_oov_subword_stability(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       eager=False)

        for i in range(5):
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<"),
                        self.vectors_oov_2.query("*<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<"),
                        self.vectors_oov_2.query("*<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<"),
                        self.vectors_oov_2.query("*<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<"),
                        self.vectors_oov_2.query("*<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<"),
                        self.vectors_oov_2.query("*<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<<")).all())

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_placeholders(self):
        self.vectors_placeholders = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                              case_insensitive=True,
                                              placeholders=5,
                                              eager=False)
        self.assertEqual(self.vectors_placeholders.query("cat").shape, (305, ))
        self.assertEqual(
            self.vectors_placeholders.query("cat")[0],
            self.vectors.query("cat")[0])
        self.vectors_placeholders.close()

    def test_numpy(self):
        self.assertTrue(isinstance(self.vectors.query("cat"), np.ndarray))

    def test_list(self):
        self.vectors_list = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                      case_insensitive=True,
                                      use_numpy=False,
                                      eager=False)
        self.assertTrue(isinstance(self.vectors_list.query("cat"), list))
        self.vectors_list.close()

    def test_repeated_single(self):
        q = "cat"
        result = self.vectors.query(q)
        result_2 = self.vectors.query(q)
        self.assertTrue(isclose(result, result_2).all())

    def test_repeated_multiple(self):
        q = ["I", "saw", "a", "cat"]
        result = self.vectors.query(q)
        result_2 = self.vectors.query(q)
        self.assertTrue(isclose(result, result_2).all())
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q)
        result_2 = self.vectors.query(q)
        self.assertTrue(isclose(result, result_2).all())

    def test_multiple(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q)
        self.assertEqual(result.shape, (2, 5, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[0][3], self.v['cat']).all())
        self.assertTrue(isclose(result[0][4], self.v['padding']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        self.assertTrue(isclose(result[1][3], self.v['the']).all())
        self.assertTrue(isclose(result[1][4], self.v['mall']).all())
        return result

    def test_pad_to_length_right_truncate_none(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=6)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[0][3], self.v['cat']).all())
        self.assertTrue(isclose(result[0][4], self.v['padding']).all())
        self.assertTrue(isclose(result[0][5], self.v['padding']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        self.assertTrue(isclose(result[1][3], self.v['the']).all())
        self.assertTrue(isclose(result[1][4], self.v['mall']).all())
        self.assertTrue(isclose(result[1][5], self.v['padding']).all())
        return result

    def test_pad_to_length_truncate_none(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=6)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[0][3], self.v['cat']).all())
        self.assertTrue(isclose(result[0][4], self.v['padding']).all())
        self.assertTrue(isclose(result[0][5], self.v['padding']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        self.assertTrue(isclose(result[1][3], self.v['the']).all())
        self.assertTrue(isclose(result[1][4], self.v['mall']).all())
        self.assertTrue(isclose(result[1][5], self.v['padding']).all())
        return result

    def test_pad_to_length_left_truncate_none(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=6, pad_left=True)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['padding']).all())
        self.assertTrue(isclose(result[0][1], self.v['padding']).all())
        self.assertTrue(isclose(result[0][2], self.v['I']).all())
        self.assertTrue(isclose(result[0][3], self.v['saw']).all())
        self.assertTrue(isclose(result[0][4], self.v['a']).all())
        self.assertTrue(isclose(result[0][5], self.v['cat']).all())
        self.assertTrue(isclose(result[1][0], self.v['padding']).all())
        self.assertTrue(isclose(result[1][1], self.v['He']).all())
        self.assertTrue(isclose(result[1][2], self.v['went']).all())
        self.assertTrue(isclose(result[1][3], self.v['to']).all())
        self.assertTrue(isclose(result[1][4], self.v['the']).all())
        self.assertTrue(isclose(result[1][5], self.v['mall']).all())
        return result

    def test_pad_to_length_truncate_right(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=3)
        self.assertEqual(result.shape, (2, 3, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        return result

    def test_pad_to_length_truncate_left(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=3, truncate_left=True)
        self.assertEqual(result.shape, (2, 3, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['saw']).all())
        self.assertTrue(isclose(result[0][1], self.v['a']).all())
        self.assertTrue(isclose(result[0][2], self.v['cat']).all())
        self.assertTrue(isclose(result[1][0], self.v['to']).all())
        self.assertTrue(isclose(result[1][1], self.v['the']).all())
        self.assertTrue(isclose(result[1][2], self.v['mall']).all())
        return result

    def test_list_multiple(self):
        self.vectors_list = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                      case_insensitive=True,
                                      use_numpy=False,
                                      eager=False)
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        self.assertTrue(isinstance(self.vectors_list.query(q[0]), list))
        self.assertTrue(
            isclose(self.vectors.query(q[0]),
                    asarray(self.vectors_list.query(q[0]))).all())
        self.assertTrue(isinstance(self.vectors_list.query(q), list))
        self.assertTrue(
            isclose(self.vectors.query(q),
                    asarray(self.vectors_list.query(q))).all())
        self.vectors_list.close()

    def test_concat(self):
        q = "cat"
        result = self.concat.query(q)
        self.assertEqual(result.shape, (self.vectors.dim * 2, ))
        self.assertTrue(isclose(result[0:300], self.v['cat']).all())
        self.assertTrue(isclose(result[300:600], self.v['cat']).all())

    def test_concat_multiple(self):
        q = ["I", "saw"]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][300:600], self.v['I']).all())
        self.assertTrue(isclose(result[1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[1][300:600], self.v['saw']).all())

    def test_concat_multiple_2(self):
        q = [["I", "saw"], ["He", "went"]]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][0][300:600], self.v['I']).all())
        self.assertTrue(isclose(result[0][1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[0][1][300:600], self.v['saw']).all())
        self.assertTrue(isclose(result[1][0][0:300], self.v['He']).all())
        self.assertTrue(isclose(result[1][0][300:600], self.v['He']).all())
        self.assertTrue(isclose(result[1][1][0:300], self.v['went']).all())
        self.assertTrue(isclose(result[1][1][300:600], self.v['went']).all())

    def test_concat_specific(self):
        q = ("cat", "mall")
        result = self.concat.query(q)
        self.assertEqual(result.shape, (self.vectors.dim * 2, ))
        self.assertTrue(isclose(result[0:300], self.v['cat']).all())
        self.assertTrue(isclose(result[300:600], self.v['mall']).all())

    def test_concat_multiple_specific(self):
        q = [("I", "He"), ("saw", "went")]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][300:600], self.v['He']).all())
        self.assertTrue(isclose(result[1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[1][300:600], self.v['went']).all())

    def test_concat_multiple_2_specific(self):
        q = [[("I", "He"), ("saw", "went")], [("He", "I"), ("went", "saw")]]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][0][300:600], self.v['He']).all())
        self.assertTrue(isclose(result[0][1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[0][1][300:600], self.v['went']).all())
        self.assertTrue(isclose(result[1][0][0:300], self.v['He']).all())
        self.assertTrue(isclose(result[1][0][300:600], self.v['I']).all())
        self.assertTrue(isclose(result[1][1][0:300], self.v['went']).all())
        self.assertTrue(isclose(result[1][1][300:600], self.v['saw']).all())

    def test_distance(self):
        self.assertTrue(
            isclose(self.vectors.distance("cat", "dog"), 0.69145405))

    def test_distance_multiple(self):
        self.assertTrue(
            isclose(self.vectors.distance("cat", ["cats", "dog"]),
                    [0.61654216, 0.69145405]).all())

    def test_similarity(self):
        self.assertTrue(
            isclose(self.vectors.similarity("cat", "dog"), 0.7609457089782209))

    def test_similarity_multiple(self):
        self.assertTrue(
            isclose(self.vectors.similarity("cat", ["cats", "dog"]),
                    [0.8099378824686305, 0.7609457089782209]).all())

    def test_most_similar_to_given(self):
        self.assertEqual(
            self.vectors.most_similar_to_given(
                "cat", ["dog", "television", "laptop"]), "dog")
        self.assertEqual(
            self.vectors.most_similar_to_given(
                "cat", ["television", "dog", "laptop"]), "dog")
        self.assertEqual(
            self.vectors.most_similar_to_given(
                "cat", ["television", "laptop", "dog"]), "dog")

    def test_doesnt_match(self):
        self.assertEqual(
            self.vectors.doesnt_match(
                ["breakfast", "cereal", "lunch", "dinner"]), "cereal")
        self.assertEqual(
            self.vectors.doesnt_match(
                ["breakfast", "lunch", "cereal", "dinner"]), "cereal")
        self.assertEqual(
            self.vectors.doesnt_match(
                ["breakfast", "lunch", "dinner", "cereal"]), "cereal")

    def test_most_similar_case_insensitive(self):
        keys = [s[0] for s in self.vectors.most_similar("queen", topn=5)]
        similarities = [
            s[1] for s in self.vectors.most_similar("queen", topn=5)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar(self):
        keys = [s[0] for s in self.vectors_cs.most_similar("queen")]
        similarities = [s[1] for s in self.vectors_cs.most_similar("queen")]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587, 0.6163408160209656,
                        0.6060680150985718, 0.5923796892166138,
                        0.5908075571060181, 0.5637184381484985
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens',
            u'princess',
            u'king',
            u'monarch',
            u'very_pampered_McElhatton',
            u'Queen',
            u'NYC_anglophiles_aflutter',
            u'Queen_Consort',
            u'princesses',
            u'royal',
        ])

    def test_most_similar_no_similarities(self):
        keys = self.vectors_cs.most_similar("queen", return_similarities=False)
        self.assertEqual(keys, [
            u'queens',
            u'princess',
            u'king',
            u'monarch',
            u'very_pampered_McElhatton',
            u'Queen',
            u'NYC_anglophiles_aflutter',
            u'Queen_Consort',
            u'princesses',
            u'royal',
        ])

    def test_most_similar_top_5(self):
        keys = [s[0] for s in self.vectors_cs.most_similar("queen", topn=5)]
        similarities = [
            s[1] for s in self.vectors_cs.most_similar("queen", topn=5)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar_min_similarity(self):
        keys = [
            s[0]
            for s in self.vectors_cs.most_similar("queen", min_similarity=.63)
        ]
        similarities = [
            s[1]
            for s in self.vectors_cs.most_similar("queen", min_similarity=.63)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar_analogy(self):
        keys = [
            s[0]
            for s in self.vectors_cs.most_similar(positive=["king", "woman"],
                                                  negative=["man"])
        ]
        similarities = [
            s[1]
            for s in self.vectors_cs.most_similar(positive=["king", "woman"],
                                                  negative=["man"])
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7118192315101624, 0.6189674139022827,
                        0.5902431011199951, 0.549946129322052,
                        0.5377321243286133, 0.5236844420433044,
                        0.5235944986343384, 0.518113374710083,
                        0.5098593831062317, 0.5087411403656006
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queen', u'monarch', u'princess', u'crown_prince', u'prince',
            u'kings', u'Queen_Consort', u'queens', u'sultan', u'monarchy'
        ])

    def test_most_similar_cosmul_analogy(self):
        keys = [
            s[0] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"], negative=["man"])
        ]
        similarities = [
            s[1] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"], negative=["man"])
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.9314123392105103, 0.858533501625061,
                        0.8476565480232239, 0.8150269985198975,
                        0.809981644153595, 0.8089977502822876,
                        0.8027306795120239, 0.801961362361908,
                        0.8009798526763916, 0.7958389520645142
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queen', u'monarch', u'princess', u'Queen_Consort', u'queens',
            u'crown_prince', u'royal_palace', u'monarchy', u'prince',
            u'empress'
        ])

    def test_most_similar_cosmul_min_similarity_analogy(self):
        keys = [
            s[0] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"],
                negative=["man"],
                min_similarity=.81)
        ]
        similarities = [
            s[1] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"],
                negative=["man"],
                min_similarity=.81)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.9314123392105103, 0.858533501625061,
                        0.8476565480232239, 0.8150269985198975
                    ]),
                    atol=.02).all())
        self.assertEqual(keys,
                         [u'queen', u'monarch', u'princess', u'Queen_Consort'])

    def test_closer_than(self):
        self.assertEqual(self.vectors.closer_than("cat", "dog"), ["cats"])

    def test_most_similar_approx(self):
        keys = [
            s[0]
            for s in self.vectors_approx.most_similar_approx("queen", topn=15)
        ]
        similarities = [
            s[1]
            for s in self.vectors_approx.most_similar_approx("queen", topn=15)
        ]
        self.assertEqual(len(keys), 15)
        self.assertTrue(similarities[0] > .7 and similarities[-1] > .5)

    @unittest.expectedFailure
    def test_most_similar_approx_failure(self):
        self.vectors.most_similar_approx("queen", topn=15)

    def test_most_similar_approx_low_effort(self):
        keys = [
            s[0] for s in self.vectors_approx.most_similar_approx(
                "queen", topn=15, effort=.1)
        ]
        self.assertEqual(len(keys), 15)
        self.assertEqual(keys[0], "princess")

    def test_most_similar_analogy_approx(self):
        keys = [
            s[0] for s in self.vectors_approx.most_similar_approx(
                positive=["king", "woman"], negative=["man"], topn=15)
        ]
        self.assertEqual(keys[0], "queen")

    def test_feat_length(self):
        self.vectors_feat_2 = FeaturizerMagnitude(1000, case_insensitive=True)
        self.assertEqual(self.vectors_feat.dim, 4)
        self.assertEqual(self.vectors_feat_2.dim, 5)
        self.vectors_feat_2.close()

    def test_feat_stability(self):
        self.vectors_feat_2 = FeaturizerMagnitude(100, case_insensitive=True)
        self.assertTrue(
            isclose(self.vectors_feat.query("VBG"),
                    self.vectors_feat_2.query("VBG")).all())
        self.assertTrue(
            isclose(self.vectors_feat.query("PRP"),
                    self.vectors_feat_2.query("PRP")).all())
        self.vectors_feat_2.close()

    def test_feat_values(self):
        self.assertTrue(
            isclose(self.vectors_feat.query("VBG")[0], 0.490634876828))
        self.assertTrue(
            isclose(self.vectors_feat.query("PRP")[0], 0.463890807802))
        self.assertTrue(isclose(
            self.vectors_feat.query(5)[0], -0.750681075834))
        self.assertTrue(
            isclose(self.vectors_feat.query(5)[-1], 1.46936807866e-38))
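The tests above rely on fixtures (self.vectors, self.vectors_cs, self.vectors_approx, self.vectors_feat) defined in the suite's setUp; a minimal sketch of how they could be constructed, assuming local .magnitude files (the paths are placeholders) and a file converted with pymagnitude's approximate-similarity index for the *_approx tests:

import unittest

from pymagnitude import FeaturizerMagnitude, Magnitude


class MagnitudeTest(unittest.TestCase):
    def setUp(self):
        # case-insensitive lookups ("Queen" and "queen" hit the same key)
        self.vectors = Magnitude('GoogleNews-vectors-negative300.magnitude',
                                 case_insensitive=True)
        # case-sensitive lookups
        self.vectors_cs = Magnitude('GoogleNews-vectors-negative300.magnitude')
        # converted with an approximate-similarity index, needed for
        # most_similar_approx(); the expectedFailure test above shows that a
        # plain file does not support it
        self.vectors_approx = Magnitude(
            'GoogleNews-vectors-negative300.approx.magnitude',
            case_insensitive=True)
        # deterministic pseudo-random feature vectors for categorical features
        self.vectors_feat = FeaturizerMagnitude(100, case_insensitive=True)

    def tearDown(self):
        for vecs in (self.vectors, self.vectors_cs, self.vectors_approx,
                     self.vectors_feat):
            vecs.close()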
Exemplo n.º 22
0
from pymagnitude import Magnitude
vectors = Magnitude('GoogleNews-vectors-negative300.magnitude')

cat_vector = vectors.query('cat')
print(cat_vector)

print(vectors.similarity("cat", "dog"))
print(vectors.most_similar("cat", topn=100))

def similarity(word1, word2):
    return vectors.similarity(word1, word2)
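A couple more Magnitude calls that complement the snippet above, shown as a hedged sketch against the same local GoogleNews file:

from pymagnitude import Magnitude

vectors = Magnitude('GoogleNews-vectors-negative300.magnitude')

print(len(vectors))   # vocabulary size
print(vectors.dim)    # vector dimensionality (300 for this model)

# querying a list of words returns a (len(words), dim) matrix
matrix = vectors.query(['cat', 'dog', 'television'])
print(matrix.shape)

# out-of-vocabulary keys still return a vector: Magnitude generates a
# deterministic vector for unseen keys instead of raising a KeyError
print('uberwidget' in vectors)          # False
print(vectors.query('uberwidget')[:5])  # still a 300-dimensional vector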
Exemplo n.º 23
0
class SemanticSpace(object):
    def __init__(self, magnitude_path=None):
        """
        :param str magnitude_path: path to a .magnitude embeddings file.
        """

        self.database = magnitude_path
        if self.database is not None:
            self.embeddings = Magnitude(self.database)

    def _embeddings(self, tokens):
        """
        loads a subset of all embeddings into a DataFrame.

        :param set tokens: set of tokens to get embeddings for

        :return: Dataframe containing embeddings
        :rtype: Dataframe
        """

        tokens = list(set(tokens))
        vectors = [self.embeddings.query(token) for token in tokens]
        df = DataFrame(data=vectors, index=tokens)

        return df

    def generate2d(self, tokens, method='umap'):
        """
        creates 2d-coordinates for a list of tokens

        :param list tokens: list of tokens to generate coordinates for
        :param str method: umap / tsne

        :return: pandas.Dataframe with x and y coordinates
        :rtype: pandas.Dataframe
        """

        # load vectors
        embeddings = self._embeddings(tokens)

        # if no vectors are loaded
        if embeddings.empty:
            return DataFrame()

        # just in case
        embeddings = embeddings.dropna()

        # set up transformer
        if method == 'tsne':
            transformer = TSNE(n_components=2,
                               metric='euclidean',
                               perplexity=10.,
                               verbose=0)

        elif method == 'umap':
            transformer = UMAP()

        else:
            raise NotImplementedError('transformation "%s" not supported' %
                                      method)

        # generate 2d coordinates as data frame
        coordinates = DataFrame(data=transformer.fit_transform(embeddings),
                                index=embeddings.index,
                                columns=['x', 'y'])
        coordinates.index.name = 'item'

        # save coordinates
        self.coordinates = coordinates

        return coordinates

    def add(self, item, cutoff=.2):
        """
        Calculate new coordinates for one embedding, based on cosine similarity.

        :param str item: token to add
        :param float cutoff: cut-off value for cosine similarity

        :return: pandas.Series with the new ['x', 'y'] coordinates of the item
        :rtype: pandas.Series
        """

        # get embedding for the new item (the item's row, not column 0)
        item_embedding = self._embeddings([item]).iloc[0]

        # embeddings of the items already on the map, aligned with the row
        # order of self.coordinates (the stored 2d coordinates themselves are
        # not comparable to a word vector)
        base_embeddings = self._embeddings(
            list(self.coordinates.index)).reindex(self.coordinates.index)

        # gather cosine similarities, zeroing out values below the cut-off
        similarities = []
        for base_embedding in base_embeddings.values:
            similarity = 1 - cosine(item_embedding, base_embedding)

            if similarity >= cutoff:
                similarities.append(similarity)
            else:
                similarities.append(0)

        global_similarity_index = sum(similarities)

        if global_similarity_index == 0:
            # put in global center
            new_coordinates = self.coordinates.sum() / len(self.coordinates)
        else:
            # weighted average
            tmp_coordinates = self.coordinates.apply(
                lambda x: x * similarities)
            new_coordinates = tmp_coordinates.sum() / global_similarity_index

        # append new coordinates (DataFrame.append was removed in pandas 2.0;
        # assumes `concat` is imported from pandas alongside DataFrame)
        self.coordinates = concat([
            self.coordinates,
            DataFrame(data={
                'x': new_coordinates['x'],
                'y': new_coordinates['y'],
            },
                      index=[item])
        ])

        return new_coordinates

    def visualize(self, size, title='semantic map', path="/tmp/vis.html"):
        """
        :param Series size: pd.Series containing label sizes
        :param str title: title of the bokeh figure
        :param str path: path of the HTML output file
        """

        output_file(path)
        print(self.coordinates.join(size).columns)
        source = ColumnDataSource(self.coordinates.join(size))
        p = figure(title=title)
        p.scatter(x='x', y='y', size=size.name, source=source)
        p.xaxis[0].axis_label = ''
        p.yaxis[0].axis_label = ''
        # coordinate labels = items
        labels = LabelSet(
            x='x',
            y='y',
            text='item',
            level='glyph',
            x_offset=5,  # text_font_size=size.name,
            y_offset=5,
            source=source,
            render_mode='canvas')
        p.add_layout(labels)
        show(p)
        print(p)
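A brief usage sketch for SemanticSpace; the .magnitude path and the token list are placeholders, and it assumes pandas, scipy, umap-learn and bokeh are installed:

from pandas import Series

space = SemanticSpace('glove.6B.300d.magnitude')

# place a handful of tokens on a 2d map (UMAP by default, 'tsne' also works)
coordinates = space.generate2d(
    ['cat', 'dog', 'kitten', 'laptop', 'television', 'keyboard'],
    method='umap')
print(coordinates)

# project one more token into the existing map via similarity weighting
space.add('puppy', cutoff=.2)

# constant label sizes for the plot, indexed like the coordinates
sizes = Series(10, index=space.coordinates.index, name='size')
space.visualize(sizes, title='demo map', path='/tmp/semantic_map.html')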
Exemplo n.º 24
0
class EmbeddingHolder:
    """
    A utility class to load a pipeline and cache it in memory
    """

    PAD = "<pad>"
    SUPPORTED_EMBEDDINGS = {
        "glove6b": "glove/light/glove.6B.300d",
        "glove6b.50d": "glove/light/glove.6B.50d",
        "w2vnews": "word2vec/light/GoogleNews-vectors-negative300",
        "fasttext": "fasttext/light/wiki-news-300d-1M-subword",
    }
    instances = {}

    def __init__(self, embedding_name):
        """
            Download and load the requested Magnitude embedding (expensive).
            Use get_instance() to reuse a cached instance for each embedding name.
        """
        self.embedding_name = embedding_name
        self.embedding = Magnitude(
            MagnitudeUtils.download_model(
                self.SUPPORTED_EMBEDDINGS[embedding_name], download_dir=os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir())
            ),
            lazy_loading=-1,
            blocking=True,
        )
        self.stoi = {self.PAD: 0}  # string to integer. Associates an integer value with every token
        self.itos = {0: self.PAD}

    @classmethod
    def get_instance(cls, embedding_name):
        if not cls.instances.get(embedding_name):
            logger.debug("Caching embedding")
            cls.instances[embedding_name] = EmbeddingHolder(embedding_name)

        return cls.instances[embedding_name]

    def get_stoi(self):
        return self.stoi

    def get_itos(self):
        return self.itos

    def get_nvocab(self):
        # I have no idea what nvocab is. TODO: Figure this out - DSSM needs this
        return None

    def create_indexed_embedding_layer_from_tokens(self, tokens):
        """
        For each token in the list of tokens
        1. index = Converts the token into an integer
        2. embedding_for_token = Gets the embedding for the token from self.embedding
        3. creates a tensor where tensor[index] = embedding_for_token

        Why do we need to do this?
        We cannot use the downloaded magnitude embedding directly in a pytorch network. We need to convert into an
        indexed tensor and that is what we're doing here.
        :param tokens: A list of tokens
        :return: A tensor of dimension (len(tokens), self.embedding.dim)
        """
        tokens_minus_padding = [token for token in tokens if token != self.PAD]
        # Removing duplicates while preserving order (dict order is guaranteed from Python 3.7+). See https://stackoverflow.com/a/7961390/1841522
        tokens_minus_padding = list(dict.fromkeys(tokens_minus_padding))
        vectors = self.embedding.query(tokens_minus_padding)
        indexed_embedding = np.zeros((len(vectors) + 1, self.embedding.dim), dtype=np.float32)
        indexed_embedding[self.stoi[self.PAD]] = np.zeros(self.embedding.dim)

        for i in range(0, len(vectors)):
            self.stoi[tokens_minus_padding[i]] = i + 1  # i + 1 because i starts from 0, and 0 is reserved for PAD
            self.itos[i + 1] = tokens_minus_padding[i]
            indexed_embedding[i + 1] = vectors[i]

        return indexed_embedding

    def get_index_array_from_tokens(self, tokens, maxlen):
        indices = [self.stoi.get(token, 0) for token in tokens]
        return np.array(padlist(indices, maxlen))
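A small sketch of how EmbeddingHolder could feed a PyTorch embedding layer; the token lists are made up, and `padlist` (used by get_index_array_from_tokens) is assumed to pad the index list to maxlen with zeros:

import torch
import torch.nn as nn

holder = EmbeddingHolder.get_instance('w2vnews')

tokens = ['cat', 'dog', 'television']
weights = holder.create_indexed_embedding_layer_from_tokens(tokens)

# frozen nn.Embedding where row i holds the vector of holder.get_itos()[i]
embedding_layer = nn.Embedding.from_pretrained(
    torch.from_numpy(weights), freeze=True,
    padding_idx=holder.get_stoi()[holder.PAD])

# map a token sequence to indices (padded to maxlen) and look the vectors up
indices = holder.get_index_array_from_tokens(['cat', 'dog'], maxlen=4)
looked_up = embedding_layer(torch.from_numpy(indices).long())
print(looked_up.shape)  # (4, embedding dim)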
Exemplo n.º 25
0
import argparse

import torch
from pymagnitude import Magnitude
from tqdm import tqdm

from util import load_pickle, save_pickle

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--vocab_path', type=str, required=True)
    parser.add_argument('--embedding_path', type=str, required=True)
    parser.add_argument('--output_path', type=str, required=True)
    args = parser.parse_args()

    vocab_path = args.vocab_path
    embedding_path = args.embedding_path
    output_path = args.output_path

    print("Loading vocab...")
    vocab = torch.load(vocab_path)
    word2id = vocab['src'].base_field.vocab.stoi
    id2word = vocab['src'].base_field.vocab.itos
    print("vocab size: {0}".format(len(word2id)))

    print("Loading magnitude...")
    word_vectors = Magnitude(embedding_path)
    dim = len(word_vectors.query(id2word[0]))

    print("Building vocab embedding...")
    vocab_embedding = torch.zeros((len(word2id), dim))
    for w, _id in tqdm(word2id.items()):
        vocab_embedding[_id] = torch.from_numpy(word_vectors.query(w))

    # save vocab embedding
    print("Saving vocab embedding...")
    torch.save(vocab_embedding, output_path)
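Downstream, the saved tensor can be loaded back into an embedding layer; a minimal sketch, assuming the file written to --output_path above and that row i corresponds to vocabulary id i:

import torch
import torch.nn as nn

vocab_embedding = torch.load('vocab_embedding.pt')  # shape: (vocab_size, dim)

# initialize a trainable embedding layer with the pre-computed vectors
embedding_layer = nn.Embedding.from_pretrained(vocab_embedding.float(),
                                               freeze=False)

token_ids = torch.tensor([[3, 17, 42]])  # placeholder id sequence
print(embedding_layer(token_ids).shape)  # (1, 3, dim)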
Exemplo n.º 26
0
class EmbeddingEngine:
    """
    An interface to query pre-trained word vectors.
    """

    ABBR_LIST = [
        "C41H11O11", "PV", "OPV", "PV12", "CsOS", "CsKPSV", "CsPS", "CsHIOS",
        "OPV", "CsPSV", "CsOPV", "CsIOS", "BCsIS", "CsPrS", "CEsH", "KP307",
        "AsOV", "CEsS", "COsV", "CNoO", "BEsF", "I2P3", "KP115", "BCsIS",
        "C9705IS", "ISC0501", "B349S", "CISe", "CISSe", "CsIPS", "CEsP",
        "BCsF", "CsFOS", "BCY10", "C12P", "EsHP", "CsHP", "C2K8", "CsOP",
        "EsHS", "CsHS", "C3P", "C50I", "CEs", "CSm", "BF", "EsN", "BN50S",
        "AsCP", "CPo", "LiPb17", "CsS", "EsIS", "AsCU", "CCsHS", "CsHPU",
        "AsOS", "AsCI", "EsF", "FV448", "CNS", "CP5", "AsFP", "EsOP", "NS",
        "NS2", "EsI", "BH", "PPmV", "PSe", "AsN", "OPV5", "NSiW", "CsHHS"
    ]

    def __init__(self,
                 embeddings_source=EMBEDDINGS,
                 out_embeddings_source=OUT_EMBEDDINGS,
                 formulas_source=FORMULAS,
                 phraser_source=PHRASER):
        """

        :param embeddings_source: an instance of a Magnitude object,
        or a URL or path to a serialized Magnitude file
        :param out_embeddings_source: an instance of a Magnitude object,
        or a URL or path to a serialized Magnitude file
        :param formulas_source: a URL or path to a JSON-serialized dict
        of formulae; if not supplied, a default file is loaded
        :param phraser_source: path to a serialized gensim Phraser used to
        merge multi-word phrases
        """

        # hidden layer embeddings (W)
        self.embeddings = Magnitude(embeddings_source, eager=False)

        # output layer embeddings (O)
        self.out_embeddings = Magnitude(out_embeddings_source)

        # load pre-trained formulas from embeddings
        with open(formulas_source, 'r') as f:
            self.formulas_with_abbreviations = load(f)

        self.dp = DataPreparation(local=False)

        self.es = ElasticConnection()

        self.formulas = {
            k: v
            for k, v in self.formulas_with_abbreviations.items()
            if k not in self.ABBR_LIST
        }

        self.formula_counts = {
            root_formula: sum(formulas.values())
            for root_formula, formulas in self.formulas.items()
        }

        self.most_common_forms = {
            formula_group_name:
            (formula_group_name if formula_group_name in self.dp.ELEMENTS else
             max(formulae.items(), key=operator.itemgetter(1))[0])
            for formula_group_name, formulae in
            self.formulas_with_abbreviations.items()
        }

        self.phraser = Phraser.load(phraser_source)

    def make_phrases(self, sentence, reps=2):
        """
        generates phrases from a sentence of words
        :param sentence: a list of tokens
        :param reps: how many times to combine the words
        :return: the list of tokens with detected multi-word phrases merged
        """
        while reps > 0:
            sentence = self.phraser[sentence]
            reps -= 1
        return sentence

    def prepare_wordphrase(self, wp, im=False):
        """
        Process a string into words and phrases according to existing embeddings
        :param wp: the string to process
        :param im: if True, will ignore missing words, otherwise will generate random vectors
        :return: a list of processed words and phrases
        """
        processed_wp = self.make_phrases(
            self.dp.process_sentence(self.dp.text2sent(wp))[0])
        if im:
            processed_wp = [
                pwp for pwp in processed_wp if pwp in self.embeddings
            ]
        return processed_wp

    def get_embedding(self,
                      wordphrases,
                      ignore_missing=False,
                      normalized=True):
        """
        Gets the embeddings for the given words/phrases
        :param wordphrases: a string or a list of strings to request embeddings for
        :param ignore_missing: if True, will ignore missing words, otherwise will query them
        using pymagnitude's default out-of-vocabulary handling
        :param normalized: if False, returns non-normalized embeddings (True by default)
        :return: an embedding matrix with each row corresponding to a single processed word or phrase
        taken from wordphrases, as well as the lists of processed wordphrases
        """
        def get_single_embedding(wp, im=ignore_missing, norm=normalized):
            """
            Returns a single embedding vector for the given string
            :param wp: a string to get a single embedding for
            :param im: boolean; if True, ignore missing words, otherwise return pseudo-random vectors for them
            :param norm: if False, returns the non-normalized embedding (True by default)
            :return: a single embedding vector for the string (could be a composite embedding)
            """
            processed_wordphrase = self.prepare_wordphrase(wp, im)

            if len(processed_wordphrase) > 0:
                emb = np.mean(self.embeddings.query(processed_wordphrase,
                                                    normalized=norm),
                              axis=0)
                if norm:
                    emb = emb / np.linalg.norm(emb)
                emb = emb.tolist()
            else:
                emb = [0] * self.embeddings.dim
            return emb, processed_wordphrase

        if not isinstance(wordphrases, list):
            wordphrases = [wordphrases]

        processed_wps = []
        embeddings = []

        try:
            for wordphrase in wordphrases:
                embedding, processed_wp = get_single_embedding(
                    wordphrase, im=ignore_missing)
                processed_wps.append(processed_wp)
                embeddings.append(embedding)
        except Exception as ex:
            warnings.warn(ex)

        return embeddings, processed_wps

    def close_words(self,
                    positive,
                    negative=None,
                    top_k=8,
                    exclude_self=True,
                    ignore_missing=True):
        """
        Returns a list of close words
        :param positive: can be either a string or a list of strings
        :param negative: same as word, but will be treated with a minus sign
        :param top_k: number of close words to return
        :param exclude_self: boolean, if the supplied word should be excluded or not
        :param ignore_missing: ignore words that are missing from the vocabulary
        :return: (words, scores, processed_positive, processed_negative)
        """

        if negative is None:
            negative = []
        else:
            if not isinstance(negative, list):
                negative = [negative]
        processed_negative = []
        for n in negative:
            processed_negative += self.prepare_wordphrase(n, im=ignore_missing)

        if not isinstance(positive, list):
            positive = [positive]
        processed_positive = []
        for p in positive:
            processed_positive += self.prepare_wordphrase(p, im=ignore_missing)

        most_similar = self.embeddings.most_similar(
            processed_positive, negative=processed_negative, topn=top_k)

        if not exclude_self:
            most_similar = [(processed_positive, 1.0)
                            ] + most_similar[:top_k - 1]
        words, scores = map(list, zip(*most_similar))
        return words, [float(s)
                       for s in scores], processed_positive, processed_negative

    def find_similar_materials(self,
                               sentence,
                               n_sentence=None,
                               min_count=3,
                               use_output_emb=True,
                               ignore_missing=True):
        """
        Finds materials that match the best with the context of the sentence
        :param sentence: a list of words
        :param n_sentence: a list of words for a negative context
        :param min_count: the minimum number of occurrences for the formula
        to be included
        :param use_output_emb: if True, use output layer embedding (O) instead of
        inner layer embedding (W)
        :param ignore_missing: if True, ignore words missing from the vocabulary
        :return: a list of (formula, similarity) tuples sorted by decreasing
        similarity, plus the processed positive and negative sentences
        """
        positive_embeddings, processed_sentence = \
            self.get_embedding(sentence, ignore_missing=ignore_missing)

        n_sentence = n_sentence or []
        negative_embeddings, processed_n_sentence = \
            self.get_embedding(n_sentence, ignore_missing=ignore_missing)

        emb = self.out_embeddings if use_output_emb else self.embeddings

        sum_embedding = np.sum(np.asarray(positive_embeddings), axis=0) - \
                        np.sum(np.asarray(negative_embeddings), axis=0)
        sum_embedding = sum_embedding / np.linalg.norm(sum_embedding)

        # formulas common enough to be above cut-off and that exist in embedding
        formulas = [
            f for f, count in self.formula_counts.items()
            if (count > min_count) and (f in self.embeddings)
        ]

        similarity_scores = np.dot(emb.query(formulas, normalized=True),
                                   sum_embedding)
        similarities = {
            f: float(similarity_scores[i])
            for i, f in enumerate(formulas)
        }

        return sorted(similarities.items(), key=lambda x: x[1],
                      reverse=True), processed_sentence, processed_n_sentence

    def most_common_form(self, formulas):
        """
        Return the most common written form of each formula, given a list of
        [("normalized formula", score), ...] tuples
        :param formulas: a list of (normalized formula, score) tuples
        :return: a list of common forms with counts, [("common form", score, counts in text), ...]
        """
        common_form_score_count = []
        for formula in formulas:
            if formula[0] in self.dp.ELEMENTS:
                most_common_form = formula[0]
            else:
                most_common_form = max(self.formulas[formula[0]].items(),
                                       key=operator.itemgetter(1))[0]
            common_form_score_count.append(
                (most_common_form, formula[1],
                 sum(self.formulas[formula[0]].values())))
        return common_form_score_count

    def filter_by_elements(self,
                           formulas,
                           plus_elems=None,
                           minus_elems=None,
                           max=50):
        """
        Filter formulas according to the following rule: a formula has to contain at least one
        of the plus_elems (if None, all formulas qualify), but it cannot contain any of the
        minus_elems. Elements that appear in both lists are ignored.
        :param formulas: a list of (formula, score) tuples
        :param plus_elems: the formula has to have at least one of these
        :param minus_elems: but cannot have any of these
        :param max: maximum number of formulas to return
        :return: a list of (formula, score) tuples that pass the filter (at most `max` entries)
        """
        plus_elems = plus_elems or []
        minus_elems = minus_elems or []
        plus_elems, minus_elems = set(plus_elems) - set(minus_elems), set(
            minus_elems) - set(plus_elems)

        def has_plus(comp, pe):
            if pe is None or len(pe) == 0:
                return True
            for elem in comp:
                if elem in pe:
                    return True
            return False

        def has_minus(comp, me):
            if me is None or len(me) == 0:
                return False
            for elem in comp:
                if elem in me:
                    return True
            return False

        matched = 0
        matched_formula = []
        for form in formulas:
            composition = self.dp.parser.parse_formula(form[0])
            if has_plus(composition, plus_elems) and not has_minus(
                    composition, minus_elems):
                matched_formula.append(form)
                matched += 1
            if matched >= max:
                return matched_formula
        return matched_formula

    def mentioned_with(self, material, words):
        """
        Returns True if the supplied material was mentioned with any of the words in any of the abstracts. This is a
        very strict text search and is aimed at high recall. This method is used for discovery so having higher recall
        might hinder some discoveries but will avoid too many false positives. E.g. for material=CuTe and
        words=["thermoelectric"], "CuTe2 is thermoelectric" will return True since "CuTe" will be matched with "CuTe2"
        in text search. The word search is exact, so if the keyword was "thermo" it would not match "thermoelectric".
        :param material: A material formula (does not have to be normalized)
        :param words: List of processed words and phrases (words separated by "_") to search the text for co-occurrences
        :return: True if the material is mentioned with any of the words, False otherwise
        """
        norm_material = self.dp.get_norm_formula(
            material) if self.dp.is_simple_formula(material) else material

        # different ways the material is written
        variations = self.formulas[
            norm_material] if norm_material in self.formulas else [
                norm_material
            ]
        variations = "(" + " OR ".join(variations) + ")"
        targets = "(" + " OR ".join(words) + ")"
        query = "{} AND {}".format(targets, variations)
        if self.es.count_matches(query) > 0:
            return True
        else:
            return False
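To round off the class, a hedged usage sketch of its main entry points; it relies on the module-level EMBEDDINGS/OUT_EMBEDDINGS/FORMULAS/PHRASER defaults and a reachable Elasticsearch instance, and the queries are placeholders:

engine = EmbeddingEngine()

# composite embedding for a phrase (one row per input string)
embeddings, processed = engine.get_embedding('thermoelectric material',
                                             ignore_missing=True)
print(len(embeddings[0]))  # embedding dimensionality

# nearest neighbours in the embedding space
words, scores, pos, neg = engine.close_words('thermoelectric', top_k=5)
print(list(zip(words, scores)))

# materials whose output-layer embeddings best match a context sentence
ranked, sent, n_sent = engine.find_similar_materials(
    ['thermoelectric', 'power_factor'], min_count=3)
print(engine.most_common_form(ranked[:5]))

# strict co-occurrence check against the abstract index
print(engine.mentioned_with('CuTe', ['thermoelectric']))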