Example No. 1
    def test_oov_subword_values(self):
        self.vectors_oov_1 = Magnitude(
            MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
            case_insensitive=True,
            ngram_oov=False,
            eager=False)
        self.vectors_oov_2 = Magnitude(
            MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
            case_insensitive=True,
            ngram_oov=False,
            eager=False)

        self.assertTrue(isclose(self.vectors_oov_1.query("discriminatoryy")[0],
                                -0.059116619334669426))
        self.assertTrue(isclose(self.vectors_oov_1.query("*<")[0],
                                -0.0759614511397))
        self.assertTrue(isclose(self.vectors_oov_1.query("*<<")[0],
                                0.00742723997271))
        self.assertTrue(isclose(self.vectors_oov_1.query("uberx")[0],
                                0.0952671681336))
        self.assertTrue(isclose(self.vectors_oov_1.query("misssipi")[0],
                                0.0577835297955))
        self.assertTrue(isclose(self.vectors_oov_2.query("discriminatoryy")[0],
                                -0.059116619334669426))
        self.assertTrue(isclose(self.vectors_oov_2.query("*<")[0],
                                -0.0759614511397))
        self.assertTrue(isclose(self.vectors_oov_2.query("*<<")[0],
                                0.00742723997271))
        self.assertTrue(isclose(self.vectors_oov_2.query("uberx")[0],
                                0.0952671681336))
        self.assertTrue(isclose(self.vectors_oov_2.query("misssipi")[0],
                                0.0577835297955))

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()
Example No. 2
def extract_wordvec_generalization(word, path_to_word_vectors,
                                   neighbor_number):
    ''' Extracts the nearest neighbor from vector space '''
    vectors = Magnitude(path_to_word_vectors)
    generalized_attribute = vectors.most_similar(
        word, topn=neighbor_number)[neighbor_number - 1][0]
    return generalized_attribute
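
A minimal usage sketch for the helper above; the vector file name and query word below are placeholders, not taken from the original source:

# most_similar() returns (word, similarity) pairs sorted by similarity, so
# indexing [neighbor_number - 1][0] picks the word of the n-th nearest neighbor.
generalized = extract_wordvec_generalization(
    "doctor", "glove.6B.100d.magnitude", neighbor_number=3)
print(generalized)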
Example No. 3
def glove_via_magnitude(topn=500,
                        min_similarity=None,
                        filename='glove.6B.100d.magnitude',
                        lang='en_US'):

    from pymagnitude import Magnitude

    v = Magnitude(os.path.join(TOPDIR, filename))
    training_set = list()
    units = set()
    for unit_list in classifier.ambiguous_units():
        for unit in unit_list[1]:
            units.add(unit)
    for unit in units:
        print('Processing {}...'.format(unit.name))

        name = unit.name
        # seed the surface forms with the unit's canonical name
        surfaces = {unit.name}
        if isinstance(unit, classes.Unit):
            surfaces.update(unit.surfaces)
            surfaces.update(unit.symbols)
        for surface in surfaces:
            neighbours = v.most_similar(
                v.query(surface), topn=topn, min_similarity=min_similarity)
            training_set.append({
                'unit':
                name,
                'text':
                ' '.join(neighbour[0] for neighbour in neighbours)
            })
    print('Done')

    with language.topdir(lang).joinpath('train/similars.json').open(
            'w', encoding='utf-8') as file:
        json.dump(training_set, file, sort_keys=True, indent=4)
Example No. 4
def create_vocab_tensors(input_vocab_index):
    """Creates a matrix of the glove embeddings for terms contained in the model for improve runtime
        Also used in ESIM"""
    print('Creating vocabulary tensors...')
    # Define GloVe model from Magnitude package
    model = Magnitude(config.glove_magnitude_path)

    np.random.seed(config.SEED)
    # Randomly initialize matrix
    vocab_tensors = np.random.normal(
        0, 1, (input_vocab_index.n_words, model.dim)).astype('float32')

    vocab_words = list(input_vocab_index.word2index.keys())
    unk_words = []

    # Get vector for each word in vocabulary if in model
    for idx, word in enumerate(vocab_words):
        if word in model:
            vocab_tensors[idx] = model.query(word)
        else:
            unk_words.append(word)

    special_tokens = ['SOS', 'EOS', 'UNK']

    # Override the rows for the special tokens with small uniform-random vectors
    vocab_tensors[:len(special_tokens), :] = np.random.uniform(
        -0.1, 0.1, (len(special_tokens), model.dim)).astype('float32')

    print('Tensor vocabulary complete.')
    print('    Total vocabulary size {}, {} UNK words ({:.2}%)'.format(
        len(vocab_words), len(unk_words),
        (len(unk_words) / len(vocab_words)) * 100))
    return torch.tensor(vocab_tensors, dtype=torch.float64), unk_words
Example No. 5
    def test_lang_none_oov_stem(self):
        self.vectors_l = Magnitude(MagnitudeTest.MAGNITUDE_PATH, language=None)
        self.assertEqual(self.vectors_l._oov_stem('rejumping'), 'rejumping')
        self.assertEqual(
            self.vectors_l._oov_stem('reuberificationing'),
            'reuberificationing')
        self.vectors_l.close()
Example No. 6
    def test_ngram_oov_subword_stability(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       eager=False)

        for i in range(5):
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<"),
                        self.vectors_oov_2.query("*<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<"),
                        self.vectors_oov_2.query("*<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<"),
                        self.vectors_oov_2.query("*<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<"),
                        self.vectors_oov_2.query("*<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<"),
                        self.vectors_oov_2.query("*<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<<")).all())

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()
Example No. 7
def predict(chain, embedding=False, interpolation=False):
    if embedding or interpolation:
        vectors = Magnitude('GoogleNews-vectors-negative300.magnitude')

    scores = dict()
    for verb in verbs:
        score = 0
        for event in chain:
            if embedding: score += vectors.similarity(event[0], verb)
            elif interpolation:
                score += (ALPHA * vectors.similarity(event[0], verb) +
                          (1 - ALPHA) * pmi(event, (verb, None, None)))
            else:
                score += pmi(event, (verb, None, None))
        scores[verb] = score

    cleaned_scores = dict()
    chain_verbs = set()
    for event in chain:
        # collect the verb (first element of each event tuple) so verbs already
        # in the chain are excluded from the ranking below
        chain_verbs.add(event[0])

    for candidate in scores:
        if candidate not in chain_verbs:
            cleaned_scores[candidate] = scores[candidate]

    ranked_scores = sorted(list(cleaned_scores.items()),
                           key=lambda x: x[1],
                           reverse=True)
    return ranked_scores
Example No. 8
def get_nearest_words():
    """
    provides words closely related to the keywords

    Parameters:
      keywords -- an array of words closely related to the concept
    Returns:
      closest_words -- these are displayed on the right panel of the concept screen
    Testing:
      http://localhost:3001/api/get_nearest_words?keywords=lunch,slice,pie,pasta
  """
    keywords = request.args.get('keywords', '')

    from pymagnitude import Magnitude
    #vectors = Magnitude('http://magnitude.plasticity.ai/word2vec/heavy/GoogleNews-vectors-negative300.magnitude', stream=True) # full url for streaming from 10GB model
    #vectors = Magnitude('http://magnitude.plasticity.ai/glove/light/glove.6B.50d.magnitude', stream=True)
    vectors = Magnitude('./pretrained_features/glove.6B.50d.magnitude')

    # there is likely overlap if the concepts words are closely related
    closest_words = set()
    for k in keywords.split(','):
        results = vectors.most_similar(k, topn=10)  # Most similar by key
        #vectors.most_similar(vectors.query(k), topn = 100) # Most similar by vector
        for r in results:
            # just add the word, not the word's probability
            closest_words.add(r[0])
    closest_words = closest_words - set(list(keywords.split(',')))
    return json.dumps(list(closest_words))
Example No. 9
def get_word_vector(word):
    global model
    if model is None:
        # import fasttext
        # if os.environ.get('LANGUAGE', 'en').lower() == 'en':
        #     print('Loading English word vectors')
        #     model = fasttext.load_model('data/cc.en.300.bin')
        # else:
        #     print('Loading Vietnamese word vectors')
        #     model = fasttext.load_model('data/cc.vi.300.bin')

        # return model.get_word_vector(word.replace(' ', '_'))

        from pymagnitude import Magnitude

        if os.environ.get('LANGUAGE', 'en').lower() == 'en':
            print('Loading English word vectors')
            model = Magnitude('data/cc.en.300.magnitude',
                              language='en',
                              lazy_loading=20000)
        else:
            print('Loading Vietnamese word vectors')
            model = Magnitude('data/cc.vi.300.magnitude',
                              language='vi',
                              lazy_loading=20000)

        print('Loading completed')

    return model.query(word)
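
A short sketch of how the lazy-loading helper above behaves, assuming the .magnitude files referenced in the function exist locally:

# The first call loads the Magnitude model selected by the LANGUAGE environment
# variable; later calls reuse the cached global `model`.
vec = get_word_vector('hello')
print(vec.shape)                 # (300,) for the cc.*.300 vectors
oov = get_word_vector('xyzzyq')  # OOV tokens still receive a generated vector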
Example No. 10
    def test_list(self):
        self.vectors_list = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                      case_insensitive=True,
                                      use_numpy=False,
                                      eager=False)
        self.assertTrue(isinstance(self.vectors_list.query("cat"), list))
        self.vectors_list.close()
Example No. 11
    def __init__(self, magnitude_path=None):
        """
        :param str magnitude_path: Path to a .pymagnitude embeddings file.
        """

        self.database = magnitude_path
        if self.database is not None:
            self.embeddings = Magnitude(self.database)
Example No. 12
    def test_oov_subword_dim_placeholders(self):
        self.vectors_placeholders = Magnitude(
            MagnitudeTest.MAGNITUDE_SUBWORD_PATH, placeholders=5,
            case_insensitive=True, eager=False)
        self.assertEqual(self.vectors_placeholders.query("*<<<<").shape,
                         self.vectors_placeholders.query("cat").shape)
        self.assertTrue(isclose(self.vectors.query("*<<<<")[0],
                                self.vectors_placeholders.query("*<<<<")[0]))
        self.vectors_placeholders.close()
Example No. 13
def get_simlex_and_metrics():
    simlex_data = load_simlex_data('../data/MSimLex999_Polish.txt')
    euklidean_metric = EuclideanMetric(
        Magnitude(
            '../data/nkjp+wiki-lemmas-restricted-300-skipg-ns.magnitude'))
    cosine_metric = CosineMetric(
        Magnitude(
            '../data/nkjp+wiki-lemmas-restricted-300-skipg-ns.magnitude'))
    return simlex_data, euklidean_metric, cosine_metric
Example No. 14
    def test_placeholders(self):
        self.vectors_placeholders = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                              case_insensitive=True,
                                              placeholders=5,
                                              eager=False)
        self.assertEqual(self.vectors_placeholders.query("cat").shape, (305, ))
        self.assertEqual(
            self.vectors_placeholders.query("cat")[0],
            self.vectors.query("cat")[0])
        self.vectors_placeholders.close()
Example No. 15
def put_embeddings(self, rSubmission: RedditSubmission):
    vecs = Magnitude('word2vec/light/GoogleNews-vectors-negative300')

    rSubmission.post_title_embedding = np.mean(vecs.query(
        rSubmission.post_title.split()),
                                               axis=0)

    if len(rSubmission.post_text) > 0:
        rSubmission.post_text_embedding = np.mean(vecs.query(
            rSubmission.post_text.split()),
                                                  axis=0)

    return rSubmission
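
The averaging pattern used above, isolated as a hedged sketch; the model path is a placeholder for any local .magnitude file:

from pymagnitude import Magnitude
import numpy as np

vecs = Magnitude('GoogleNews-vectors-negative300.magnitude')  # placeholder path
tokens = "free pizza in the office".split()
token_matrix = vecs.query(tokens)                # shape: (len(tokens), dim)
sentence_vector = np.mean(token_matrix, axis=0)  # shape: (dim,)
vecs.close()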
Example No. 16
def test_embedtext_creation():
    extractor_cfg = {
        "_name": "embedtext",
        "index": "anserini",
        "tokenizer": "anserini",
        "embeddings": "glove6b",
        "zerounk": True,
        "calcidf": True,
        "maxqlen": MAXQLEN,
        "maxdoclen": MAXDOCLEN,
    }
    extractor = EmbedText(extractor_cfg)

    benchmark = DummyBenchmark({"_fold": "s1", "rundocsonly": False})
    collection = DummyCollection({"_name": "dummy"})

    index_cfg = {"_name": "anserini", "indexstops": False, "stemmer": "porter"}
    index = AnseriniIndex(index_cfg)
    index.modules["collection"] = collection

    tok_cfg = {"_name": "anserini", "keepstops": True, "stemmer": "none"}
    tokenizer = AnseriniTokenizer(tok_cfg)

    extractor.modules["index"] = index
    extractor.modules["tokenizer"] = tokenizer

    qids = list(benchmark.qrels.keys())  # ["301"]
    qid = qids[0]
    docids = list(benchmark.qrels[qid].keys())

    extractor.create(qids, docids, benchmark.topics[benchmark.query_type])

    expected_vocabs = [
        "lessdummy", "dummy", "doc", "hello", "greetings", "world", "from",
        "outer", "space", "<pad>"
    ]
    expected_stoi = {s: i for i, s in enumerate(expected_vocabs)}

    assert set(extractor.stoi.keys()) == set(expected_stoi.keys())

    emb_path = "glove/light/glove.6B.300d"
    fullemb = Magnitude(MagnitudeUtils.download_model(emb_path))
    assert extractor.embeddings.shape == (len(expected_vocabs), fullemb.dim)

    for i in range(extractor.embeddings.shape[0]):
        if i == extractor.pad:
            assert extractor.embeddings[i].sum() < 1e-5
            continue
        s = extractor.itos[i]
        assert (extractor.embeddings[i] - fullemb.query(s)).sum() < 1e-5
    return extractor
Example No. 17
    def test_list_multiple(self):
        self.vectors_list = Magnitude(
            MagnitudeTest.MAGNITUDE_PATH,
            case_insensitive=True,
            use_numpy=False,
            eager=False)
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        self.assertTrue(isinstance(self.vectors_list.query(q[0]), list))
        self.assertTrue(isclose(self.vectors.query(q[0]),
                                asarray(self.vectors_list.query(q[0]))).all())
        self.assertTrue(isinstance(self.vectors_list.query(q), list))
        self.assertTrue(isclose(self.vectors.query(q),
                                asarray(self.vectors_list.query(q))).all())
        self.vectors_list.close()
Example No. 18
    def __init__(self,
                 embeddings_source=EMBEDDINGS,
                 out_embeddings_source=OUT_EMBEDDINGS,
                 formulas_source=FORMULAS,
                 phraser_source=PHRASER):
        """

        :param embeddings_source: can be instance of a Magnitude object
        or url or path to a serialized Magnitude object
        :param out_embeddings_source: can be instance of a Magnitude object
        or url or path to a serialized Magnitude object
        :param formulas_source: can be url or path to a JSON-serialized dict
        of formulae, if not supplied a default file is loaded
        """

        # hidden layer embeddings (W)
        self.embeddings = Magnitude(embeddings_source, eager=False)

        # output layer embeddings (O)
        self.out_embeddings = Magnitude(out_embeddings_source)

        # load pre-trained formulas from embeddings
        with open(formulas_source, 'r') as f:
            self.formulas_with_abbreviations = load(f)

        self.dp = DataPreparation(local=False)

        self.es = ElasticConnection()

        self.formulas = {
            k: v
            for k, v in self.formulas_with_abbreviations.items()
            if k not in self.ABBR_LIST
        }

        self.formula_counts = {
            root_formula: sum(formulas.values())
            for root_formula, formulas in self.formulas.items()
        }

        self.most_common_forms = {
            formula_group_name:
            (formula_group_name if formula_group_name in self.dp.ELEMENTS else
             max(formulae.items(), key=operator.itemgetter(1))[0])
            for formula_group_name, formulae in
            self.formulas_with_abbreviations.items()
        }

        self.phraser = Phraser.load(phraser_source)
Example No. 19
def read_magnitude_vectors(magnitude_filepath,
                           vocab_filepath,
                           vocab_size,
                           dim,
                           special_tokens=[UNK]):
    """Read word vectors from *.magnitude

    Args:
        magnitude_filepath (str): magnitude file path
        vocab_filepath (str): vocabulary file path
        vocab_size (int): Maximum vocab size (including special tokens)
        dim (int): Dimension of the word vectors to load
        special_tokens (list[str])

    Return:
        words (list[str]): list of length vocab_size
        embeddings (np.array): (vocab_size, dim)
    """
    logging.info('Loading word vectors from %s', magnitude_filepath)
    words = [x for x in special_tokens]
    word_set = set()
    with open(vocab_filepath, 'r', encoding='utf8') as fin:
        for line in fin:
            word = line.strip().split("\t")[0]
            if word in word_set:
                logging.warning(
                    "token must be unique. non-unique token='{}'".format(word))
            elif len(word) > 0:
                word_set.add(word)
                words.append(word)
                if len(words) == vocab_size:
                    break
    magnitude = Magnitude(magnitude_filepath,
                          case_insensitive=True,
                          normalized=True)
    vectors = magnitude.query(words[len(special_tokens):])
    # special vectors for UNK
    special_vectors = np.random.normal(size=(len(special_tokens), dim))
    special_vectors /= np.linalg.norm(special_vectors,
                                      ord=2,
                                      axis=1,
                                      keepdims=True)
    # Concatenate
    vectors = np.vstack([special_vectors, vectors]).astype('float32')
    assert vectors.shape[0] == len(words)
    assert vectors.shape[1] == dim
    logging.info('Loaded %d word vectors; shape = %s', len(words),
                 str(vectors.shape))
    return words, vectors
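
A hedged example call for the loader above; the file names are placeholders and the vocabulary file is assumed to hold one token per line (first tab-separated column):

# words[0] is the UNK special token, followed by up to vocab_size - 1 vocabulary
# entries; embeddings[i] holds the vector for words[i].
words, embeddings = read_magnitude_vectors('glove.6B.100d.magnitude',
                                            'vocab.tsv',
                                            vocab_size=50000,
                                            dim=100)
assert embeddings.shape == (len(words), 100)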
Example No. 20
    def _build_matrix(self, tokenizer):
        vector = Magnitude('vectors/glove.6B.50d.magnitude')
        GLOVE_VECTOR_DIMENSION = 50
        MAX_NUM_WORDS = 300
        word_index = tokenizer.word_index
        num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
        embedding_matrix = np.zeros((num_words, GLOVE_VECTOR_DIMENSION))
        for word, i in tqdm(word_index.items()):
            if i > MAX_NUM_WORDS:
                continue
            embedding_vector = vector.query(word)
            if embedding_vector is not None:
                # Magnitude generates a vector even for out-of-vocabulary words,
                # so this check is effectively always true and no row stays all-zero.
                embedding_matrix[i] = embedding_vector
        return embedding_matrix
Example No. 21
    def __init__(self, embedding_name):
        """
        If the _is_initialized class property is not set, build the benchmark and model (expensive).
        Else, do nothing.
        """
        self.embedding_name = embedding_name
        self.embedding = Magnitude(
            MagnitudeUtils.download_model(
                self.SUPPORTED_EMBEDDINGS[embedding_name], download_dir=os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir())
            ),
            lazy_loading=-1,
            blocking=True,
        )
        self.stoi = {self.PAD: 0}  # string to integer. Associates an integer value with every token
        self.itos = {0: self.PAD}
Example No. 22
class MagnitudeFactory(EmbeddingFactory):
    def __init__(self, embedding_type: EmbeddingType):
        super().__init__(embedding_type)

        cache_dir = Path(fs.get_project_root_dir()) / ".magnitude"
        fs.mkdir_if_not_exists(cache_dir)
        embed_file = self._embedding_type.url[self._embedding_type.url.
                                              rfind("/") + 1:]
        compressed_file = Path(cache_dir) / embed_file
        if not compressed_file.exists():
            logger.info(
                '  Downloading magnitude file ("{}")...'.format(embed_file))
            wget.download(self._embedding_type.url, compressed_file)

        self._embed_file = compressed_file
        logger.info('  Loading Magnitude module...')
        self._magnitude_vecs = Magnitude(self._embed_file)

    def build(self, vocab_list: List[str],
              **kwargs) -> (List[str], List[str], Dict[str, Dict[str, Any]]):
        oov, iov = [], []

        vec_dict = {}
        for w in vocab_list:
            is_oov = w not in self._magnitude_vecs
            vec = self._magnitude_vecs.query(w)
            vec_dict[w] = {"vec": vec, "trainable": is_oov}

            if is_oov:
                oov.append(w)
            else:
                iov.append(w)

        return oov, iov, vec_dict
Example No. 23
def load_data(data_dir='./data'):
    """Loads all data in `data_dir` as a dict

  Each of `dev`, `train` and `test` contains (1) `raw` folder (2)
    `relations.json`. We don't need to worry about `raw` folder, and instead
    focus on `relations.json` which contains all the information we need for our
    classification task.

  Args:
    data_dir: str, the root directory of all data

  Returns:
    dict, where the keys are: `dev`, `train` and `test` and the values are lists
      of relations data in `relations.json`
  """
    assert os.path.exists(data_dir), "`data_dir` does not exist in `load_data`"

    data = {}
    vectors = Magnitude("glove.6B.50d.magnitude")
    #vectors = Magnitude("glove.6B.300d.magnitude")
    get_sense_dict(os.path.join(data_dir, "train"))
    #print(sense_dict)

    for folder in os.listdir(data_dir):
        #print(folder)
        print("Loading", folder)
        folder_path = os.path.join(data_dir, folder)
        #print(folder_path)
        data[folder] = load_relations(folder_path, vectors)
    '''
  print("Loading", "dev")
  folder_path = os.path.join(data_dir, "dev")
  data["dev"] = load_relations(folder_path, vectors)
  '''
    return data
Example No. 24
class GloveEncoder():
    """Encodes an input sentence as a mean or max pooled sentence embedding given the individual word embeddings"""
    def __init__(self, pooling='mean'):
        self.name = 'GloveEncoder'
        self.trainable_model = False
        self.pooling = pooling
        self.model = Magnitude(config.glove_magnitude_path)
        self.hidden_size = self.model.dim

    def sentence_embedding(self, input_text):
        words_in_model = [
            word for word in input_text.split() if word in self.model
        ]
        sentence_embedding = np.zeros((len(words_in_model), self.model.dim))
        sentence_embedding.fill(np.nan)

        for idx, token in enumerate(words_in_model):
            sentence_embedding[idx] = self.model.query(token)

        if self.pooling == 'max':
            sentence_embedding = np.max(sentence_embedding, axis=0)

        else:
            sentence_embedding = np.mean(sentence_embedding, axis=0)

        return torch.tensor(sentence_embedding.reshape(1, 1, -1),
                            device=DEVICE)
Example No. 25
    def load(self, path, blocking):
        # Require that the vector path exists; if the path is missing, Magnitude
        # would otherwise try to download the model from its servers
        if not path or not os.path.isfile(path):
            raise IOError(ENOENT, "Vector model file not found", path)

        # Load magnitude model. If this is a training run (no embeddings yet), block until the vectors are fully loaded
        return Magnitude(path, case_insensitive=True, blocking=blocking)
Example No. 26
class MagnitudeFactory(EmbeddingFactory):
    def __init__(self, embedding_type: EmbeddingType):
        super().__init__(embedding_type)

        cache_dir = Path(fs.get_project_root_dir()) / ".magnitude"
        fs.mkdir_if_not_exists(cache_dir)
        embed_file = self._embedding_type.url[self._embedding_type.url.
                                              rfind("/") + 1:]
        compressed_file = Path(cache_dir) / embed_file
        if not compressed_file.exists():
            logger.info(
                '  Downloading magnitude file ("{}")...'.format(embed_file))
            wget.download(self._embedding_type.url, compressed_file)

        self._embed_file = compressed_file
        logger.info('  Loading Magnitude module...')
        self._magnitude_vecs = Magnitude(self._embed_file)

    def build(self, vocab_list: List[str], h5_file: Path,
              **kwargs) -> (List[str], List[str]):
        oov, iov = [], []
        with h5py.File(h5_file, mode="w") as vec_h5:
            for w in vocab_list:
                is_oov = w not in self._magnitude_vecs
                vec = self._magnitude_vecs.query(w)
                vec_h5.create_dataset("{key}/vec".format(key=w), data=vec)
                vec_h5.create_dataset("{key}/trainable".format(key=w),
                                      data=1 if is_oov else 0)

                if is_oov:
                    oov.append(w)
                else:
                    iov.append(w)

        return oov, iov
Example No. 27
    def __init__(self, emdim):

        base_dir = os.path.join(os.path.dirname(__file__), os.pardir, 'data')

        self.fasttext_dim = 300
        self.glove_dim = emdim - 300

        assert self.glove_dim in [50, 100, 200,
                                  300], "Embedding dimension must be one of the following: 350, 400, 500, 600"

        print("Will download magnitude files from the server if they aren't avaialble locally.. So, grab a cup of coffee while the downloading is under progress..")
        glove = Magnitude(MagnitudeUtils.download_model('glove/medium/glove.6B.{}d'.format(self.glove_dim),
                                                        download_dir=os.path.join(base_dir, 'magnitude')), case_insensitive=True)
        fasttext = Magnitude(MagnitudeUtils.download_model('fasttext/medium/wiki-news-300d-1M-subword',
                                                           download_dir=os.path.join(base_dir, 'magnitude')), case_insensitive=True)
        self.vectors = Magnitude(glove, fasttext)
Example No. 28
def put_embeddings(self, rSubmission: RedditSubmission):
    vecs = Magnitude('word2vec/light/GoogleNews-vectors-negative300')
    # vecs = Magnitude('http://magnitude.plasticity.ai/word2vec/light/GoogleNews-vectors-negative300.magnitude')

    rSubmission.post_title_embedding = np.mean(vecs.query(
        rSubmission.post_title.split()),
                                               axis=0)

    if len(rSubmission.post_text) > 0:
        rSubmission.post_text_embedding = np.mean(vecs.query(
            rSubmission.post_text.split()),
                                                  axis=0)

    logger.info('Embedded submission: %s', rSubmission.post_title)

    return rSubmission
Example No. 29
def create_magnitude(case_insensitive=True, eager=False, **kwargs):
    vectors = Magnitude(
        MAGNITUDE_PATH,
        case_insensitive=case_insensitive,
        eager=eager,
        **kwargs)
    return vectors
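
A sketch of how the factory above might be used in a test; the keyword arguments shown are only examples of options that Magnitude accepts and that are forwarded through **kwargs:

vectors = create_magnitude(placeholders=5, ngram_oov=False)
try:
    # placeholders pads every returned vector with 5 extra dimensions
    print(vectors.dim, vectors.query("cat").shape)
finally:
    vectors.close()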
Example No. 30
    def __init__(self, embedding_type: EmbeddingType):
        super().__init__(embedding_type)

        cache_dir = Path(fs.get_project_root_dir()) / ".magnitude"
        fs.mkdir_if_not_exists(cache_dir)
        embed_file = self._embedding_type.url[self._embedding_type.url.
                                              rfind("/") + 1:]
        compressed_file = Path(cache_dir) / embed_file
        if not compressed_file.exists():
            logger.info(
                '  Downloading magnitude file ("{}")...'.format(embed_file))
            wget.download(self._embedding_type.url, compressed_file)

        self._embed_file = compressed_file
        logger.info('  Loading Magnitude module...')
        self._magnitude_vecs = Magnitude(self._embed_file)