Example #1
def fetch_conceptnet_numberbatch(clean_words=False):
    """
    Fetches ConceptNet Numberbatch embeddings. Embeddings are normalized to unit length,
    and the vocabulary terms are lowercase.

    Parameters
    ----------
    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at https://github.com/commonsense/conceptnet-numberbatch
    Reference paper: Robert Speer, Joshua Chin, and Catherine Havasi (2017). "ConceptNet 5.5: An Open Multilingual Graph of General Knowledge." In proceedings of AAAI 2017.
    """
    path = _fetch_file(
        url=
        'https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz',
        data_dir='embeddings',
        uncompress=False,
        verbose=1)
    # The published vectors are already unit length (see docstring),
    # so re-normalization is skipped here.
    return load_embedding(path,
                          format='word2vec',
                          normalize=False,
                          clean_words=clean_words)
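For context, a minimal usage sketch (not part of the library): it relies only on the `w.vocabulary.words` and `w.vectors` attributes that the tests below also use, plus the fact noted in the docstring that Numberbatch vectors are published unit-length.

import numpy as np

w = fetch_conceptnet_numberbatch()
word2id = {word: i for i, word in enumerate(w.vocabulary.words)}
cat, dog = w.vectors[word2id["cat"]], w.vectors[word2id["dog"]]
# Vectors are unit length, so the dot product already is the cosine similarity.
print(float(np.dot(cat, dog)))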
Example #2
def test_categorization():
    data = fetch_ESSLI_2c()
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    assert evaluate_categorization(w, data.X, data.y, seed=777,
                                   method="all") >= 0.2
Example #3
def test_analogy_solver():
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")

    w = Embedding.from_word2vec(file_name, binary=True)
    data = fetch_google_analogy()
    ids = np.random.RandomState(777).choice(range(data.X.shape[0]), 1000, replace=False)
    X, y = data.X[ids], data.y[ids]
    category = data.category_high_level[ids]

    results = evaluate_analogy(w=w, X=X, y=y, category=category)
    assert results['accuracy']['all'] >= 0.65
    assert results['accuracy']['semantic'] >= 0.7
    assert results['accuracy']['syntactic'] >= 0.63

    results = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul")
    assert results['accuracy']['all'] >= 0.7
    assert results['accuracy']['semantic'] >= 0.75
    assert results['accuracy']['syntactic'] >= 0.64

    results_mul = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul", k=400)
    results_add = evaluate_analogy(w=w, X=X, y=y, category=category, method="add", k=400)
    assert results_mul['accuracy']['all'] >= results_add['accuracy']['all']
    assert results_mul['accuracy']['syntactic'] >= results_add['accuracy']['syntactic']
    assert results_mul['accuracy']['semantic'] >= results_add['accuracy']['semantic']
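For reference, a sketch of the two scoring rules the method flag selects, in the style of Levy and Goldberg (2014): 3CosAdd (method="add") and 3CosMul (method="mul"). This illustrates the objectives only; it is not the library's implementation.

import numpy as np

def solve_analogy_sketch(V, a, b, c, method="add", eps=1e-3):
    # V: (n_words, dim) matrix; a, b, c: row indices of the question words
    # in "a is to b as c is to ?". Returns the index of the best candidate.
    U = V / np.linalg.norm(V, axis=1, keepdims=True)
    cos_a, cos_b, cos_c = U @ U[a], U @ U[b], U @ U[c]
    if method == "add":
        # 3CosAdd: argmax_d cos(d, b) - cos(d, a) + cos(d, c)
        scores = cos_b - cos_a + cos_c
    else:
        # 3CosMul, with cosines shifted into [0, 1]:
        # argmax_d cos(d, b) * cos(d, c) / (cos(d, a) + eps)
        pa, pb, pc = (cos_a + 1) / 2, (cos_b + 1) / 2, (cos_c + 1) / 2
        scores = pb * pc / (pa + eps)
    scores[[a, b, c]] = -np.inf  # never return a question word
    return int(np.argmax(scores))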
Example #4
def fetch_HDC(dim=300, normalize=True, lower=False, clean_words=False):
    """
    Fetches HDC embeddings trained on Wikipedia by Fei Sun

    Parameters
    ----------
    dim: int, default: 300
      Dimensionality of embedding. Available dimensionalities: 50, 100, 300

    normalize: bool, default: True
      If true will normalize all vectors to unit length

    lower: bool, default: False
      If true, will convert string to lowercase

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    Returns
    -------
    w: Embedding
      Embedding instance

    References
    ----------
    Embeddings were published on http://ofey.me/projects/wordrep/.
    Reference paper: Fei Sun, Jiafeng Guo, Yanyan Lan, Jun Xu, and Xueqi Cheng.
    "Learning word representations by jointly modeling syntagmatic and paradigmatic relations"
    """

    url = {
        50:
        "https://www.dropbox.com/s/q22ssy8055loknz/wikicorp.201004-hdc-"
        "iter-20-alpha-0.025-window-10-dim-50-neg-10-subsample-0.0001.txt.bz2?dl=1",
        100:
        "https://www.dropbox.com/s/13226et55fi6g50/wikicorp.201004-hdc-"
        "iter-20-alpha-0.025-window-10-dim-100-neg-10-subsample-0.0001.txt.bz2?dl=1",
        300:
        "https://www.dropbox.com/s/jrfwel32yd8w0lu/wikicorp.201004-hdc-"
        "iter-20-alpha-0.025-window-10-dim-300-neg-10-subsample-0.0001.txt.bz2?dl=1"
    }
    assert dim in url, "Unavailable dimensionality"

    path = _fetch_file(url=url[dim],
                       data_dir="embeddings",
                       uncompress=False,
                       move="hdc/hdc{}.txt.bz2".format(dim),
                       verbose=1)

    return load_embedding(path,
                          format="word2vec",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
Example #5
def test_similarity():
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    data = fetch_SimLex999()

    result_1 = evaluate_similarity(w, data.X, data.y)
    result_2 = evaluate_similarity(dict(zip(w.vocabulary.words, w.vectors)), data.X, data.y)

    assert result_2 > 0
    assert result_1 == result_2, "evaluate_similarity should return same result for dict and Embedding instance"
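A sketch of the protocol behind evaluate_similarity: Spearman correlation between human judgements and pairwise cosine similarities. It works from a plain word-to-vector dict, which is why the test expects the dict and the Embedding instance to score identically; out-of-vocabulary back-off is omitted here.

import numpy as np
from scipy.stats import spearmanr

def similarity_eval_sketch(word_vectors, X, y):
    # word_vectors: plain {word: vector} dict; X: (n, 2) word pairs;
    # y: human similarity judgements.
    def cos(u, v):
        return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
    scores = [cos(word_vectors[w1], word_vectors[w2]) for w1, w2 in X]
    return spearmanr(scores, y).correlation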
Example #6
def fetch_LexVec(which="commoncrawl-W+C",
                 normalize=True,
                 lower=False,
                 clean_words=False):
    """
    Fetches LexVec embeddings

    Parameters
    ----------
    which: str, default: "commoncrawl-W+C"
      Can choose between "commoncrawl-W", "commoncrawl-W+C", "wikipedia+newscrawl-W", "wikipedia+newscrawl-W+C", "commoncrawl-ngramsubwords-W"

    normalize: bool, default: True
      If true will normalize all vectors to unit length

    lower: bool, default: False
      If true, will convert string to lowercase

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at https://github.com/alexandres/lexvec
    Reference paper: Salle, Alexandre, Marco Idiart, and Aline Villavicencio. Matrix Factorization using Window Sampling and Negative Sampling for Improved Word Representations. The 54th Annual Meeting of the Association for Computational Linguistics. 2016.
    """
    download_file = {
        "commoncrawl-W":
        "https://www.dropbox.com/s/flh1fjynqvdsj4p/lexvec.commoncrawl.300d.W.pos.vectors.gz?dl=1",
        "commoncrawl-W+C":
        "https://www.dropbox.com/s/zkiajh6fj0hm0m7/lexvec.commoncrawl.300d.W%2BC.pos.vectors.gz?dl=1",
        "wikipedia+newscrawl-W":
        "https://www.dropbox.com/s/kguufyc2xcdi8yk/lexvec.enwiki%2Bnewscrawl.300d.W.pos.vectors.gz?dl=1",
        "wikipedia+newscrawl-W+C":
        "https://www.dropbox.com/s/u320t9bw6tzlwma/lexvec.enwiki%2Bnewscrawl.300d.W%2BC.pos.vectors.gz?dl=1",
        "commoncrawl-ngramsubwords-W":
        "https://www.dropbox.com/s/mrxn933chn5u37z/lexvec.commoncrawl.ngramsubwords.300d.W.pos.vectors.gz?dl=1"
    }

    path = _fetch_file(url=download_file[which],
                       data_dir="embeddings",
                       verbose=1)

    return load_embedding(path,
                          format="word2vec",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
Example #7
def test_similarity_norm():
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    w_norm = w.normalize_words()
    data = fetch_SimLex999()

    result_1 = evaluate_similarity(w, data.X, data.y)
    result_2 = evaluate_similarity(w_norm, data.X, data.y)

    assert result_2 > 0
    assert result_1 == result_2, "evaluate_similarity should return same result for normalized and unnormalized words"
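Why this assertion should hold: cosine similarity is invariant to rescaling the vectors, so unit-normalizing cannot change any pairwise similarity, nor a rank correlation computed from those similarities. A two-line check:

import numpy as np

cos = lambda a, b: np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
u, v = np.array([3.0, 4.0]), np.array([1.0, 2.0])
assert np.isclose(cos(u, v),
                  cos(u / np.linalg.norm(u), v / np.linalg.norm(v)))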
Example #8
def test_save():
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)

    dirpath = tempfile.mkdtemp()
    Embedding.to_word2vec(w, path.join(dirpath, "tmp.bin"), binary=True)
    Embedding.to_word2vec(w, path.join(dirpath, "tmp.txt"), binary=False)
    w2 = Embedding.from_word2vec(path.join(dirpath, "tmp.bin"), binary=True)
    w3 = Embedding.from_word2vec(path.join(dirpath, "tmp.txt"), binary=False)
    assert np.array_equal(w.vectors, w2.vectors)
    assert w.vocabulary.words == w2.vocabulary.words
    assert np.sum(np.abs(w.vectors - w3.vectors)) < 1e-5
    assert w.vocabulary.words == w3.vocabulary.words
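Why the binary round trip is compared with np.array_equal but the text round trip only with a 1e-5 tolerance: the binary format stores the raw float32 bytes, while text serialization goes through decimal formatting and can shave off the last bits. A small demonstration (the "%.6f" format is illustrative):

import numpy as np

rng = np.random.RandomState(0)
x = rng.rand(5).astype(np.float32)
x_text = np.array(["%.6f" % v for v in x], dtype=np.float32)  # text round trip
print(np.abs(x - x_text).sum())  # tiny but typically nonzero, hence the 1e-5 bound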
Example #9
def fetch_NMT(which="DE", normalize=True, lower=False, clean_words=False):
    """
    Fetches word embeddings induced by Neural Machine Translation

    Parameters
    ----------
    which: str, default: "DE"
      Can choose between DE and FR, which fetches accordingly EN -> DE or EN -> FR translation
      induced word embeddings

    normalize: bool, default: True
      If true will normalize all vectors to unit length

    lower: bool, default: False
      If true, will convert string to lowercase

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at https://www.cl.cam.ac.uk/~fh295/.
    Reference paper: Hill, Cho et al., "Embedding Word Similarity With Neural Machine Translation", 2014
    """
    dirname = _fetch_file(url="https://www.cl.cam.ac.uk/~fh295/TEmbz.tar.gz",
                          data_dir="embeddings",
                          uncompress=True,
                          verbose=1)

    assert which in ["DE", "FR"], "Unrecognized which parameter"

    fname = {
        "FR": "Trans_embds/D_RNN_500k_144h.pkl",
        "DE": "Trans_embds/D_german_50k_500k_168h.pkl"
    }

    return load_embedding(path.join(dirname, fname[which]),
                          format="dict",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
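A note on format="dict": the fetched .pkl files are pickled word-to-vector mappings (an assumption consistent with the loader name), so loading reduces to unpickling and stacking. The path below is illustrative, and the latin1 encoding is a common requirement for Python 2 era pickles.

import pickle
import numpy as np

# Illustrative path; it points at the file fetched above.
with open("embeddings/TEmbz/Trans_embds/D_RNN_500k_144h.pkl", "rb") as f:
    d = pickle.load(f, encoding="latin1")  # Python 2 era pickle
words = sorted(d)
vectors = np.vstack([np.asarray(d[word], dtype=np.float32) for word in words])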
Example #10
def test_standardize():
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")

    w = Embedding.from_word2vec(file_name, binary=True)
    w2 = w.standardize_words(inplace=False, lower=False, clean_words=True)
    w3 = Embedding.from_word2vec(file_name, binary=True)
    assert len(w2.words) == 95
    for word in w.vocabulary.words:
        if standardize_string(word, lower=False, clean_words=True):
            assert np.array_equal(w[word], w2[standardize_string(word, lower=False, clean_words=True)])

    w3.standardize_words(inplace=True, clean_words=True, lower=False)
    assert len(w3.words) == 95
    for word in w.vocabulary.words:
        if standardize_string(word, lower=False, clean_words=True):
            assert np.array_equal(w[word], w3[standardize_string(word, lower=False, clean_words=True)])
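A stand-in for standardize_string, to make the test's guard readable: with clean_words=True only ASCII alphanumerics plus "_" and "-" survive, so some words map to the empty string and are skipped. This mirrors the documented behaviour, not necessarily the library's exact code.

import re

def standardize_string_sketch(s, lower=False, clean_words=True):
    if lower:
        s = s.lower()
    if clean_words:
        # Keep only ASCII alphanumerics plus "_" and "-"; everything else,
        # including non-ASCII letters, is dropped (hence the docstring warning).
        s = re.sub(r"[^a-zA-Z0-9_\-]", "", s)
    return s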
Example #11
def fetch_morphoRNNLM(which, normalize=True, lower=False, clean_words=False):
    """*
    Fetches recursive morphological neural network embeddings

    Parameters
    ----------
    which: str, default: "CW"
      Can choose between CW and HSMN

    normalize: bool, default: True
      If true will normalize all vector to unit length

    clean_words: bool, default: True
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    load_kwargs:
      Additional parameters passed to load function. Mostly useful for 'glove' format where you
      should pass vocab_size and dim.

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at http://stanford.edu/~lmthang/morphoNLM/
    Reference paper: Luong, Socher et al., "Better Word Representations with Recursive Neural Networks for Morphology", 2013
    """
    download_file = {
        "CW": "https://www.dropbox.com/s/7fdj2666iqv4xbu/cwCsmRNN.bin.gz?dl=1",
        "HSMN":
        "https://www.dropbox.com/s/okw1i6kc6e2jd1q/hsmnCsmRNN.bin.gz?dl=1"
    }

    path = _fetch_file(url=download_file[which],
                       data_dir="embeddings",
                       uncompress=False,
                       verbose=1)

    return load_embedding(path,
                          format="word2vec_bin",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
Example #12
def fetch_HPCA(which, normalize=True, lower=False, clean_words=False):
    """
    Fetches Hellinger PCA-based embeddings

    Parameters
    ----------
    which: str, default: "autoencoder_phrase_hpca"
      Can choose between "hpca" and "autoencoder_phrase_hpca" (from "The Sum of Its Parts")

    normalize: bool, default: True
      If true will normalize all vector to unit length

    clean_words: bool, default: True
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    load_kwargs:
      Additional parameters passed to load function. Mostly useful for 'glove' format where you
      should pass vocab_size and dim.

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Published at http://lebret.ch/words/
    Reference paper: Lebret, Collobert et al., "'The Sum of Its Parts': Joint Learning of Word and Phrase Representations with Autoencoders", 2015
    """
    download_file = {
        "autoencoder_phrase_hpca":
        "https://www.dropbox.com/s/6dyf48crdmjbw1a/AHPCA.bin.gz?dl=1",
        "hpca": "http://lebret.ch/words/embeddings/200/words.txt"
    }

    path = _fetch_file(url=download_file[which],
                       data_dir="embeddings",
                       uncompress=False,
                       verbose=1)

    return load_embedding(path,
                          format="word2vec_bin",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
Example #13
def fetch_SG_GoogleNews(normalize=True, lower=False, clean_words=False):
    """
    Fetches SG (skip-gram with negative sampling) embeddings trained on the
    GoogleNews dataset, as published on the word2vec website

    Parameters
    ----------
    normalize: bool, default: True
      If true will normalize all vectors to unit length

    lower: bool, default: False
      If true, will convert string to lowercase

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    Returns
    -------
    w: Embedding
      Instance of Embedding class

    References
    ----------
    Original source: https://code.google.com/p/word2vec/
    """
    path = _fetch_file(
        url=
        "https://www.dropbox.com/s/bnm0trligffakd9/GoogleNews-vectors-negative300.bin.gz?dl=1",
        data_dir="embeddings",
        verbose=1)
    return load_embedding(path,
                          format="word2vec_bin",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words)
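A minimal nearest-neighbour query against the returned embedding, again using only the `w.vocabulary.words` and `w.vectors` attributes the tests rely on; the query word and the choice of five neighbours are illustrative.

import numpy as np

w = fetch_SG_GoogleNews()
words = list(w.vocabulary.words)
U = w.vectors / np.linalg.norm(w.vectors, axis=1, keepdims=True)
query = U[words.index("king")]
for i in np.argsort(-U @ query)[1:6]:  # index 0 is the query word itself
    print(words[i])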
Example #14
def test_wordrep_solver():
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    P = evaluate_on_WordRep(w, max_pairs=2)
    assert P['accuracy']['all'] >= 0
Example #15
def test_semeval_solver():
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    results = evaluate_on_semeval_2012_2(w)
    assert results['all'] >= 0, "Should have some results on SemEval2012"
Example #16
def fetch_GloVe(dim=300,
                corpus="wiki-6B",
                normalize=True,
                lower=False,
                clean_words=False):
    """1
    Fetches GloVe embeddings.

    Parameters
    ----------
    dim: int, default: 300
      Dimensionality of embedding (usually performance increases with dimensionality).
      Available dimensionalities:
        * wiki-6B: 50, 100, 200, 300
        * common-crawl-42B: 300
        * common-crawl-840B: 300
        * twitter: 25, 50, 100, 200

    corpus: string, default: "wiki-6B"
      Corpus that the GloVe vectors were trained on.
      Available corpora: "wiki-6B", "common-crawl-42B", "common-crawl-840B", "twitter-27B"

    normalize: bool, default: True
      If true will normalize all vectors to unit length

    lower: bool, default: False
      If true, will convert string to lowercase

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    Returns
    -------
    w: Embedding
      Embedding instance

    References
    ----------
    Project website: http://nlp.stanford.edu/projects/glove/

    Notes
    -----
    Loading GloVe format can take a while
    """
    download_file = {
        "wiki-6B": "http://nlp.stanford.edu/data/glove.6B.zip",
        "common-crawl-42B": "http://nlp.stanford.edu/data/glove.42B.300d.zip",
        "common-crawl-840B":
        "http://nlp.stanford.edu/data/glove.840B.300d.zip",
        "twitter-27B": "http://nlp.stanford.edu/data/glove.twitter.27B.zip"
    }

    embedding_file = {
        "wiki-6B": {
            50: "glove.6B/glove.6B.50d.txt",
            100: "glove.6B/glove.6B.100d.txt",
            200: "glove.6B/glove.6B.200d.txt",
            300: "glove.6B/glove.6B.300d.txt"
        },
        "common-crawl-42B": {
            300: "glove.42B.300d/glove.42B.300d.txt"
        },
        "common-crawl-840B": {
            300: "glove.840B.300d/glove.840B.300d.txt"
        },
        "twitter-27B": {
            25: "glove.twitter.27B/glove.twitter.27B.25d.txt",
            50: "glove.twitter.27B/glove.twitter.27B.50d.txt",
            100: "glove.twitter.27B/glove.twitter.27B.100d.txt",
            200: "glove.twitter.27B/glove.twitter.27B.200d.txt",
        }
    }

    vocab_size = {
        "wiki-6B": 400000,
        "common-crawl-42B": 1917494,
        "common-crawl-840B": 2196017,
        "twitter-27B": 1193514
    }

    assert corpus in download_file, "Unrecognized corpus"
    assert dim in embedding_file[corpus], "Unavailable dimensionality"

    _ = _fetch_file(url=download_file[corpus],
                    data_dir="embeddings",
                    uncompress=True,
                    verbose=1)

    return load_embedding(path.join(_get_dataset_dir("embeddings"),
                                    embedding_file[corpus][dim]),
                          format="glove",
                          normalize=normalize,
                          lower=lower,
                          clean_words=clean_words,
                          load_kwargs={"vocab_size": vocab_size[corpus], "dim": dim})