Example #1
def load_word_vectors(word_vectors_name,
                      embedding_size,
                      word_vectors_cache='../data/word_vectors_cache'):

    implemented_vector_embeddings = ('GloVe_6B', 'GloVe_42B', 'GloVe_840B',
                                     'GloVe_twitter.27B', 'FastText_en')
    assert word_vectors_name in implemented_vector_embeddings

    word_vectors = None

    if word_vectors_name == 'GloVe_6B':
        assert embedding_size in (50, 100, 200, 300)
        word_vectors = GloVe(name='6B',
                             dim=embedding_size,
                             cache=word_vectors_cache)

    if word_vectors_name == 'GloVe_42B':
        embedding_size = 300
        word_vectors = GloVe(name='42B', cache=word_vectors_cache)

    if word_vectors_name == 'GloVe_840B':
        embedding_size = 300
        word_vectors = GloVe(name='840B', cache=word_vectors_cache)

    if word_vectors_name == 'GloVe_twitter.27B':
        assert embedding_size in (25, 50, 100, 200)
        word_vectors = GloVe(name='twitter.27B',
                             dim=embedding_size,
                             cache=word_vectors_cache)

    if word_vectors_name == 'FastText_en':
        embedding_size = 300
        word_vectors = FastText(language='en', cache=word_vectors_cache)

    return word_vectors, embedding_size
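A minimal usage sketch for the loader above (a hypothetical call, assuming GloVe and FastText are imported from torchnlp.word_to_vector and the vectors are already cached or can be downloaded):

from torchnlp.word_to_vector import GloVe, FastText

# 100-dimensional GloVe 6B vectors, cached under ../data/word_vectors_cache
word_vectors, dim = load_word_vectors('GloVe_6B', embedding_size=100)
print(dim)                          # 100
print(word_vectors['hello'].shape)  # torch.Size([100])

Note that for the fixed-size vector sets ('GloVe_42B', 'GloVe_840B', 'FastText_en') the returned embedding size is overridden to 300 regardless of the value passed in.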
Example #2
 def load_embedding(self):
     word2id = self.opt.word2id
     logging.info('Load embedding from pytorch-nlp.')
     if self.opt.embedding_cache:
         embedding_dict = GloVe(cache=self.opt.embedding_cache)  # load embedding cache from a specific place
     else:
         embedding_dict = GloVe()  # load embedding cache from local dir or download now
     logging.info('Load embedding finished.')
     self.embedding_layer.weight.data.uniform_(-0.25, 0.25)
     for word, idx in word2id.items():
         if word in embedding_dict.stoi:
             self.embedding_layer.weight.data[idx] = embedding_dict[word]
     logging.info('Word embedding size: {0}'.format(self.embedding_layer.weight.data.size()))
Example #3
def get_input_embeddings(tokens_seq, tokens_vocab):

    vocab_size = len(tokens_vocab)

    print("Get pretrained embeddings")
    embeddings_vectors = {}
    for name_, size_ in [("6B", 100), ("840B", 300)]:
        glove_vectors = GloVe(
            name=name_,
            dim=size_,
            cache="/home/[email protected]/resources/embeddings/glove")

        vocab_vectors = np.zeros((vocab_size, size_))

        # If a token is OOV (its GloVe vector comes back all zeros), fall back to a random vector
        for token, id_ in tokens_vocab.vocab.items():
            if not (vocab_vectors[id_, :] == 0).all():
                continue
            else:
                vector = glove_vectors[token]
                if not (vector == 0).all():
                    vocab_vectors[id_, :] = vector
                else:
                    vocab_vectors[id_, :] = [
                        random.uniform(-1.5, 1.7) for _ in range(size_)
                    ]

        vocab_vectors = torch.tensor(vocab_vectors)
        embeddings_vectors[(name_, size_)] = vocab_vectors

    print("Finished")
    return embeddings_vectors
Example #4
def form_dataset(create_new, path_full, path_processed, need_hist, use_glove: bool, data_range=0):
    # Preprocess data if needed, else open processed file
    if create_new:
        all_quotes, vocabulary,\
        word_count, total_word_count,\
        end_token, quote_count = dp().preprocess(path_data_full=path_full,
                                                 path_data_processed=path_processed, data_range=data_range)
    else:
        all_quotes, vocabulary, \
        word_count, total_word_count, \
        end_token, quote_count = dp().open_preprocessed(path_data_processed=path_processed)

    # If need histograms, draw them:
    if need_hist:
        util.draw_histograms(all_quotes, vocabulary, word_count)

    # If using GloVe, then use this:
    if use_glove:
        # Create datasets and prepare embeddings:
        glove = GloVe('6B')
        # Get embeddings
        embeddings = []
        # Optionally prepend a zero vector to serve as the padding embedding:
        #embeddings.append(torch.zeros_like(glove['word']))
        for word in word_count.keys():
            embeddings.append(glove[word])
    else:
        embeddings = None

    all_quotes = util.words_to_label(all_quotes, vocabulary)
    x_data = all_quotes[:int(len(all_quotes) * 0.8)]
    y_data = all_quotes[int(len(all_quotes) * 0.8):]
    dataset_train = QuoteDataset(x_data, end_token)
    dataset_test = QuoteDataset(y_data, end_token)
    return dataset_train, dataset_test, vocabulary, word_count, total_word_count, end_token, quote_count, embeddings
Example #5
    def __init__(
        self,
        vocab=None,
        name='840B',
        dim=300,
        trainable=False,
    ):

        super(GloveEmbedding, self).__init__()

        self.vocab_size = len(vocab)
        self.vocab = vocab
        self.name = name
        self.dim = dim

        vectors = GloVe(name=self.name, dim=self.dim)
        self.weights = torch.zeros(self.vocab_size, vectors.dim)
        for idx in range(self.vocab_size):
            self.weights[idx, :] = vectors[self.vocab[idx]]

        self.embedding = nn.Embedding(self.vocab_size, self.dim)
        self.embedding.weight.data = torch.Tensor(self.weights)

        if not trainable:
            self.embedding.weight.requires_grad = False
Example #6
 def load_pretrained_embeddings(self, dim):
     vectors = GloVe(name="6B", dim=dim)
     embeddings = torch.stack([
         vectors[self.id_to_word[ind]]
         for ind in tqdm(range(len(self.id_to_word)))
     ])
     embeddings = embeddings.type(torch.float64)
     return embeddings
Example #7
def word_vec(list_idx):

    list_word_vec = []
    vec = GloVe(name='840B', dim=300)
    for w in list_idx:
        list_word_vec.append(vec[w])
    sentence_word_vec = torch.stack(list_word_vec)
    #print (sentence_word_vec.shape)
    return sentence_word_vec
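A quick sanity check of word_vec, assuming GloVe is imported from torchnlp.word_to_vector and torch is available (the 840B vectors are roughly a 2 GB download on first use):

sentence = ['the', 'cat', 'sat']
print(word_vec(sentence).shape)  # torch.Size([3, 300])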
Example #8
def convert_to_vector_representation(data):
    glv = GloVe()
    vectorized_data = []
    for document, y in data:
        vector = []
        for word in document:
            word_embed = glv[word]
            vector.append(word_embed)
        vectorized_data.append((vector, y))
    return vectorized_data
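A hedged usage sketch for the converter above with a tiny hypothetical dataset; each document is a list of tokens paired with a label (note that GloVe() with no arguments loads the full 840B/300d vectors):

data = [(['good', 'movie'], 1), (['terrible', 'plot'], 0)]
vectorized = convert_to_vector_representation(data)
print(len(vectorized))        # 2 documents
print(len(vectorized[0][0]))  # 2 word vectors for the first document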
Example #9
def load_embedding(opt, word2id):
    if opt.word_embedding:
        logging.info('Load embedding from file.')
        raise NotImplementedError
    else:
        logging.info('Load embedding from pytorch-nlp.')
        if opt.embedding_cache:
            embedding_dict = GloVe(cache=opt.embedding_cache) # load embedding cache from a specific place
        else:
            embedding_dict = GloVe() # load embedding cache from local dir or download now
        logging.info('Load embedding finished.')
        pad_id = word2id['<pad>']
        n_v = len(word2id)
        n_d = opt.word_dim
        embedding_layer = nn.Embedding(n_v, n_d, padding_idx=pad_id)
        embedding_layer.weight.data.uniform_(-0.25, 0.25)
        for word, idx in word2id.items():
            if word in embedding_dict.stoi:
                embedding_layer.weight.data[idx] = embedding_dict[word]
        logging.info('Word embedding size: {0}'.format(embedding_layer.weight.data.size()))
    return embedding_layer
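A hedged usage sketch for load_embedding, using a hypothetical options namespace and a toy vocabulary; it assumes the module's own imports (GloVe, torch.nn, logging) are in place, and with embedding_cache left as None it fetches the default 840B GloVe vectors on first use:

from argparse import Namespace

opt = Namespace(word_embedding=None, embedding_cache=None, word_dim=300)
word2id = {'<pad>': 0, 'hello': 1, 'world': 2}
embedding_layer = load_embedding(opt, word2id)
print(embedding_layer.weight.shape)  # torch.Size([3, 300])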
Example #10
    def get_w2v_input(self):
        from torchnlp.word_to_vector import GloVe
        vectors = GloVe()
        indexeds = []

        for idx in self.batch_index:
            text = self.raw_input[idx]
            pad_num = self.max_len - len(text)
            indexeds.append(
                torch.cat((vectors[text], torch.zeros([pad_num, 300])), 0))

        return torch.stack(indexeds)
Example #11
def make_embedding(vocab, d):
    '''create and save a (|V| x d) embedding matrix'''
    vectors = GloVe()
    dim = vectors['hi'].shape[0]
    embeddings = []
    for word, _ in vocab.items():
        if word == "PAD":
            vec = torch.zeros(dim).float()
        else:
            vec = vectors[word]
        embeddings.append(vec)
    embd = torch.stack(embeddings)
    torch.save(embd, '%s/embd.pt' % d)
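A minimal sketch of calling make_embedding, assuming torch and GloVe are imported as in the snippet above; the vocab here is a hypothetical dict with a PAD entry, and the default GloVe() call pulls the 840B/300d vectors:

import torch

vocab = {'PAD': 0, 'hello': 1, 'world': 2}
make_embedding(vocab, d='.')
embd = torch.load('./embd.pt')
print(embd.shape)  # torch.Size([3, 300])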
Example #12
def test_glove_6b_50(mock_urlretrieve):
    directory = 'tests/_test_data/glove/'

    # Make sure URL has a 200 status
    mock_urlretrieve.side_effect = urlretrieve_side_effect

    # Attempt to parse a subset of GloVe
    vectors = GloVe(name="6B", dim="50", cache=directory)
    assert len(vectors['the']) == 50

    # Test with the unknown characters
    assert len(vectors['漢字']) == 50

    # Clean up
    os.remove(directory + 'glove.6B.50d.txt.pt')
Example #13
 def load_w2v_models(self):
     original_path =  os.getcwd()
     # os.chdir(self.cf.folder_of_data + 'all_data/') # this is where the data models are stored
     os.chdir(self.cf.folder_of_data) # this is where the data models are stored
     
     print('loading google w2v')
     IMDB_dataset.google_model = gensim.models.KeyedVectors.load_word2vec_format(self.cf.folder_of_data + 
                                                             '/models/GoogleNews-vectors-negative300.bin', binary=True)      
     print('loading fasttext w2v')
     IMDB_dataset.fasttext_model = FastText(language="simple")
     print('loading glove w2v')
     IMDB_dataset.glove_model = GloVe(name='6B', dim=300)
     print('building/loading custom w2v')
     IMDB_dataset.custom_model = build_w2v_model(self.cf, self.data)   
     
     os.chdir(original_path) # restore the original path
Example #14
def get_lstm_parsed_sentences_and_embeddings(data):
    processed_data = ProcessDataset(data, max_vocab_size=len(data))
    words_counter = processed_data.build_counter()
    vocab, index_to_vocab = processed_data.build_vocab(
        words_counter=words_counter, max_vocab_size=len(data))
    sentences = np.array([
        pad_features(processed_data[i][0]) for i in range(len(data))
    ])
    pretrained_embedding = GloVe(name='6B',
                                 dim=300,
                                 is_include=lambda w: w in vocab.keys())
    embedding_weights = torch.Tensor(len(vocab.keys()),
                                     pretrained_embedding.dim)
    for num, word in index_to_vocab.items():
        embedding_weights[num] = pretrained_embedding[word]
    return sentences, embedding_weights
Example #15
 def __init__(self, n_vocab, n_embed, n_hidden, n_layer, vocab=None):
     super(KnowledgeEncoder, self).__init__()
     self.n_vocab = n_vocab
     self.n_embed = n_embed
     self.n_hidden = n_hidden
     self.n_layer = n_layer
     if vocab is None:
         self.embedding = nn.Embedding(n_vocab, n_embed)
     else:
         embedding = torch.Tensor(n_vocab, n_embed)
         vectors = GloVe()
         for word in vocab.stoi:
             if word in vectors.stoi:
                 embedding[vocab.stoi[word]] = vectors[word]
         self.embedding = nn.Embedding.from_pretrained(embedding)
         print("Kencoder embedding is initialized with Glove")
     self.gru = nn.GRU(input_size=n_embed, hidden_size=n_hidden,
                       num_layers=n_layer, bidirectional=True)
Example #16
def get_comment_embed(
        string,
        glove_embedding=None,
        corpus_vocab_prob_file='pandas_objects/corpus_vocab_prob.pkl'):
    """
    
    Parameters
    ----------
    string : str
        Comment associated (or not) with image.
    glove_embedding : GloVe, optional
        Preloaded GloVe vectors; loaded on demand when None.
    corpus_vocab_prob_file : str, optional
        File location of corpus vocab probability pickle file.
        The default is 'pandas_objects/corpus_vocab_prob.pkl'.

    Returns
    -------
    comment_embedding : torch.Tensor
        [1, 100]

    """

    string_list = preprocess_comments(string, input_type='string').split(" ")
    string_list = list(filter(lambda x: x != "", string_list))

    if glove_embedding is None:
        glove_embedding = GloVe(name="6B",
                                dim=100,
                                is_include=lambda w: w in set(string_list))

    corpus_vocab_prob = pd.read_pickle(corpus_vocab_prob_file)

    comment_embedding = torch.zeros([100])  # Summary vector

    for word in string_list:
        word_embedding = glove_embedding[word]
        try:
            word_prob = corpus_vocab_prob[word]
            comment_embedding = comment_embedding + (1e-3 /
                                                     (1e-3 + word_prob) *
                                                     word_embedding)
        except KeyError:
            print('Word not in Flickr Corpus. WORD: ', word)

    return comment_embedding
Example #17
def main(vocab_file):

    GloVe_vectors = GloVe(name='6B', dim=cfg.EMBEDDING_SIZE)

    embeddings = torch.Tensor(cfg.VOCAB_SIZE + 3, cfg.EMBEDDING_SIZE)
    word2idx, idx2word = {}, {}

    word2idx[cfg.SENTENCE_START] = cfg.VOCAB_SIZE
    word2idx[cfg.SENTENCE_END] = cfg.VOCAB_SIZE + 1
    word2idx[cfg.UNKNOWN] = cfg.VOCAB_SIZE + 2

    idx2word[cfg.VOCAB_SIZE] = cfg.SENTENCE_START
    idx2word[cfg.VOCAB_SIZE + 1] = cfg.SENTENCE_END
    idx2word[cfg.VOCAB_SIZE + 2] = cfg.UNKNOWN


    with open(vocab_file, 'r') as reader:

        for i, line in enumerate(reader):

            token, count = line.split(' ')

            embeddings[i] = GloVe_vectors[token]
            word2idx[token] = i
            idx2word[i] = token

    # Start, end, and unknown tokens
    embeddings[-3] = torch.cat((torch.zeros(cfg.EMBEDDING_SIZE // 2),
                                torch.ones(cfg.EMBEDDING_SIZE // 2)), 0)
    embeddings[-2] = torch.cat((torch.ones(cfg.EMBEDDING_SIZE // 2),
                                torch.zeros(cfg.EMBEDDING_SIZE // 2)), 0)
    embeddings[-1] = torch.zeros(cfg.EMBEDDING_SIZE)

    torch.save(embeddings, 'GloVe_embeddings.pt')
    word2idx = json.dumps(word2idx)
    idx2word = json.dumps(idx2word)

    with open("word2idx.json", 'w') as json_writer:
        json_writer.write(word2idx)

    with open("idx2word.json", 'w') as json_writer:
        json_writer.write(idx2word)
Example #18
    def __init__(self, vocab=None, name="840B", dim=100, trainable=False):

        super(GloveEmbedding, self).__init__()

        self.vocab_size = len(vocab)
        self.vocab = vocab
        self.name = name
        self.dim = dim
        # pdb.set_trace()
        vectors = GloVe(name=self.name, dim=self.dim)
        self.weights = torch.zeros(self.vocab_size, vectors.dim)

        for i, idx in enumerate(list(self.vocab.idx2word.keys())):
            self.weights[i, :] = vectors[self.vocab[idx]]

        self.embedding = nn.Embedding(self.vocab_size, self.dim)
        self.embedding.weight.data = torch.Tensor(self.weights)

        if not trainable:
            self.embedding.weight.requires_grad = False
Example #19
def get_word_vec_model(cf, reviews):
    print('--- Loading', cf.word_corpus_4_text_understanding, " pre-trained Word2Vec model")
    if  cf.word_corpus_4_text_understanding == 'Google_news' :        
        word_vec_model = gensim.models.KeyedVectors.load_word2vec_format(cf.folder_of_data + 
                                                                    '/models/GoogleNews-vectors-negative300.bin', binary=True)      
        
    elif cf.word_corpus_4_text_understanding=='Brown':        
        word_vec_model = gensim.models.Word2Vec(brown.sents()) # using Brown corpus
    elif cf.word_corpus_4_text_understanding == 'Fasttext':
        word_vec_model = FastText(language="simple")
    elif cf.word_corpus_4_text_understanding == 'Glove':
        word_vec_model = GloVe(name='6B', dim=300) 
    elif cf.word_corpus_4_text_understanding == 'CharNGram':
        word_vec_model = CharNGram()
    elif cf.word_corpus_4_text_understanding == 'Custom':
        word_vec_model = build_w2v_model(cf, reviews)        
    else:
        print('Error: Please select a word vector model')
    print('--- Loading', cf.word_corpus_4_text_understanding, ' done')
    
    return word_vec_model # , custom_model
Example #20
def get_torch_glove(torch_glove_type):
    """A helper function that uses the torchnlp built-in GloVe getter.

    :param torch_glove_type: a string, name of the GloVe embedding
    :return: bool, whether the get operation worked.
    """
    logging.info(
        f'Downloading GloVe vectors from TorchNLP for {torch_glove_type}')
    # set path for download (cache in torchnlp)
    torch_glove_path = torch_glove_type.replace(".", "_")

    torch_glove_folder = os.sep.join(
        [EMBEDDING_FOLDER, f'torch_glove_{torch_glove_path}'])
    # run torchnlp method for GloVe download
    directories = []

    if os.path.exists(torch_glove_folder):
        directories = os.listdir(torch_glove_folder)

    if len(directories) == 0:
        GloVe(name=torch_glove_type, cache=torch_glove_folder)
        directories = os.listdir(torch_glove_folder)

    if len(directories) > 0:
        directories = [
            x for x in directories
            if not x.endswith('.pt') and not x.endswith('.zip')
        ]
        write_pickle(directories, torch_glove_folder)

    if directories:
        return True
    else:
        return False
Example #21
 def __init__(self, n_vocab, n_embed, n_hidden, n_layer, vocab=None):
     super(Decoder, self).__init__()
     self.n_vocab = n_vocab
     self.n_embed = n_embed
     self.n_hidden = n_hidden
     self.n_layer = n_layer
     if vocab is None:
         self.embedding = nn.Embedding(n_vocab, n_embed)
     else:
         embedding = torch.Tensor(n_vocab, n_embed)
         vectors = GloVe()
         for word in vocab.stoi:
             if word in vectors.stoi:
                 embedding[vocab.stoi[word]] = vectors[word]
         self.embedding = nn.Embedding.from_pretrained(embedding)
         print("decoder embedding is initialized with Glove")
     self.attention = Attention(n_hidden)
     self.y_weight = nn.Linear(n_hidden, n_hidden)
     self.k_weight = nn.Linear(n_hidden, n_hidden)
     self.z_weight = nn.Linear(2 * n_hidden, n_hidden)
     self.y_gru = nn.GRU(n_embed + n_hidden, n_hidden, n_layer)
     self.k_gru = nn.GRU(3 * n_hidden, n_hidden, n_layer)
     self.out = nn.Linear(2 * n_hidden, n_vocab)
Example #22
    def __init__(self,
                 vocab,
                 sentences,
                 model="elmo",
                 attention="vanilla",
                 k=0):
        super().__init__()
        self.name = f"model={model}_attention={attention}_k={k}"
        self.k = k
        self.bert = model == "bert"

        if self.bert:
            self.bert_model = BertModel.from_pretrained('bert-base-uncased')
            self.bert_model.train()
            hidden_size = 768
            double = hidden_size
            self.dropout_on_input_to_linear_layer = nn.Dropout(0.1)
        else:
            hidden_size = 128
            self.cached = dict()
            self.init_elmo(vocab, sentences)
            self.glove = GloVe()

            self.lstm1 = nn.LSTM(input_size=1324,
                                 hidden_size=hidden_size,
                                 batch_first=True,
                                 bidirectional=True)
            double = hidden_size * 2
            self.dropout_on_input_to_LSTM = nn.Dropout(0.5)
            self.dropout_on_input_to_linear_layer = nn.Dropout(0.3)

        # Initialise parameters for classification & attention layers
        self.output_projection = nn.Linear(double * (2 if k > -1 else 1), 2)
        if attention == "general":
            self.attention = GeneralAttention(double)
        else:
            self.attention = HierarchicalAttention(double, self.bert)
Example #23
amazon_test_csv = pd.read_csv(test_csv, header=None,
                              names=['sentiment', 'title', 'review'])
test_data = get_data(amazon_test_csv)
test_texts = retrieve_texts(test_data)
_, test_indices, _ = docs2idxs(test_texts, max_len=training_seq_len, encoder=enc)
test_indices_and_sentiment = [(idxs, d[0]) for (idxs, d) in zip(test_indices, test_data)]

training_titles = set(amazon_training_csv['title'])
training_reviews = set(amazon_training_csv['review'])
for idx, item in amazon_test_csv.iterrows():
    if item['review'] in training_reviews:
        if item['title'] in training_titles:
            raise AssertionError("Row w/ title {} redundant.".format(item['title']))

vecs = GloVe(cache=config.get('PREPARATION', 'word_vector_cache'))
embedding_weights = weights(enc, vecs)

embedding_model = Sequential()
embedding_model.add(Embedding(enc.vocab_size,
                              vecs.dim,
                              weights=[embedding_weights],
                              input_length=training_seq_len,
                              trainable=False))
embedding_model.compile('rmsprop', 'mse')

input_shape = (training_seq_len, vecs.dim, 1)

x_train_unshaped = [embedding_model.predict(np.array(sample[0]).reshape(1, -1)) for sample in
                    training_indices_and_sentiment]  # shape n * (1 * seq_len * vector_dim)
x_test_unshaped = [embedding_model.predict(np.array(sample[0]).reshape(1, -1)) for sample in
                    test_indices_and_sentiment]  # shape n * (1 * seq_len * vector_dim)
Example #24
File: glove.py  Project: dedoogong/mmerc
import torch
from torchnlp.encoders.text import WhitespaceEncoder
from torchnlp.word_to_vector import GloVe

encoder = WhitespaceEncoder(["now this ain't funny", "so don't you dare laugh"])

vocab = set(encoder.vocab)
pretrained_embedding = GloVe(name='6B', dim=300, is_include=lambda w: w in vocab)
embedding_weights = torch.Tensor(encoder.vocab_size, pretrained_embedding.dim)
for i, token in enumerate(encoder.vocab):
    embedding_weights[i] = pretrained_embedding[token]
print("")
Example #25
#!/usr/bin/env python
# encoding: utf-8
"""
@author: HuRuiFeng
@file: lesson85-time-series-expression.py
@time: 2020/7/24 14:13
@project: deeplearning-with-pytorch-notes
@desc: Lesson 85: Time-series representation
"""

import torch
from torch import nn
import torchnlp

from torchnlp import word_to_vector
from torchnlp.word_to_vector import GloVe

print("-----word2vec vs GloVe-----")
print("====word2vec=====")
word_to_ix = {"hello": 0, "world": 1}
lookup_tensor = torch.tensor([word_to_ix['hello']], dtype=torch.long)

embeds = nn.Embedding(2, 5)
hello_embed = embeds(lookup_tensor)
print(hello_embed)
print()

print("====GloVe=====")
vectors = GloVe()
print(vectors['hello'])
Example #26
 def __init__(self):
     self.stop_words = stopwords.words("english")
     self.glove_vectors = GloVe(name='6B')
     self.lemma = WordNetLemmatizer()
     self.token_index = {}
     return
Example #27
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
from torchnlp.word_to_vector import GloVe

import numpy as np

glove_embedding = GloVe(
    cache=
    "../../Twitter_Ideology_Prediction/data/post_processing/.word_vectors_cache"
)

# print(vars(glove_embedding).keys())
# print(dir(glove_embedding))
# print(type(glove_embedding.stoi))

vocabulary = list(glove_embedding.stoi.keys())
dictionary = dict(zip(list(range(len(vocabulary))), vocabulary))
word2idx = glove_embedding.stoi  # reversed_dict
'''
print(len(vocabulary))
print(word2idx["trump"])
print(word2idx["Trump"])
print(word2idx["obama"])
print(word2idx["Obama"])
print(word2idx["democratic"])
print(word2idx["Democratic"])
print(word2idx["republican"])
print(word2idx["Republican"])
'''

# n = 1000
Example #28


if __name__ == "__main__":

    for string in strings:
        similarities = []
        string_list = preprocess_comments(string,
                                          input_type='string').split(" ")
        string_list = list(filter(lambda x: x != "", string_list))
        glove_embedding = GloVe(name="6B",
                                dim=100,
                                is_include=lambda w: w in set(string_list))

        for img_name in img_names:
            similarities.append(
                [img_name, main(string, img_name, glove_embedding)])

        similarities.sort(key=lambda x: x[1])
        similarities.reverse()

        print('From most similar to least similar...')
        for i in range(1, len(similarities) + 1):
            print('IMG: ', similarities[i - 1], 'IMG RANK:', i)
Example #29
    def _build_dataloader(self):
        self.val_loader = self.corpus = None
        if self.dataset_kind == "mnist":
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ])
            self.dataset = MNISTBufferedDataset(self.data_dir,
                                                download=True,
                                                train=True,
                                                transform=transform)
            self.val_dataset = MNISTBufferedDataset(self.data_dir,
                                                    download=True,
                                                    transform=transform)

            self.train_sampler = MNISTSequenceSampler(
                self.dataset,
                sequences=self.sequences,
                batch_size=self.batch_size,
                random_mnist_images=not self.static_digit,
                randomize_sequence_cursors=self.randomize_sequence_cursors,
                noise_buffer=self.noise_buffer,
                use_mnist_pct=self.use_mnist_pct,
                max_batches=self.batches_in_epoch,
            )

            if self.static_digit:
                # For the static digit paradigm, val & train samplers must
                # match to ensure the same digit prototype is used for each sequence item.
                self.val_sampler = self.train_sampler
            else:
                self.val_sampler = MNISTSequenceSampler(
                    self.val_dataset,
                    sequences=self.sequences,
                    batch_size=self.batch_size,
                    random_mnist_images=not self.static_digit,
                    randomize_sequence_cursors=self.randomize_sequence_cursors,
                    noise_buffer=self.noise_buffer,
                    use_mnist_pct=self.use_mnist_pct,
                    max_batches=self.eval_batches_in_epoch,
                )
            self.train_loader = DataLoader(
                self.dataset,
                batch_sampler=self.train_sampler,
                collate_fn=pred_sequence_collate,
            )
            self.val_loader = DataLoader(
                self.val_dataset,
                batch_sampler=self.val_sampler,
                collate_fn=pred_sequence_collate,
            )

        elif self.dataset_kind == "ptb":
            # Download "Penn Treebank" dataset
            from torchnlp.datasets import penn_treebank_dataset

            print("Maybe download PTB...")
            penn_treebank_dataset(self.data_dir + "/PTB",
                                  train=True,
                                  test=True)
            corpus = lang_util.Corpus(self.data_dir + "/PTB")
            train_sampler = PTBSequenceSampler(
                corpus.train,
                batch_size=self.batch_size,
                max_batches=self.batches_in_epoch,
            )

            if self.embedding_kind == "rsm_bitwise":
                embedding = lang_util.BitwiseWordEmbedding().embedding_dict
            elif self.embedding_kind in ["bpe", "glove"]:
                from torchnlp.word_to_vector import BPEmb, GloVe

                cache_dir = self.data_dir + "/torchnlp/.word_vectors_cache"
                if self.embedding_kind == "bpe":
                    vectors = BPEmb(dim=self.embed_dim, cache=cache_dir)
                else:
                    vectors = GloVe(name="6B",
                                    dim=self.embed_dim,
                                    cache=cache_dir)
                embedding = {}
                for word_id, word in enumerate(corpus.dictionary.idx2word):
                    embedding[word_id] = vectors[word]
            elif "ptb_fasttext" in self.embedding_kind:
                import fasttext

                # Generated via notebooks/ptb_embeddings.ipynb
                embedding = {}
                ft_model = fasttext.load_model(self.data_dir +
                                               "/embeddings/%s.bin" %
                                               self.embedding_kind)
                for word_id, word in enumerate(corpus.dictionary.idx2word):
                    embedding[word_id] = torch.tensor(ft_model[word])

            if self.embedding_kind:
                print("Loaded embedding dict (%s) with %d entries" %
                      (self.embedding_kind, len(embedding)))

            collate_fn = partial(ptb_pred_sequence_collate,
                                 vector_dict=embedding)
            self.train_loader = DataLoader(corpus.train,
                                           batch_sampler=train_sampler,
                                           collate_fn=collate_fn)
            val_sampler = PTBSequenceSampler(
                corpus.test,
                batch_size=self.eval_batch_size,
                max_batches=self.eval_batches_in_epoch,
                uniform_offsets=True,
            )
            self.val_loader = DataLoader(corpus.test,
                                         batch_sampler=val_sampler,
                                         collate_fn=collate_fn)
            self.corpus = corpus
            print("Built dataloaders...")
Example #30
def glove_embedding(size):
	glove = GloVe('6B', size, cache=CACHE)
	stoi = {tok: i for i, tok in enumerate(glove.itos)}
	rows, cols = glove.vectors.shape
	embedding = nn.Embedding(rows, cols, _weight=glove.vectors)
	return embedding, stoi, glove.itos
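A minimal check of the helper above, assuming torch, nn, and GloVe are imported and CACHE points at a writable vector cache directory:

embedding, stoi, itos = glove_embedding(100)
idx = torch.tensor([stoi['hello']])
print(embedding(idx).shape)  # torch.Size([1, 100])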