Example #1
    def load_pretrained_embeddings(self, vocabulary, path):
        """
        Loads GloVe vectors and initializes the embedding matrix.

        Args:
            vocabulary: `Vocabulary` object.
            path: Embedding path, e.g. "glove/glove.6B.300d.txt".
        """
        # Results file opened for appending; note that it is never written to
        # or closed inside this method.
        File_object = open(r"results.txt", "a")

        embedding_map = load_cached_embeddings(path)

        # Create embedding matrix. By default, embeddings are randomly
        # initialized from Uniform(-0.1, 0.1).
        embeddings = torch.zeros(
            (len(vocabulary), self.args.embedding_dim)).uniform_(-0.1, 0.1)

        # Initialize pre-trained embeddings.
        num_pretrained = 0
        for (i, word) in enumerate(vocabulary.words):
            if word in embedding_map:
                embeddings[i] = torch.tensor(embedding_map[word])
                num_pretrained += 1
        # Place embedding matrix on GPU.
        self.embedding.weight.data = cuda(self.args, embeddings)

        return num_pretrained
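
The helper load_cached_embeddings is called in several of these examples but not shown. A minimal sketch, assuming it simply parses a whitespace-separated GloVe text file into a word-to-vector dictionary (the project's real helper may also cache the parsed result to disk):

def load_cached_embeddings(path):
    # Hypothetical reader: expects one "word v1 v2 ... vN" line per entry.
    embedding_map = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            embedding_map[parts[0]] = [float(x) for x in parts[1:]]
    return embedding_map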
Example #2
    def load_pretrained_embeddings(self, vocabulary, path):
        """
        Loads GloVe vectors and initializes the embedding matrix.

        Args:
            vocabulary: `Vocabulary` object.
            path: Embedding path, e.g. "glove/glove.6B.300d.txt".
        """

        if path == 'glove/biowordvec_train.vec.bin':
            # BioWordVec vectors ship in word2vec binary format, so they are
            # loaded with gensim's KeyedVectors (from gensim.models import
            # KeyedVectors) rather than the plain-text GloVe reader.
            print('Using BioWordVec embeddings')
            embedding_map = KeyedVectors.load_word2vec_format(path,
                                                              binary=True,
                                                              limit=100000)
        else:
            print('Using GloVe embeddings')
            embedding_map = load_cached_embeddings(path)

        # Create embedding matrix. By default, embeddings are randomly
        # initialized from Uniform(-0.1, 0.1).
        embeddings = torch.zeros(
            (len(vocabulary), self.args.embedding_dim)).uniform_(-0.1, 0.1)

        # Initialize pre-trained embeddings.
        num_pretrained = 0
        for (i, word) in enumerate(vocabulary.words):
            if word in embedding_map:
                embeddings[i] = torch.tensor(embedding_map[word])
                num_pretrained += 1

        # Place embedding matrix on GPU.
        self.embedding.weight.data = cuda(self.args, embeddings)

        return num_pretrained
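
Example #2 loads BioWordVec vectors, which are distributed in word2vec binary format, via gensim's KeyedVectors. A small standalone usage sketch; the path and the query word are placeholders:

from gensim.models import KeyedVectors

# Load the first 100k vectors from a word2vec-format binary file.
kv = KeyedVectors.load_word2vec_format('glove/biowordvec_train.vec.bin',
                                       binary=True, limit=100000)

word = 'protein'        # placeholder query term
if word in kv:          # KeyedVectors supports membership tests
    vector = kv[word]   # indexing returns a numpy array
    print(word, vector.shape)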
Example #3
File: train.py Project: pustar/Castor
    def load_input_data(self, dataset_root_folder, word_vectors_cache_file, \
            train_set_folder, dev_set_folder, test_set_folder, load_ext_feats=True):
        for set_folder in [test_set_folder, dev_set_folder, train_set_folder]:
            if set_folder:
                questions, sentences, labels, maxlen_q, maxlen_s, vocab = \
                    utils.read_in_dataset(dataset_root_folder, set_folder)

                self.data_splits[set_folder] = [
                    questions, sentences, labels, maxlen_q, maxlen_s
                ]

                default_ext_feats = [np.zeros(4)] * len(
                    self.data_splits[set_folder][0])
                self.data_splits[set_folder].append(default_ext_feats)

                utils.load_cached_embeddings(
                    word_vectors_cache_file, vocab, self.embeddings,
                    [] if "train" in set_folder else self.unk_term)
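
One detail in Example #3 worth noting: [np.zeros(4)] * n builds a list whose elements are all the same array object. That is harmless here because the default external features are never mutated, but an in-place change through one element would show up in every element. A quick illustration:

import numpy as np

shared = [np.zeros(4)] * 3                       # three references to one array
shared[0][0] = 1.0
print(shared[1])                                 # [1. 0. 0. 0.] -- aliased

independent = [np.zeros(4) for _ in range(3)]    # three distinct arrays
independent[0][0] = 1.0
print(independent[1])                            # [0. 0. 0. 0.]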
Example #4
File: train.py Project: HTAustin/Castor
    def load_input_data(self, dataset_root_folder, word_vectors_cache_file,
                        train_set_folder, dev_set_folder, test_set_folder):
        for set_folder in [test_set_folder, dev_set_folder, train_set_folder]:
            if set_folder:
                self.datasets[set_folder] = utils.read_in_dataset(
                    dataset_root_folder, set_folder)
                # NOTE: self.datasets[set_folder] = questions, sentences, labels, vocab, maxlen_q, maxlen_s, ext_feats
                self.embeddings[set_folder] = utils.load_cached_embeddings(
                    word_vectors_cache_file, self.datasets[set_folder][3],
                    [] if "train" in set_folder else self.unk_term)
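
A hedged sketch of how Example #4's loader might be driven; the trainer class, folder names, and cache-file path below are placeholders rather than values taken from the Castor sources:

# Hypothetical driver code for illustration only.
trainer = QATrainer(args)
trainer.load_input_data(
    dataset_root_folder='data/TrecQA',
    word_vectors_cache_file='data/word2vec.cache',
    train_set_folder='train',
    dev_set_folder='dev',
    test_set_folder='test')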
Example #5
    def load_pretrained_embeddings(self, vocabulary, path, sentences):
        """
        Loads GloVe vectors and initializes the embedding matrix.

        Args:
            vocabulary: `Vocabulary` object.
            path: Embedding path, e.g. "glove/glove.6B.300d.txt".
            sentences: List of tokenized sentences; only needed by the
                disabled ELMo experiment below.
        """

        self.vocabulary = vocabulary

        # Disabled experiment: contextual ELMo embeddings via allennlp's Elmo
        # and batch_to_ids (kept here for reference, not executed).
        '''options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

        # Note the "1", since we want only 1 output representation for each token.
        elmo = Elmo(options_file, weight_file, 1, dropout=0)
        sentences = sentences[0:5]
        character_ids = batch_to_ids(sentences)
        embeddings = elmo(character_ids)'''

        embedding_map = load_cached_embeddings(path)

        # Create embedding matrix. By default, embeddings are randomly
        # initialized from Uniform(-0.1, 0.1).
        embeddings = torch.zeros(
            (len(vocabulary), self.args.embedding_dim)).uniform_(-0.1, 0.1)

        # Initialize pre-trained embeddings.
        num_pretrained = 0
        for (i, word) in enumerate(vocabulary.words):
            if word in embedding_map:
                embeddings[i] = torch.tensor(embedding_map[word])
                num_pretrained += 1

        # Place embedding matrix on GPU.
        self.embedding.weight.data = cuda(self.args, embeddings)

        return num_pretrained
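
Examples #1, #2, and #5 all finish by calling cuda(self.args, embeddings), a helper that is not defined in these snippets. A minimal sketch, assuming it simply moves a tensor to the GPU when the run is configured to use one; the flag name use_cuda is an assumption, not taken from the project:

import torch

def cuda(args, tensor):
    # Hypothetical helper: move the tensor to the GPU only when requested
    # and when a GPU is actually available.
    if getattr(args, 'use_cuda', False) and torch.cuda.is_available():
        return tensor.cuda()
    return tensor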