@classmethod
def setUpClass(cls):
    cls.pre_processing = PreProcessing(sentences)
    cls.dataset = ds.process(cls.pre_processing)
    cls.word_embedding = WordEmbedding(source=cls.dataset.pairs)
    encoder = EncoderRNN(cls.word_embedding, 300, 1).to(settings.device)
    decoder = DecoderRNN(300, cls.word_embedding, 0.0, 1).to(settings.device)
    cls.model = Model(encoder, decoder)
    cls.model.train(cls.dataset)
class BaselineElmo(nn.Module):
    '''
    Baseline model as in the WiC paper by Pilehvar & Camacho-Collados.
    Returns the hidden state of the first ELMo LSTM.
    '''

    def __init__(self, mix_parameters=[1 / 3, 1 / 3, 1 / 3]):
        super(BaselineElmo, self).__init__()
        # make embedding
        self.embedding = WordEmbedding()
        self.embedding.set_elmo(mix_parameters=mix_parameters)

    def forward(self, batch):
        return self.embed_words(batch)

    def embed_words(self, batch):
        return self.embedding(batch)

    def embed_sentences(self, batch):
        raise Exception("ELMo1 does not produce a sentence embedding")
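# Hypothetical usage sketch (not from the original source). It assumes the
# WordEmbedding wrapper accepts a list of tokenized sentences and returns a
# padded tensor of shape (batch, max_seq_len, features); the sentences below
# are illustrative only.
model = BaselineElmo(mix_parameters=[1 / 3, 1 / 3, 1 / 3])
batch = [['The', 'bank', 'was', 'closed', '.'],
         ['She', 'sat', 'on', 'the', 'river', 'bank', '.']]
word_vectors = model(batch)
print(word_vectors.shape)   # e.g. torch.Size([2, 7, features])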
def __init__(self, hidden_size, word_embedding: WordEmbedding, dropout_p, n_layers=1):
    super(DecoderRNN, self).__init__()
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.dropout_p = dropout_p
    self.word_embedding = word_embedding
    self.embedding = word_embedding.embedding_layer
    self.dropout = nn.Dropout(self.dropout_p)
    self.gru = nn.GRU(hidden_size, hidden_size)
    self.out = nn.Linear(hidden_size, word_embedding.n_words())
    self.softmax = nn.LogSoftmax(dim=1)
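# Hypothetical single decoding step for the DecoderRNN above -- a sketch of how
# the declared modules are typically wired (as in the classic PyTorch seq2seq
# tutorial), NOT necessarily this repository's actual forward(). It assumes the
# shared embedding_layer maps token ids to hidden_size-dimensional vectors.
def forward_step(self, input_token, hidden):
    # input_token: (1, batch) LongTensor of token ids
    # hidden:      (n_layers, batch, hidden_size)
    output = self.embedding(input_token)          # (1, batch, hidden_size)
    output = self.dropout(output)
    output = torch.relu(output)
    output, hidden = self.gru(output, hidden)     # (1, batch, hidden_size)
    output = self.softmax(self.out(output[0]))    # (batch, n_words) log-probabilities
    return output, hidden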
def retrain():
    ds = process(PreProcessing('./data/starwars.txt'))
    word_embedding = WordEmbedding(source='./embedding/FT/fasttext_cbow_300d.bin')
    word_embedding.train(ds.pairs)
    word_embedding.save('./embedding/starwars', 'starwars.bin')
def test_load_from_file(self):
    embeddings_path = os.path.join(settings.BASE_DIR, 'embeddings', uuid.uuid4().hex)
    filename = str(self.__class__.dataset.idx) + ".bin"
    word_embedding = WordEmbedding(source=self.__class__.dataset.pairs)
    word_embedding.train()
    word_embedding.save(embeddings_path, filename)
    model = WordEmbedding(source=os.path.join(embeddings_path, filename))
    print(model._embedding.wv.similarity('batendo', 'porta'))
def run(hidden, layer, dropout, learning_rate, iteration, save, train=None, test=None):
    if train:
        dataset_id = train.split('/')[-1].split('.')[0]
        pre_processing = PreProcessing(open(train, 'r'), dataset_id)
        dataset = process(pre_processing)
        encoder_embeddings = WordEmbedding(source=dataset.pairs)
        decoder_embeddings = WordEmbedding(source=dataset.pairs)
        encoder = EncoderRNN(encoder_embeddings, hidden, layer).to(settings.device)
        decoder = DecoderRNN(hidden, decoder_embeddings, dropout, layer).to(settings.device)
        model = Model(
            encoder=encoder,
            decoder=decoder,
            learning_rate=learning_rate,
        )
        model.summary()
        model.train(dataset, n_iter=iteration, save_every=save)

    if test:
        dataset = load(test)
        model = Model.load(test)
        while True:
            decoded_words = model.evaluate(str(input("> ")), dataset)
            print(' '.join(decoded_words))
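# One hypothetical invocation of run() exercising the training branch; the
# hyperparameters are illustrative and the corpus path is borrowed from the
# Star Wars example elsewhere in this collection.
run(hidden=300, layer=1, dropout=0.1, learning_rate=0.01,
    iteration=10000, save=1000, train='./data/starwars.txt')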
def train():
    ds = process(PreProcessing(open('./data/starwars.txt', 'r')))
    word_embedding = WordEmbedding(source=ds.pairs)
    word_embedding.train(ds.pairs)
    word_embedding.save(target_folder='./embedding/starwars', filename='starwars.bin')
def __init__(self, device, pos_classes=45, metaphor_classes=2, snli_classes=3,
             lstm_hidden_size=100, dropout=0, embedding_model="ELMo+GloVe"):
    super(JMTModel, self).__init__()

    # Word embeddings: either ELMo (layers 2-3 or 1-3) concatenated with GloVe,
    # or a BERT-based embedding selected by name.
    if embedding_model in ["ELMo2+GloVe", "ELMo3+GloVe"]:
        self.embedding = WordEmbedding(device)
        self.embedding.set_elmo("23" if embedding_model == "ELMo2+GloVe" else "123")
        # self.embedding.set_bert()
        self.embedding.set_glove()
        embedding_size = 1324
    else:
        self.embedding = BertEmbedding(embedding_model, device)
        embedding_size = self.embedding.embedding_size

    # Task 1: POS tagging on top of the word embeddings.
    self.pos_lstm = nn.LSTM(embedding_size, lstm_hidden_size, 1,
                            bidirectional=True, dropout=dropout, batch_first=True)
    self.pos_classifier = nn.Linear(2 * lstm_hidden_size, pos_classes)

    # Task 2: metaphor detection, fed the embeddings plus the POS layer's
    # bidirectional hidden states and class scores.
    self.metaphor_lstm = nn.LSTM(embedding_size + 2 * lstm_hidden_size + pos_classes,
                                 lstm_hidden_size, 1,
                                 bidirectional=True, dropout=dropout, batch_first=True)
    self.metaphor_classifier = nn.Linear(2 * lstm_hidden_size, metaphor_classes)

    # Task 3: SNLI, fed the embeddings plus both lower layers' hidden states
    # and the metaphor class scores.
    self.snli_lstm = nn.LSTM(embedding_size + 4 * lstm_hidden_size + metaphor_classes,
                             lstm_hidden_size, 1,
                             bidirectional=True, dropout=dropout, batch_first=True)
    self.snli_classifier = nn.Linear(2 * lstm_hidden_size * 4, snli_classes)
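# Sanity check on the cascaded input sizes above, using the defaults
# (embedding_size=1324 for ELMo+GloVe, lstm_hidden_size=100, pos_classes=45,
# metaphor_classes=2). Each task's LSTM appears to consume the word embedding
# concatenated with the bidirectional hidden states and class scores of the
# tasks below it; the SNLI classifier's 2 * lstm_hidden_size * 4 input suggests
# a [u, v, |u - v|, u * v] combination of the two sentence encodings, though
# that is an inference from the dimensions, not stated in this snippet.
embedding_size, lstm_hidden_size = 1324, 100
pos_classes, metaphor_classes = 45, 2
pos_in = embedding_size                                              # 1324
metaphor_in = embedding_size + 2 * lstm_hidden_size + pos_classes    # 1569
snli_in = embedding_size + 4 * lstm_hidden_size + metaphor_classes   # 1726
print(pos_in, metaphor_in, snli_in)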
class TestModelEmbedding(nn.Module):
    '''
    Model to test ELMo and GloVe embeddings.
    Task: NLI
    '''

    def __init__(self, hidden_size, output_size, device='cuda'):
        super(TestModelEmbedding, self).__init__()
        self.embedding = WordEmbedding()
        self.embedding.set_elmo()
        self.embedding.set_glove(GLOVE_TRAIN_FILE)
        self.embedding.set_bert()
        self._l1 = nn.Linear(1324 * 4, hidden_size)
        self._l2 = nn.Linear(hidden_size, output_size)
        self.to(device)

    def forward(self, X1, X2):
        '''
        Args:
            X1: list of lists of words from the premise sentences,
                e.g. [['First', 'sentence', '.'], ['Another', '.']]
            X2: same as X1 for the hypothesis sentences
        '''
        # Embedding: mean of word embeddings.
        E1 = self.embed_sentences(X1)
        E2 = self.embed_sentences(X2)

        # Combine sentences for classification.
        abs_diff = torch.abs(E1 - E2)
        elem = E1 * E2
        concat = torch.cat([E1, E2, abs_diff, elem], dim=1)

        # Classify.
        return self._classify(concat)

    def _classify(self, X):
        X = self._l1(X)
        X = relu(X)
        return self._l2(X)

    def embed_sentences(self, batch):
        '''
        Embeds each sentence in the batch by averaging ELMo embeddings.
        NOTE: not used in training, only for sentence embedding evaluation.

        Args:
            batch: list of lists of words from the premise sentences,
                e.g. [['First', 'sentence', '.'], ['Another', '.']]
        Returns:
            embedded: sentence embeddings. Shape (batch, features)
        '''
        word_embed = self.embed_words(batch)
        return word_embed.mean(dim=1)

    def embed_words(self, batch):
        '''
        Embeds each word in a batch of sentences using ELMo embeddings (contextualized).

        Args:
            batch: list of lists of words from the premise sentences,
                e.g. [['First', 'sentence', '.'], ['Another', '.']]
        Returns:
            embedded: ELMo embedding of the batch, padded to make sentences of
                equal length. Shape (batch, sequence, features)
        '''
        return self.embedding(batch)
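# Minimal usage sketch for TestModelEmbedding (not from the original source).
# It assumes GLOVE_TRAIN_FILE points at a valid GloVe file and a CUDA device is
# available. The 1324 * 4 input of _l1 follows from combining the two
# 1324-dimensional sentence vectors as [E1, E2, |E1 - E2|, E1 * E2].
model = TestModelEmbedding(hidden_size=512, output_size=3, device='cuda')
premises = [['A', 'man', 'is', 'eating', '.'], ['Two', 'dogs', 'run', '.']]
hypotheses = [['Someone', 'is', 'eating', '.'], ['The', 'dogs', 'sleep', '.']]
logits = model(premises, hypotheses)   # shape (2, 3): one row of NLI logits per pair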
            for line in open(path)]


def load_labels(path):
    # Loads a label for each line (-1 indicates the pairs do not form a relation).
    return [int(label) for label in open(path)]


if __name__ == '__main__':
    if len(sys.argv) != 6:
        print("Usage ./detect_relations.py vocab_file embedding_file train_data train_labels test_data")
        sys.exit(0)

    # Load vocab and embedding (these are not used yet!)
    vocab = Vocab(sys.argv[1])
    embedding = WordEmbedding(vocab, sys.argv[2])
    model = Model(vocab, embedding)

    # Load training data and labels.
    training_examples = load_example_sets(sys.argv[3])
    training_labels = load_labels(sys.argv[4])
    assert len(training_examples) == len(training_labels), \
        "Expected one label for each line in training data."

    # Train the model.
    train_diffs_means = []  # model params
    train_diffs_stds = []   # other model params
    for training_example, training_label in zip(training_examples, training_labels):
        diffs = []
        for w1, w2 in training_example:
def test_should_generate_training_pairs(self):
    pre_processing = PreProcessing(sentences)
    dataset = ds.process(pre_processing)
    word_embedding = WordEmbedding(freeze=False, source=dataset.pairs)
    word_embedding.train()
    self.assertEqual(len(dataset.training_pairs(2, word_embedding)), 2)
def test_train(self):
    word_embedding = WordEmbedding(source=self.__class__.dataset.pairs)
    self.assertEqual(word_embedding.n_words(), 25)
    return [int(label) for label in open(path)]


def cosine(x, y):
    # Cosine of the angle between vectors x and y.
    return x.dot(y) / np.linalg.norm(x) / np.linalg.norm(y)


if __name__ == '__main__':
    if len(sys.argv) != 6:
        print("Usage ./detect_relations.py vocab_file embedding_file train_data train_labels test_data")
        sys.exit(0)

    # Load vocab and embedding (these are not used yet!)
    vocab = Vocab(sys.argv[1])
    embedding = WordEmbedding(vocab, sys.argv[2])

    # Load training data and labels.
    training_examples = load_example_sets(sys.argv[3])
    training_labels = load_labels(sys.argv[4])
    assert len(training_examples) == len(training_labels), \
        "Expected one label for each line in training data."

    # Load test examples and label each set of pairs as 'not a relation' (-1).
    # This is not a good idea... You can definitely do better!
    test_examples = load_example_sets(sys.argv[5])

    # Store the displacement between embeddings for each label in train.
    training_displacements = {lbl: [] for lbl in training_labels}
    for i, lbl in enumerate(training_labels):
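# A sketch of where the truncated loops above appear to be heading: represent a
# relation by the displacement vec(w2) - vec(w1) and score a test pair by cosine
# similarity against each label's mean training displacement. The plain dict of
# numpy vectors is a stand-in for the embedding lookup, not this script's
# WordEmbedding API; relation_score is a hypothetical helper.
import numpy as np

def relation_score(vectors, train_pairs_by_label, w1, w2):
    # vectors: dict mapping word -> 1-D numpy array
    # train_pairs_by_label: dict mapping label -> list of (a, b) training pairs
    test_diff = vectors[w2] - vectors[w1]
    scores = {}
    for lbl, pairs in train_pairs_by_label.items():
        diffs = [vectors[b] - vectors[a] for a, b in pairs]
        scores[lbl] = cosine(test_diff, np.mean(diffs, axis=0))
    # Highest cosine wins; a threshold could map weak matches to -1 ('not a relation').
    return max(scores, key=scores.get)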