Example #1
    def __init__(self, hidden_size, output_size, device='cuda'):
        super(TestModelEmbedding, self).__init__()
        self.embedding = WordEmbedding()
        self.embedding.set_elmo()
        self.embedding.set_glove(GLOVE_TRAIN_FILE)
        self.embedding.set_bert()

        self._l1 = nn.Linear(1324 * 4, hidden_size)
        self._l2 = nn.Linear(hidden_size, output_size)

        self.to(device)
Example #2
File: tests.py Project: maeda/polysemybot
    def setUpClass(cls):
        cls.pre_processing = PreProcessing(sentences)
        cls.dataset = ds.process(cls.pre_processing)
        cls.word_embedding = WordEmbedding(source=cls.dataset.pairs)

        encoder = EncoderRNN(cls.word_embedding, 300, 1).to(settings.device)
        decoder = DecoderRNN(300, cls.word_embedding, 0.0,
                             1).to(settings.device)
        cls.model = Model(encoder, decoder)
        cls.model.train(cls.dataset)
Example #3
class BaselineElmo(nn.Module):
    '''
    Baseline model as in the WiC paper by Pilehvar & Camacho-Collados.
    Returns the hidden state of the first ELMo LSTM.
    '''
    def __init__(self, mix_parameters=[1 / 3, 1 / 3, 1 / 3]):
        super(BaselineElmo, self).__init__()
        # make embedding
        self.embedding = WordEmbedding()
        self.embedding.set_elmo(mix_parameters=mix_parameters)

    def forward(self, batch):
        return self.embed_words(batch)

    def embed_words(self, batch):
        return self.embedding(batch)

    def embed_sentences(self, batch):
        raise Exception("ELMo1 does not produce a sentence embedding")
Example #4
File: model.py Project: maeda/polysemybot
    def __init__(self, hidden_size, word_embedding: WordEmbedding, dropout_p, n_layers=1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        self.word_embedding = word_embedding
        self.embedding = word_embedding.embedding_layer
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, word_embedding.n_words())
        self.softmax = nn.LogSoftmax(dim=1)
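Only the constructor is shown above; a single decoding step would presumably follow the standard PyTorch seq2seq pattern. The sketch below is an assumption inferred from the layers defined in __init__ (embedding width equal to hidden_size, one token per GRU step), not code from the project.

    # Hedged sketch of one decoding step, inferred from the layers above (not project code).
    def forward(self, input_token, hidden):
        embedded = self.embedding(input_token).view(1, 1, -1)  # (1, 1, hidden_size)
        embedded = self.dropout(embedded)
        output, hidden = self.gru(embedded, hidden)            # single GRU step
        output = self.softmax(self.out(output[0]))             # log-probabilities over the vocabulary
        return output, hidden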
Example #5
def retrain():
    ds = process(PreProcessing('./data/starwars.txt'))

    word_embedding = WordEmbedding(source='./embedding/FT/fasttext_cbow_300d.bin')

    word_embedding.train(ds.pairs)
    word_embedding.save('./embedding/starwars', 'starwars.bin')
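The saved vectors can presumably be loaded back by passing the file path as source, mirroring Example #6; the path below simply joins the folder and filename used above and is an assumption.

# Hypothetical reload of the embedding saved by retrain().
word_embedding = WordEmbedding(source='./embedding/starwars/starwars.bin')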
Example #6
File: tests.py Project: maeda/polysemybot
    def test_load_from_file(self):
        embeddings_path = os.path.join(settings.BASE_DIR, 'embeddings',
                                       uuid.uuid4().hex)
        filename = str(self.__class__.dataset.idx) + ".bin"

        word_embedding = WordEmbedding(source=self.__class__.dataset.pairs)
        word_embedding.train()
        word_embedding.save(embeddings_path, filename)

        model = WordEmbedding(source=os.path.join(embeddings_path, filename))
        print(model._embedding.wv.similarity('batendo', 'porta'))
Example #7
File: app.py Project: maeda/polysemybot
def run(hidden,
        layer,
        dropout,
        learning_rate,
        iteration,
        save,
        train=None,
        test=None):
    if train:
        dataset_id = train.split('/')[-1].split('.')[0]

        pre_processing = PreProcessing(open(train, 'r'), dataset_id)
        dataset = process(pre_processing)

        encoder_embeddings = WordEmbedding(source=dataset.pairs)
        decoder_embeddings = WordEmbedding(source=dataset.pairs)

        encoder = EncoderRNN(encoder_embeddings, hidden,
                             layer).to(settings.device)
        decoder = DecoderRNN(hidden, decoder_embeddings, dropout,
                             layer).to(settings.device)

        model = Model(
            encoder=encoder,
            decoder=decoder,
            learning_rate=learning_rate,
        )
        model.summary()
        model.train(dataset, n_iter=iteration, save_every=save)

    if test:

        dataset = load(test)

        model = Model.load(test)

        while True:
            decoded_words = model.evaluate(str(input("> ")), dataset)
            print(' '.join(decoded_words))
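A hedged example invocation of run(); the hyperparameter values are placeholders and the training file path is only assumed to exist.

# Illustrative call only; all values are placeholders.
run(hidden=300, layer=1, dropout=0.1, learning_rate=0.01,
    iteration=10000, save=1000, train='./data/starwars.txt')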
Example #8
def train():
    ds = process(PreProcessing(open('./data/starwars.txt', 'r')))

    word_embedding = WordEmbedding(source=ds.pairs)

    word_embedding.train(ds.pairs)

    word_embedding.save(target_folder='./embedding/starwars', filename='starwars.bin')
Example #9
    def __init__(self,
                 device,
                 pos_classes=45,
                 metaphor_classes=2,
                 snli_classes=3,
                 lstm_hidden_size=100,
                 dropout=0,
                 embedding_model="ELMo+GloVe"):
        super(JMTModel, self).__init__()

        if embedding_model in ["ELMo2+GloVe", "ELMo3+GloVe"]:
            self.embedding = WordEmbedding(device)
            self.embedding.set_elmo("23" if embedding_model ==
                                    "ELMo2+GloVe" else "123")
            # self.embedding.set_bert()
            self.embedding.set_glove()
            embedding_size = 1324
        else:
            self.embedding = BertEmbedding(embedding_model, device)
            embedding_size = self.embedding.embedding_size

        self.pos_lstm = nn.LSTM(embedding_size,
                                lstm_hidden_size,
                                1,
                                bidirectional=True,
                                dropout=dropout,
                                batch_first=True)
        self.pos_classifier = nn.Linear(2 * lstm_hidden_size, pos_classes)

        self.metaphor_lstm = nn.LSTM(embedding_size + 2 * lstm_hidden_size +
                                     pos_classes,
                                     lstm_hidden_size,
                                     1,
                                     bidirectional=True,
                                     dropout=dropout,
                                     batch_first=True)
        self.metaphor_classifier = nn.Linear(2 * lstm_hidden_size,
                                             metaphor_classes)

        self.snli_lstm = nn.LSTM(embedding_size + 4 * lstm_hidden_size +
                                 metaphor_classes,
                                 lstm_hidden_size,
                                 1,
                                 bidirectional=True,
                                 dropout=dropout,
                                 batch_first=True)
        self.snli_classifier = nn.Linear(2 * lstm_hidden_size * 4,
                                         snli_classes)
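The constructor alone implies a cascade: each task-specific LSTM consumes the word embeddings plus the previous task's hidden states and logits, which is exactly what the input sizes above encode. The sketch below reconstructs the POS and metaphor stages from those sizes; it is an assumption, not the project's forward pass, and the SNLI stage (whose classifier input of 2 * lstm_hidden_size * 4 suggests a pooled premise/hypothesis combination) is omitted.

# Hedged reconstruction of the first two stages implied by the layer sizes above.
def forward_pos_metaphor(model, batch):
    E = model.embedding(batch)                            # (batch, seq, embedding_size)
    pos_h, _ = model.pos_lstm(E)                          # (batch, seq, 2 * lstm_hidden_size)
    pos_logits = model.pos_classifier(pos_h)              # (batch, seq, pos_classes)
    meta_in = torch.cat([E, pos_h, pos_logits], dim=-1)   # matches metaphor_lstm's input size
    meta_h, _ = model.metaphor_lstm(meta_in)
    meta_logits = model.metaphor_classifier(meta_h)
    return pos_logits, meta_logits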
Example #10
class TestModelEmbedding(nn.Module):
    '''
    Model to test ELMo and GloVe embeddings
    Task: NLI
    '''
    def __init__(self, hidden_size, output_size, device='cuda'):
        super(TestModelEmbedding, self).__init__()
        self.embedding = WordEmbedding()
        self.embedding.set_elmo()
        self.embedding.set_glove(GLOVE_TRAIN_FILE)
        self.embedding.set_bert()

        self._l1 = nn.Linear(1324 * 4, hidden_size)
        self._l2 = nn.Linear(hidden_size, output_size)

        self.to(device)

    def forward(self, X1, X2):
        '''
        Args:
            X1: list of list of words from premise sentences, e.g. [['First', 'sentence', '.'], ['Another', '.']]
            X2: same as X1 for hypothesis sentences
        '''

        # embedding: mean of word embeddings
        E1 = self.embed_sentences(X1)
        E2 = self.embed_sentences(X2)

        # Combine sentences for classification
        abs_diff = torch.abs(E1 - E2)
        elem = E1 * E2
        concat = torch.cat([E1, E2, abs_diff, elem], dim=1)

        # Classify
        return self._classify(concat)

    def _classify(self, X):
        X = self._l1(X)
        X = relu(X)
        return self._l2(X)

    def embed_sentences(self, batch):
        '''
        Embeds each sentence in the batch by averaging ELMo embeddings.
        NOTE: not used in training, only for sentence embedding evaluation
        Args:
            batch: list of list of words from premise sentences, e.g. [['First', 'sentence', '.'], ['Another', '.']]
        Returns:
            embedded: sentence embeddings. Shape (batch, features)
        '''
        word_embed = self.embed_words(batch)

        return word_embed.mean(dim=1)

    def embed_words(self, batch):
        '''
        Embeds each word in a batch of sentences using ELMo embeddings (contextualized)
        Args:
            batch: list of list of words from premise sentences, e.g. [['First', 'sentence', '.'], ['Another', '.']]
        Returns:
            embedded: ELMo embedding of batch, padded to make sentences of equal length. Shape (batch, sequence, features)
        '''
        return self.embedding(batch)
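A minimal usage sketch, assuming GLOVE_TRAIN_FILE is defined and a CUDA device is available (the constructor's default); the inputs follow the batch format documented in forward.

# Hypothetical usage; output_size=3 assumes the usual three NLI labels.
model = TestModelEmbedding(hidden_size=512, output_size=3)
premises = [['First', 'sentence', '.'], ['Another', '.']]
hypotheses = [['A', 'hypothesis', '.'], ['Second', '.']]
logits = model(premises, hypotheses)   # shape (batch, output_size)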
Example #11
    def __init__(self, mix_parameters=[1 / 3, 1 / 3, 1 / 3]):
        super(BaselineElmo, self).__init__()
        # make embedding
        self.embedding = WordEmbedding()
        self.embedding.set_elmo(mix_parameters=mix_parameters)
Example #12
            for line in open(path)]


def load_labels(path):
    # Loads a label for each line (-1 indicates the pairs do not form a relation).
    return [int(label) for label in open(path)]


if __name__ == '__main__':
    if len(sys.argv) != 6:
        print "Usage ./detect_relations.py vocab_file embedding_file train_data train_labels test_data"
        sys.exit(0)

    # Load vocab and embedding (these are not used yet!)
    vocab = Vocab(sys.argv[1])
    embedding = WordEmbedding(vocab, sys.argv[2])
    model = Model(vocab, embedding)

    # Loads training data and labels.
    training_examples = load_example_sets(sys.argv[3])
    training_labels = load_labels(sys.argv[4])
    assert len(training_examples) == len(
        training_labels), "Expected one label for each line in training data."

    # Training the model
    train_diffs_means = []  # model params
    train_diffs_stds = []  # other model params
    for training_example, training_label in zip(training_examples,
                                                training_labels):
        diffs = []
        for w1, w2 in training_example:
Example #13
File: tests.py Project: maeda/polysemybot
    def test_should_generate_training_pairs(self):
        pre_processing = PreProcessing(sentences)
        dataset = ds.process(pre_processing)
        word_embedding = WordEmbedding(freeze=False, source=dataset.pairs)
        word_embedding.train()
        self.assertEqual(len(dataset.training_pairs(2, word_embedding)), 2)
Example #14
File: tests.py Project: maeda/polysemybot
    def test_train(self):
        word_embedding = WordEmbedding(source=self.__class__.dataset.pairs)
        self.assertEqual(word_embedding.n_words(), 25)
Example #15
    return [int(label) for label in open(path)]


def cosine(x, y):
    # Cosine of angle between vectors x and y
    return x.dot(y) / np.linalg.norm(x) / np.linalg.norm(y)


if __name__ == '__main__':
    if len(sys.argv) != 6:
        print "Usage ./detect_relations.py vocab_file embedding_file train_data train_labels test_data"
        sys.exit(0)

    # Load vocab and embedding (these are not used yet!)
    vocab = Vocab(sys.argv[1])
    embedding = WordEmbedding(vocab, sys.argv[2])

    # Loads training data and labels.
    training_examples = load_example_sets(sys.argv[3])
    training_labels = load_labels(sys.argv[4])

    assert len(training_examples) == len(
        training_labels), "Expected one label for each line in training data."

    # Load test examples and label each set of pairs as 'not a relation' (-1).
    # This is not a good idea... You can definitely do better!
    test_examples = load_example_sets(sys.argv[5])

    # Store displacement between embeddings for each label in train
    training_displacements = {lbl: [] for lbl in training_labels}
    for i, lbl in enumerate(training_labels):