Example No. 1
    def test_load_from_file(self):
        # Unique temporary directory for the trained embedding file.
        embeddings_path = os.path.join(settings.BASE_DIR, 'embeddings',
                                       uuid.uuid4().hex)
        filename = str(self.__class__.dataset.idx) + ".bin"

        # Train an embedding on the dataset pairs and persist it to disk.
        word_embedding = WordEmbedding(source=self.__class__.dataset.pairs)
        word_embedding.train()
        word_embedding.save(embeddings_path, filename)

        # Reload the embedding from the saved file and query a word similarity.
        model = WordEmbedding(source=os.path.join(embeddings_path, filename))
        print(model._embedding.wv.similarity('batendo', 'porta'))
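Because the test writes into a fresh directory under settings.BASE_DIR, a real suite would normally clean up afterwards; a minimal sketch using the standard library (the cleanup step is not part of the original test):

import shutil

# Remove the temporary embedding directory created by the test.
shutil.rmtree(embeddings_path, ignore_errors=True)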
Example No. 2
def retrain():
    # Pre-process the raw corpus into a dataset of sentence pairs.
    ds = process(PreProcessing('./data/starwars.txt'))

    # Start from a pretrained fastText CBOW model and fine-tune it on the corpus.
    word_embedding = WordEmbedding(source='./embedding/FT/fasttext_cbow_300d.bin')

    word_embedding.train(ds.pairs)
    word_embedding.save('./embedding/starwars', 'starwars.bin')
Example No. 3
def train():
    # Pre-process the raw corpus into a dataset of sentence pairs.
    ds = process(PreProcessing(open('./data/starwars.txt', 'r')))

    # Train an embedding from scratch on the corpus pairs and save it.
    word_embedding = WordEmbedding(source=ds.pairs)
    word_embedding.train(ds.pairs)

    word_embedding.save(target_folder='./embedding/starwars', filename='starwars.bin')
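Once saved, the binary can be reloaded by passing its path as source, as Example No. 1 does; a minimal sketch (the word pair is illustrative, and the _embedding attribute wraps a gensim model, as Example No. 1 shows):

import os

# Reload the persisted embedding from disk.
model = WordEmbedding(source=os.path.join('./embedding/starwars', 'starwars.bin'))

# Query cosine similarity between two corpus words via the wrapped gensim model.
print(model._embedding.wv.similarity('luke', 'jedi'))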
Example No. 4
    @classmethod
    def setUpClass(cls):
        # Build the dataset and train an embedding once for the whole test class.
        cls.pre_processing = PreProcessing(sentences)
        cls.dataset = ds.process(cls.pre_processing)
        cls.word_embedding = WordEmbedding(source=cls.dataset.pairs)

        # Wire the embedding into a seq2seq encoder/decoder and train the model.
        encoder = EncoderRNN(cls.word_embedding, 300, 1).to(settings.device)
        decoder = DecoderRNN(300, cls.word_embedding, 0.0,
                             1).to(settings.device)
        cls.model = Model(encoder, decoder)
        cls.model.train(cls.dataset)
Example No. 5
    def __init__(self, hidden_size, output_size, device='cuda'):
        super(TestModelEmbedding, self).__init__()
        # Combine ELMo, GloVe and BERT representations in a single embedding.
        self.embedding = WordEmbedding()
        self.embedding.set_elmo()
        self.embedding.set_glove(GLOVE_TRAIN_FILE)
        self.embedding.set_bert()

        # Two-layer feed-forward head over the combined embedding features.
        self._l1 = nn.Linear(1324 * 4, hidden_size)
        self._l2 = nn.Linear(hidden_size, output_size)

        self.to(device)
Example No. 6
def run(hidden,
        layer,
        dropout,
        learning_rate,
        iteration,
        save,
        train=None,
        test=None):
    if train:
        # Derive a dataset id from the training file name.
        dataset_id = train.split('/')[-1].split('.')[0]

        pre_processing = PreProcessing(open(train, 'r'), dataset_id)
        dataset = process(pre_processing)

        # Separate embeddings for encoder and decoder, both built from the pairs.
        encoder_embeddings = WordEmbedding(source=dataset.pairs)
        decoder_embeddings = WordEmbedding(source=dataset.pairs)

        encoder = EncoderRNN(encoder_embeddings, hidden,
                             layer).to(settings.device)
        decoder = DecoderRNN(hidden, decoder_embeddings, dropout,
                             layer).to(settings.device)

        model = Model(
            encoder=encoder,
            decoder=decoder,
            learning_rate=learning_rate,
        )
        model.summary()
        model.train(dataset, n_iter=iteration, save_every=save)

    if test:
        dataset = load(test)
        model = Model.load(test)

        # Interactive loop: read a sentence and print the decoded response.
        while True:
            decoded_words = model.evaluate(input("> "), dataset)
            print(' '.join(decoded_words))
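A call exercising the training branch might look like the following; the hyperparameter values and the corpus path are illustrative, not from the source:

run(hidden=300,
    layer=1,
    dropout=0.1,
    learning_rate=0.001,
    iteration=10000,
    save=500,
    train='./data/starwars.txt')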
Example No. 7
    def __init__(self,
                 device,
                 pos_classes=45,
                 metaphor_classes=2,
                 snli_classes=3,
                 lstm_hidden_size=100,
                 dropout=0,
                 embedding_model="ELMo+GloVe"):
        super(JMTModel, self).__init__()

        # Choose the embedding backbone: selected ELMo layers plus GloVe,
        # or a BERT variant named by embedding_model.
        if embedding_model in ["ELMo2+GloVe", "ELMo3+GloVe"]:
            self.embedding = WordEmbedding(device)
            self.embedding.set_elmo("23" if embedding_model ==
                                    "ELMo2+GloVe" else "123")
            # self.embedding.set_bert()
            self.embedding.set_glove()
            embedding_size = 1324
        else:
            self.embedding = BertEmbedding(embedding_model, device)
            embedding_size = self.embedding.embedding_size

        # POS task: BiLSTM over the embeddings, followed by a linear tagger.
        self.pos_lstm = nn.LSTM(embedding_size,
                                lstm_hidden_size,
                                1,
                                bidirectional=True,
                                dropout=dropout,
                                batch_first=True)
        self.pos_classifier = nn.Linear(2 * lstm_hidden_size, pos_classes)

        # Metaphor task: BiLSTM over the embeddings concatenated with the
        # POS LSTM states (2 * hidden) and the POS logits.
        self.metaphor_lstm = nn.LSTM(embedding_size + 2 * lstm_hidden_size +
                                     pos_classes,
                                     lstm_hidden_size,
                                     1,
                                     bidirectional=True,
                                     dropout=dropout,
                                     batch_first=True)
        self.metaphor_classifier = nn.Linear(2 * lstm_hidden_size,
                                             metaphor_classes)

        # SNLI task: BiLSTM over the embeddings plus both lower-task states
        # (4 * hidden) and the metaphor logits; the pair classifier input is
        # 4x the BiLSTM output size.
        self.snli_lstm = nn.LSTM(embedding_size + 4 * lstm_hidden_size +
                                 metaphor_classes,
                                 lstm_hidden_size,
                                 1,
                                 bidirectional=True,
                                 dropout=dropout,
                                 batch_first=True)
        self.snli_classifier = nn.Linear(2 * lstm_hidden_size * 4,
                                         snli_classes)
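A hypothetical instantiation, selecting the two-layer ELMo mix plus GloVe backbone (the device string is illustrative):

model = JMTModel(device='cuda', embedding_model='ELMo2+GloVe')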
Example No. 8
    def __init__(self, mix_parameters=[1 / 3, 1 / 3, 1 / 3]):
        super(BaselineElmo, self).__init__()
        # Build an ELMo-only embedding, mixing the three ELMo layers equally.
        self.embedding = WordEmbedding()
        self.embedding.set_elmo(mix_parameters=mix_parameters)
Example No. 9
            for line in open(path)]


def load_labels(path):
    # Loads a label for each line (-1 indicates the pairs do not form a relation).
    return [int(label) for label in open(path)]


if __name__ == '__main__':
    if len(sys.argv) != 6:
        print("Usage: ./detect_relations.py vocab_file embedding_file train_data train_labels test_data")
        sys.exit(0)

    # Load vocab and embedding (these are not used yet!)
    vocab = Vocab(sys.argv[1])
    embedding = WordEmbedding(vocab, sys.argv[2])
    model = Model(vocab, embedding)

    # Loads training data and labels.
    training_examples = load_example_sets(sys.argv[3])
    training_labels = load_labels(sys.argv[4])
    assert len(training_examples) == len(
        training_labels), "Expected one label for each line in training data."

    # Training the model
    train_diffs_means = []  # model params
    train_diffs_stds = []  # other model params
    for training_example, training_label in zip(training_examples,
                                                training_labels):
        diffs = []
        for w1, w2 in training_example:
Example No. 10
    def test_should_generate_training_pairs(self):
        pre_processing = PreProcessing(sentences)
        dataset = ds.process(pre_processing)
        # freeze=False leaves the embedding weights trainable.
        word_embedding = WordEmbedding(freeze=False, source=dataset.pairs)
        word_embedding.train()
        self.assertEqual(len(dataset.training_pairs(2, word_embedding)), 2)
Example No. 11
    def test_train(self):
        word_embedding = WordEmbedding(source=self.__class__.dataset.pairs)
        self.assertEqual(word_embedding.n_words(), 25)