예제 #1
0
파일: dataset.py 프로젝트: flauted/langID
 def __init__(self, data_dir, transform=None):
     """Read two sentence lists from ``data_dir`` and prepare labels and indices.

     Args:
         data_dir: Passed straight to ``utils.read_langs``.
         transform: Stored on the instance unchanged; not used in this method.

     Attributes set:
         transform: The ``transform`` argument.
         input_data: First list returned by ``read_langs`` followed by the second.
         targets: ``int64`` array, 0 for every item of the first list and 1 for
             every item of the second, in the same order as ``input_data``.
         vocab: ``utils.Vocab("all_data")`` fed with every sentence.
         input_idxs: ``utils.indexes_from_sentence`` result for each sentence.
     """
     self.transform = transform

     # Local names suggest English / French data -- presumably; confirm in utils.
     first_lang, second_lang = utils.read_langs(data_dir)
     self.input_data = first_lang + second_lang

     # One label per sentence: zeros for the first list, ones for the second.
     zero_labels = np.zeros(len(first_lang), dtype=np.int64)
     one_labels = np.ones(len(second_lang), dtype=np.int64)
     self.targets = np.concatenate((zero_labels, one_labels), 0)

     # The vocabulary must see every sentence before any sentence is indexed.
     self.vocab = utils.Vocab("all_data")
     for sentence in self.input_data:
         self.vocab.add_sentence(sentence)

     indexed = []
     for sentence in self.input_data:
         indexed.append(utils.indexes_from_sentence(self.vocab, sentence))
     self.input_idxs = indexed
예제 #2
0
 def test_read_lang_sents_reversed(self):
     """``read_langs(pairs=False, reverse=True)`` returns the two columns swapped.

     The expected value is built from ``self._text``: each newline-separated
     line is split on tabs, and the second column is expected before the first.
     """
     actual = utils.read_langs(self._data_dir, pairs=False, reverse=True)

     # Split every line once and pick the columns out of the result.
     rows = [line.split("\t") for line in self._text.split("\n")]
     first_column = [row[0] for row in rows]
     second_column = [row[1] for row in rows]

     self.assertEqual(actual, [second_column, first_column])
예제 #3
0
 def test_read_lang_pairs_reversed(self):
     """``read_langs(pairs=True, reverse=True)`` returns per-line swapped pairs.

     The expected value is built from ``self._text``: each newline-separated
     line is split on tabs and contributes ``[second_column, first_column]``.
     """
     actual = utils.read_langs(self._data_dir, pairs=True, reverse=True)

     # Split every line once and pick the columns out of the result.
     rows = [line.split("\t") for line in self._text.split("\n")]
     first_column = [row[0] for row in rows]
     second_column = [row[1] for row in rows]

     expected = []
     for first, second in zip(first_column, second_column):
         expected.append([second, first])

     self.assertEqual(actual, expected)
예제 #4
0
def main():
    """Train the encoder/decoder pair on the "eng"/"fra" data for 10 epochs.

    Steps, in order:

    1. Load both languages with ``read_langs`` and map every space-separated
       token to its ``word2index`` id.
    2. Pad the id sequences with ``pad_sequences`` and split them 80/20 with
       ``train_test_split``.
    3. Build the ``Encoder``/``Decoder`` on CPU with one shared Adam optimizer.
    4. For every batch, run the decoder step by step, feeding the ground-truth
       token of step ``t`` as the input of the next step, and back-propagate
       the summed step losses.

    Progress is printed every 100 batches and once per epoch. Nothing is
    returned and no checkpoint is saved.
    """
    # Local import: the original body called both ``time()`` and
    # ``time.time()``, which cannot both work with a single module-level
    # binding of ``time``. Binding the module here makes the timing calls
    # consistent regardless of how the file imports it.
    import time

    input_lang, output_lang, _pairs, data1, data2 = read_langs("eng", "fra", True)
    input_tensor = [[input_lang.word2index[s] for s in es.split(' ')] for es in data1]
    target_tensor = [[output_lang.word2index[s] for s in es.split(' ')] for es in data2]
    # max_length / pad_sequences are defined elsewhere; presumably they return
    # the longest sequence length and pad one sequence to it -- confirm.
    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)

    input_tensor = [pad_sequences(x, max_length_inp) for x in input_tensor]
    target_tensor = [pad_sequences(x, max_length_tar) for x in target_tensor]
    print(len(target_tensor))

    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
        input_tensor, target_tensor, test_size=0.2)

    # Show length
    print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

    BUFFER_SIZE = len(input_tensor_train)
    BATCH_SIZE = 64
    # Matches the number of batches the loader yields because drop_last=True.
    N_BATCH = BUFFER_SIZE // BATCH_SIZE
    embedding_dim = 256
    units = 1024
    vocab_inp_size = len(input_lang.word2index)
    vocab_tar_size = len(output_lang.word2index)

    train_dataset = MyData(input_tensor_train, target_tensor_train)
    # Built but not consumed below; kept so the validation split stays available.
    val_dataset = MyData(input_tensor_val, target_tensor_val)

    # drop_last=True keeps every batch at exactly BATCH_SIZE rows, which the
    # fixed-size <sos> decoder input below relies on.
    dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                         drop_last=True,
                         shuffle=True)

    device = torch.device("cpu")

    encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
    decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE)

    encoder.to(device)
    decoder.to(device)

    criterion = nn.CrossEntropyLoss()

    # A single optimizer updates the parameters of both networks.
    optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()),
                           lr=0.001)

    EPOCHS = 10

    for epoch in range(EPOCHS):
        start = time.time()

        encoder.train()
        decoder.train()

        # Plain Python float: accumulating the loss tensor itself would keep
        # autograd history of every batch alive until the end of the epoch.
        total_loss = 0.0

        for (batch, (inp, targ, inp_len)) in enumerate(dataset):
            loss = 0

            # sort_batch is defined elsewhere; presumably orders the batch by
            # input length for the encoder -- confirm.
            xs, ys, lens = sort_batch(inp, targ, inp_len)
            enc_output, enc_hidden = encoder(xs.to(device), lens, device)
            dec_hidden = enc_hidden
            # Every row starts decoding from the <sos> token.
            dec_input = torch.tensor([[output_lang.word2index['<sos>']]] * BATCH_SIZE)

            for t in range(1, ys.size(1)):
                predictions, dec_hidden, _ = decoder(dec_input.to(device),
                                                     dec_hidden.to(device),
                                                     enc_output.to(device))
                loss += loss_function(criterion, ys[:, t].to(device), predictions.to(device))
                # Teacher forcing: the true token of step t is the next input.
                dec_input = ys[:, t].unsqueeze(1)

            # Per-batch loss divided by the target width, as a detached float
            # used only for reporting.
            batch_loss = (loss / int(ys.size(1))).item()
            total_loss += batch_loss

            optimizer.zero_grad()

            loss.backward()

            ### UPDATE MODEL PARAMETERS
            optimizer.step()

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                             batch,
                                                             batch_loss))

        ### TODO: Save checkpoint for model
        # max(..., 1) avoids ZeroDivisionError when the training split holds
        # fewer than BATCH_SIZE rows and the loader therefore yields nothing.
        print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                            total_loss / max(N_BATCH, 1)))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))