Example #1
def main():
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read('data/mt/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/mt/tatoeba.eng_cmn.dev.tsv')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                          hidden_dim=HIDDEN_DIM,
                                          projection_dim=128,
                                          feedforward_hidden_dim=128,
                                          num_layers=1,
                                          num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20   # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32, sorting_keys=[("source_tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
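Note: Example #1 assumes imports and hyperparameter constants defined outside the snippet. A minimal sketch of what it appears to rely on (module paths as of AllenNLP 0.x; the constant values follow Example #2):

import itertools

import torch
from torch import optim

from allennlp.data.dataset_readers import Seq2SeqDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import CharacterTokenizer, WordTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
from allennlp.modules.attention import DotProductAttention
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper, StackedSelfAttentionEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.predictors import SimpleSeq2SeqPredictor
from allennlp.training.trainer import Trainer

EN_EMBEDDING_DIM = 256
ZH_EMBEDDING_DIM = 256
HIDDEN_DIM = 256
CUDA_DEVICE = 0  # -1 to train on CPU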
Example #2
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.predictors import SimpleSeq2SeqPredictor
from allennlp.training.trainer import Trainer

EN_EMBEDDING_DIM = 256
ZH_EMBEDDING_DIM = 256
HIDDEN_DIM = 256

CUDA_DEVICE = 0

#def main():
reader = Seq2SeqDatasetReader(
    source_tokenizer=WordTokenizer(),
    target_tokenizer=CharacterTokenizer(),
    source_token_indexers={'tokens': SingleIdTokenIndexer()},
    target_token_indexers={
        'tokens': SingleIdTokenIndexer(namespace='target_tokens')
    })
train_dataset = reader.read('/.../en_el_train.txt')
validation_dataset = reader.read('/.../en_el_dev.txt')

vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                  min_count={
                                      'tokens': 3,
                                      'target_tokens': 3
                                  })

en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                         embedding_dim=EN_EMBEDDING_DIM)
# encoder = PytorchSeq2SeqWrapper(
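The snippet above is cut off at the encoder comment. Based on Example #1, which builds the same pipeline, it presumably continues along these lines (a sketch, not the original code):

# encoder = PytorchSeq2SeqWrapper(
#     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                      hidden_dim=HIDDEN_DIM,
                                      projection_dim=128,
                                      feedforward_hidden_dim=128,
                                      num_layers=1,
                                      num_attention_heads=8)

source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})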
Example #3
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': elmo_token_indexer},
        target_token_indexers={
            'tokens': SingleIdTokenIndexer(namespace='target_tokens')
        })

    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname) for fname in
        ["train_all_seq.txt", "test_all_seq.txt", "val_all_seq.txt"])

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset +
                                      test_dataset,
                                      min_count={
                                          'tokens': 1,
                                          'target_tokens': 1
                                      })

    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    #                              embedding_dim=256)
    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    # embedding_dim=elmo_embedding_dim)
    #elmo_embedder = Elmo(options_file, weight_file, 2, dropout=0.5)
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # word_embeddings = BasicTextFieldEmbedder({'tokens': elmo_embedder})
    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    # embedding_dim=256)
    source_embedder = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    #Initializing the model
    max_decoding_steps = 20
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(elmo_embedding_dim, hidden_dim, batch_first=True))

    # encoder = StackedSelfAttentionEncoder(input_dim=elmo_embedding_dim, hidden_dim=hidden_dim, projection_dim=128, feedforward_hidden_dim=128, num_layers=1, num_attention_heads=8)
    attention = DotProductAttention()

    model = SimpleSeq2Seq(vocab,
                          source_embedder,
                          encoder,
                          max_decoding_steps,
                          target_embedding_dim=elmo_embedding_dim,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)

    if USE_GPU:
        model.cuda()

    # Training the model
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=1,
                      cuda_device=0 if USE_GPU else -1)

    for i in range(20):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(dev_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:',
                  predictor.predict_instance(instance)['predicted_tokens'])

    #Saving the model
    with open("model_seq2seq.th", 'wb') as f:
        torch.save(model.state_dict(), f)

    vocab.save_to_files("vocabulary_seq2seq")
    predictor = SimpleSeq2SeqPredictor(model, reader)
    with open('predict_seq2seq.txt', 'w+') as f:
        for instance in itertools.islice(test_dataset, 10):
            preds = predictor.predict_instance(instance)['predicted_tokens']
            f.write(" ".join(preds) + "\n")
Example #4
def main(name_file='all_f1', train_dir='all', test_dir='test', dir_files='data/disambiguation/', dir_results='results_2/', max_length=120, cuda_id=0, cuda=True, n_epochs=9, seed=0, lr=0.0001):
    
    dir_train = os.path.join(dir_files, train_dir)
    dir_test = os.path.join(dir_files, test_dir)
    dir_results = os.path.join(dir_results, train_dir, name_file)
    os.makedirs(dir_results, exist_ok=True)
    
    input_lang, output_lang, pairs_train, pairs_test, senses_per_sentence = prepare_data(name_file, 'verbs_selected_lemma', max_length=max_length, dir_train=dir_train, dir_test=dir_test)
    selected_synsets = np.load(os.path.join(dir_files, 'selected_synsets.npy'))

    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read(os.path.join(dir_train, name_file + '.tsv'))
    validation_dataset = reader.read(os.path.join(dir_test, 'verbs_selected_lemma.tsv'))

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
Example #5
def main():

    trainFile = "../srcData/trainData.csv"
    validFile = "../srcData/devData.csv"
    testFile = "../srcData/testData.csv"
    trainSeq2SeqFile = data.dataPreparation(trainFile)
    validSeq2SeqFile = data.dataPreparation(validFile)
    testSeq2SeqFile = data.dataPreparation(testFile)
    print(testSeq2SeqFile)
    # TokenIndexer determines how string tokens get represented as arrays of indices in a model
    # SingleIdTokenIndexer = each token is a single integer
    # TokenCharactersIndexer = each token is a list of character integers
    # Read a tsv file with paired instances (source, target)
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),  # Defaults to source_tokenizer
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer()
                               }  # Defaults to source_token_indexers
    )

    # Each dataset is a list of instances, each with source_tokens and target_tokens fields
    train_dataset = reader.read(trainSeq2SeqFile)
    validation_dataset = reader.read(validSeq2SeqFile)
    test_dataset = reader.read(testSeq2SeqFile)

    # Finding extra fact2 vocab
    trainExtraVocab = findExtraVocab(train_dataset)
    validExtraVocab = findExtraVocab(validation_dataset)
    testExtraVocab = findExtraVocab(test_dataset)
    finalExtraVocab = list(
        set(trainExtraVocab + validExtraVocab + testExtraVocab))
    print("length:", len(finalExtraVocab))
    #input()

    #vocab = Vocabulary.from_instances(train_dataset + validation_dataset, min_count={'tokens': 3, 'target_tokens': 3})
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset +
                                      test_dataset)
    # Train + Valid = 9703
    # Train + Valid + Test = 10099

    print("Vocab SIze :", vocab.get_vocab_size('tokens'))

    encEmbedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=ENC_EMBEDDING_DIM)

    # Embed the 'tokens' namespace, matching the indexer name used when the dataset was created
    source_embedder = BasicTextFieldEmbedder({"tokens": encEmbedding})

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(ENC_EMBEDDING_DIM,
                      HIDDEN_DIM,
                      batch_first=True,
                      dropout=0.2))

    attention = DotProductAttention()

    max_decoding_steps = 4  # TODO: make this variable
    model = SimpleSeq2Seq(
        vocab,
        source_embedder,
        encoder,
        max_decoding_steps,
        target_embedding_dim=TGT_EMBEDDING_DIM,
        #target_namespace = 'target_tokens',
        attention=attention,
        beam_size=beamSize,
        use_bleu=True,
        extra_vocab=finalExtraVocab)
    #Can also specify lr=0.001
    optimizer = optim.Adam(model.parameters())

    # Data iterator that specifies how to batch the dataset:
    # it shuffles the data and creates fixed-size batches
    #iterator = BasicIterator(batch_size=2)
    #iterator.index_with(vocab)
    # BucketIterator pads each batch to that batch's max input length and sorts the dataset
    # by the provided field names and padding keys for more efficient computation
    iterator = BucketIterator(batch_size=50,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        #patience = 3,
        num_epochs=numEpochs,
        cuda_device=CUDA_DEVICE)

    trainer.train()
    predictor = SimpleSeq2SeqPredictor(model, reader)
    '''for i in range(2):
        print ("Epoch: {}".format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)


        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
            """'{'predictions': [[1, 4, 5, 92, 8, 6, 1, 8, 6, 26, 3]], 
             'loss': 5.9835076332092285,
             'class_log_probabilities': [-20.10894012451172],
             'predicted_tokens': ['@@UNKNOWN@@', 'is', 'a', 'type', 'of', 'the', '@@UNKNOWN@@', 'of', 'the', 'sun']}
             """
            print (predictor.predict_instance(instance))
    '''

    outFile = open(
        "output_" + str(HIDDEN_DIM) + "_" + str(numEpochs) + "_" +
        str(beamSize) + ".csv", "w")
    writer = csv.writer(outFile, delimiter="\t")
    for instance in itertools.islice(test_dataset, 500):
        src = instance.fields['source_tokens'].tokens
        gold = instance.fields['target_tokens'].tokens
        pred = predictor.predict_instance(instance)['predicted_tokens']
        writer.writerow([src, gold, pred])

    outFile.close()
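Example #5 also leaves several names undefined: the data helper module, findExtraVocab, and the hyperparameter constants. Note that extra_vocab does not appear to be a parameter of the stock SimpleSeq2Seq, so the snippet presumably uses a locally modified model class. A sketch of the constants it assumes (all values are guesses, not taken from the original):

ENC_EMBEDDING_DIM = 256   # assumption
TGT_EMBEDDING_DIM = 256   # assumption
HIDDEN_DIM = 256          # assumption
numEpochs = 10            # assumption
beamSize = 8              # assumption
CUDA_DEVICE = 0 if torch.cuda.is_available() else -1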
Example #6
    def __init__(self, training=False):
        self.training = training
        config = conf['seq2seq_allen']
        self.model_path = config['model_path']
        self.vocab_path = config['vocab_path']
        prefix = config['processed_data_prefix']
        train_file = config['train_data']
        valid_file = config['test_data']
        src_embedding_dim = config['src_embedding_dim']
        trg_embedding_dim = config['trg_embedding_dim']
        hidden_dim = config['hidden_dim']
        epoch = config['epoch']
        patience = config['patience']

        if torch.cuda.is_available():
            self.cuda_device = 0
        else:
            self.cuda_device = -1

        self.reader = Seq2SeqDatasetReader(
            source_tokenizer=WordTokenizer(),
            target_tokenizer=WordTokenizer(),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={'tokens': SingleIdTokenIndexer()})

        if self.training:
            self.train_dataset = self.reader.read(
                os.path.join(prefix, train_file))
            self.valid_dataset = self.reader.read(
                os.path.join(prefix, valid_file))

            self.vocab = Vocabulary.from_instances(self.train_dataset +
                                                   self.valid_dataset,
                                                   min_count={'tokens': 3})
        else:
            self.vocab = Vocabulary.from_files(self.vocab_path)

        src_embedding = Embedding(
            num_embeddings=self.vocab.get_vocab_size('tokens'),
            embedding_dim=src_embedding_dim)

        encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(src_embedding_dim, hidden_dim, batch_first=True))

        source_embedder = BasicTextFieldEmbedder({"tokens": src_embedding})

        self.model = SimpleSeq2Seq(vocab=self.vocab,
                                   source_embedder=source_embedder,
                                   encoder=encoder,
                                   max_decoding_steps=20,
                                   target_embedding_dim=trg_embedding_dim,
                                   use_bleu=True)

        optimizer = optim.Adam(self.model.parameters())
        iterator = BucketIterator(batch_size=32,
                                  sorting_keys=[("source_tokens", "num_tokens")])
        # The iterator needs the vocab so it can index the data during training
        iterator.index_with(self.vocab)

        if self.cuda_device >= 0:
            self.model.cuda(self.cuda_device)

        if training:
            self.trainer = Trainer(model=self.model,
                                   optimizer=optimizer,
                                   iterator=iterator,
                                   patience=patience,
                                   train_dataset=self.train_dataset,
                                   validation_dataset=self.valid_dataset,
                                   serialization_dir=self.model_path,
                                   num_epochs=epoch,
                                   cuda_device=self.cuda_device)

        if not self.training:
            with open(os.path.join(self.model_path, 'best.th'), 'rb') as f:
                self.model.load_state_dict(torch.load(f))
            if self.cuda_device >= 0:
                self.model.cuda(self.cuda_device)
            self.model.eval()
            self.predictor = Seq2SeqPredictor(self.model,
                                              dataset_reader=self.reader)
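Only __init__ is shown in Example #6, so here is a hypothetical usage sketch; the wrapper class name Seq2SeqAllen is an assumption taken from the config key, and none of these calls appear in the original snippet:

translator = Seq2SeqAllen(training=True)                 # builds datasets, model, and trainer
translator.trainer.train()                               # fit on train_data, validate on test_data
translator.vocab.save_to_files(translator.vocab_path)    # persist the vocabulary for inference

scorer = Seq2SeqAllen(training=False)                    # loads best.th and the saved vocabulary
print(scorer.predictor.predict("some source sentence"))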
Example #7
def main():
    target_namespace = "target_tokens"
    if not USE_COPY:
        reader = Seq2SeqDatasetReader(
            source_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={
                'tokens': SingleIdTokenIndexer(namespace=target_namespace)
            })
    else:
        reader = CopyNetDatasetReader(
            source_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            target_namespace=target_namespace)
    train_dataset = reader.read('./data/data_train.tsv')
    validation_dataset = reader.read('./data/data_val.tsv')

    vocab = Vocabulary.from_instances(train_dataset,
                                      min_count={
                                          'tokens': 3,
                                          'target_tokens': 3
                                      })

    en_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=SRC_EMBEDDING_DIM,
        pretrained_file="../opennmt/glove_dir/glove.840B.300d.txt")
    assert en_embedding.weight.requires_grad
    datas = _read_pretrained_embeddings_file(en_embedding._pretrained_file,
                                             SRC_EMBEDDING_DIM, vocab)
    datas.requires_grad = True
    en_embedding.weight.data = datas
    print(en_embedding.weight.data)
    assert en_embedding.weight.requires_grad
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(SRC_EMBEDDING_DIM,
                      HIDDEN_DIM,
                      batch_first=True,
                      bidirectional=True,
                      dropout=0.3,
                      num_layers=1))
    #encoder = StackedSelfAttentionEncoder(input_dim=SRC_EMBEDDING_DIM,
    #                                      hidden_dim=HIDDEN_DIM,
    #                                      projection_dim=128, feedforward_hidden_dim=128,
    #                                      num_layers=1, num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})
    attention = DotProductAttention()

    if not USE_COPY:
        model = SimpleSeq2Seq(vocab,
                              source_embedder,
                              encoder,
                              MAX_DECODING_STEPS,
                              target_embedding_dim=TGT_EMBEDDING_DIM,
                              target_namespace='target_tokens',
                              attention=attention,
                              beam_size=8,
                              use_bleu=True)
    else:
        model = MyCopyNet(vocab,
                          source_embedder,
                          encoder,
                          max_decoding_steps=MAX_DECODING_STEPS,
                          target_embedding_dim=TGT_EMBEDDING_DIM,
                          target_namespace=target_namespace,
                          attention=attention,
                          beam_size=8,
                          tgt_embedder_pretrain_file=
                          "../opennmt/glove_dir/glove.840B.300d.txt")
    model.to(torch.device('cuda'))
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=64,
                              sorting_keys=[("source_tokens", "num_tokens")],
                              padding_noise=0.2)

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=22,
                      patience=4,
                      serialization_dir="./checkpoints",
                      cuda_device=CUDA_DEVICE,
                      summary_interval=100)
    trainer.train()
    print(en_embedding.weight.data)
    predictor = Seq2SeqPredictor(model, reader)

    # Dump all predictions to a file
    # TODO (DNGros): Is there an automatic way in allennlp to do this??
    pred_toks = []
    with open("pred.txt", "w") as outfile:
        for instance in tqdm(validation_dataset):
            pred = predictor.predict_instance(instance)
            toks = pred['predicted_tokens']
            if toks:
                outfile.write(" ".join(toks[0]) + "\n")
            else:
                outfile.write("" + "\n")
Example #8
    def __init__(self):
        config = conf['seq2seq_allen']
        prefix = config['processed_data_prefix']
        train_file = config['train_data']
        valid_file = config['valid_data']
        src_embedding_dim = config['src_embedding_dim']
        trg_embedding_dim = config['trg_embedding_dim']
        hidden_dim = config['hidden_dim']

        if torch.cuda.is_available():
            cuda_device = 0
        else:
            cuda_device = -1

        self.reader = Seq2SeqDatasetReader(
            source_tokenizer=WordTokenizer(),
            target_tokenizer=WordTokenizer(),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={
                'tokens': SingleIdTokenIndexer(namespace='target_tokens')
            })

        self.train_dataset = self.reader.read(os.path.join(prefix, train_file))
        self.valid_dataset = self.reader.read(os.path.join(prefix, valid_file))

        vocab = Vocabulary.from_instances(self.train_dataset +
                                          self.valid_dataset,
                                          min_count={
                                              'tokens': 3,
                                              'target_tokens': 3
                                          })

        src_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=src_embedding_dim)

        encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(src_embedding_dim, hidden_dim, batch_first=True))

        source_embedder = BasicTextFieldEmbedder({"tokens": src_embedding})

        attention = LinearAttention(hidden_dim,
                                    hidden_dim,
                                    activation=Activation.by_name('tanh')())

        self.model = SimpleSeq2Seq(
            vocab=vocab,
            source_embedder=source_embedder,
            encoder=encoder,
            max_decoding_steps=20,
            target_embedding_dim=trg_embedding_dim,
            target_namespace='target_tokens',
            attention=attention,  # pass attention
            use_bleu=True)

        optimizer = optim.Adam(self.model.parameters())
        iterator = BucketIterator(batch_size=32,
                                  sorting_keys=[("source_tokens", "num_tokens")])
        # The iterator needs the vocab so it can index the data during training
        iterator.index_with(vocab)

        if cuda_device >= 0:
            self.model.cuda(cuda_device)

        self.trainer = Trainer(model=self.model,
                               optimizer=optimizer,
                               iterator=iterator,
                               patience=10,
                               validation_metric="+accuracy",
                               train_dataset=self.train_dataset,
                               validation_dataset=self.valid_dataset,
                               num_epochs=1,
                               cuda_device=cuda_device)
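As in Example #6, only the constructor is shown; training is presumably triggered by a companion method along these lines (the method name is an assumption):

    def train(self):
        # Hypothetical companion method: run the Trainer configured in __init__.
        return self.trainer.train()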