def main():
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read('data/mt/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/mt/tatoeba.eng_cmn.dev.tsv')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                          hidden_dim=HIDDEN_DIM,
                                          projection_dim=128,
                                          feedforward_hidden_dim=128,
                                          num_layers=1,
                                          num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20  # TODO: make this variable

    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)

    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
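# The snippet above (and most of the examples that follow) targets the AllenNLP 0.x API.
# A minimal sketch of the imports and constants it relies on; the module paths are the
# standard AllenNLP 0.9 locations and the hyperparameter values are copied from the next
# example -- treat this block as an assumption rather than part of the original source.
import itertools

import torch
import torch.optim as optim

from allennlp.data.dataset_readers import Seq2SeqDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import CharacterTokenizer, WordTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
from allennlp.modules.attention import DotProductAttention
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper, StackedSelfAttentionEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.predictors import SimpleSeq2SeqPredictor
from allennlp.training.trainer import Trainer

# Values taken from the following example; CUDA_DEVICE = -1 means CPU.
EN_EMBEDDING_DIM = 256
ZH_EMBEDDING_DIM = 256
HIDDEN_DIM = 256
CUDA_DEVICE = 0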
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.predictors import SimpleSeq2SeqPredictor
from allennlp.training.trainer import Trainer

EN_EMBEDDING_DIM = 256
ZH_EMBEDDING_DIM = 256
HIDDEN_DIM = 256
CUDA_DEVICE = 0

#def main():
reader = Seq2SeqDatasetReader(
    source_tokenizer=WordTokenizer(),
    target_tokenizer=CharacterTokenizer(),
    source_token_indexers={'tokens': SingleIdTokenIndexer()},
    target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})

train_dataset = reader.read('/.../en_el_train.txt')
validation_dataset = reader.read('/.../en_el_dev.txt')

vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                  min_count={'tokens': 3, 'target_tokens': 3})

en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                         embedding_dim=EN_EMBEDDING_DIM)
# encoder = PytorchSeq2SeqWrapper(
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': elmo_token_indexer},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})

    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname)
        for fname in ["train_all_seq.txt", "test_all_seq.txt", "val_all_seq.txt"])

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset + test_dataset,
                                      min_count={'tokens': 1, 'target_tokens': 1})

    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    #                          embedding_dim=256)
    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    #                          embedding_dim=elmo_embedding_dim)
    # elmo_embedder = Elmo(options_file, weight_file, 2, dropout=0.5)
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # word_embeddings = BasicTextFieldEmbedder({'tokens': elmo_embedder})
    source_embedder = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Initializing the model
    max_decoding_steps = 20
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(elmo_embedding_dim, hidden_dim, batch_first=True))
    # encoder = StackedSelfAttentionEncoder(input_dim=elmo_embedding_dim,
    #                                       hidden_dim=hidden_dim,
    #                                       projection_dim=128,
    #                                       feedforward_hidden_dim=128,
    #                                       num_layers=1,
    #                                       num_attention_heads=8)

    attention = DotProductAttention()

    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=elmo_embedding_dim,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)

    if USE_GPU:
        model.cuda()

    # Training the model
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=1,
                      cuda_device=0 if USE_GPU else -1)

    for i in range(20):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(dev_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])

    # Saving the model
    with open("model_seq2seq.th", 'wb') as f:
        torch.save(model.state_dict(), f)
    vocab.save_to_files("vocabulary_seq2seq")

    predictor = SimpleSeq2SeqPredictor(model, reader)
    with open('predict_seq2seq.txt', 'w+') as f:
        for instance in itertools.islice(test_dataset, 10):
            preds = predictor.predict_instance(instance)['predicted_tokens']
            f.write(" ".join(preds) + "\n")
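# The ELMo example above references several names it never defines. A hedged sketch of
# plausible definitions follows; the data root, file paths, and dimension values are
# placeholders (assumptions), not values from the original source -- only the import
# locations are standard AllenNLP 0.9 paths.
import torch

from allennlp.data.token_indexers import ELMoTokenCharactersIndexer
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.modules.token_embedders import ElmoTokenEmbedder

DATA_ROOT = "data"                           # placeholder data directory
USE_GPU = torch.cuda.is_available()

options_file = "path/to/elmo_options.json"   # placeholder: ELMo options file
weight_file = "path/to/elmo_weights.hdf5"    # placeholder: matching ELMo weight file
elmo_embedding_dim = 1024                    # must match the output dim of the ELMo weights used
hidden_dim = 256                             # placeholder LSTM hidden size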
def main(name_file='all_f1',
         train_dir='all',
         test_dir='test',
         dir_files='data/disambiguation/',
         dir_results='results_2/',
         max_length=120,
         cuda_id=0,
         cuda=True,
         n_epochs=9,
         seed=0,
         lr=0.0001):
    dir_train = os.path.join(dir_files, train_dir)
    dir_test = os.path.join(dir_files, test_dir)
    dir_results = os.path.join(dir_results, train_dir, name_file)
    os.makedirs(dir_results, exist_ok=True)

    input_lang, output_lang, pairs_train, pairs_test, senses_per_sentence = prepare_data(
        name_file, 'verbs_selected_lemma', max_length=max_length,
        dir_train=dir_train, dir_test=dir_test)
    selected_synsets = np.load(os.path.join(dir_files, 'selected_synsets.npy'))

    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        # delimiter=',',
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read(os.path.join(dir_train, name_file + '.tsv'))
    validation_dataset = reader.read(os.path.join(dir_test, 'verbs_selected_lemma.tsv'))

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
def main():
    trainFile = "../srcData/trainData.csv"
    validFile = "../srcData/devData.csv"
    testFile = "../srcData/testData.csv"

    trainSeq2SeqFile = data.dataPreparation(trainFile)
    validSeq2SeqFile = data.dataPreparation(validFile)
    testSeq2SeqFile = data.dataPreparation(testFile)
    print(testSeq2SeqFile)

    # A TokenIndexer determines how string tokens are represented as arrays of indices in a model:
    # SingleIdTokenIndexer = each token is a single integer
    # TokenCharactersIndexer = each token is a list of integers

    # Read a tsv file with paired instances (source, target)
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),  # defaults to source_tokenizer
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer()})  # defaults to source_token_indexers

    # Each dataset is a list of instances with (source_tokens, target_tokens) fields
    train_dataset = reader.read(trainSeq2SeqFile)
    validation_dataset = reader.read(validSeq2SeqFile)
    test_dataset = reader.read(testSeq2SeqFile)

    # Finding extra fact2 vocab
    trainExtraVocab = findExtraVocab(train_dataset)
    validExtraVocab = findExtraVocab(validation_dataset)
    testExtraVocab = findExtraVocab(test_dataset)
    finalExtraVocab = list(set(trainExtraVocab + validExtraVocab + testExtraVocab))
    print("length:", len(finalExtraVocab))
    # input()

    # vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
    #                                   min_count={'tokens': 3, 'target_tokens': 3})
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset + test_dataset)
    # Train + Valid = 9703
    # Train + Valid + Test = 10099
    print("Vocab Size:", vocab.get_vocab_size('tokens'))

    encEmbedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=ENC_EMBEDDING_DIM)

    # Embedding for the 'tokens' namespace, since that is the namespace used at dataset creation time
    source_embedder = BasicTextFieldEmbedder({"tokens": encEmbedding})

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(ENC_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True, dropout=0.2))

    attention = DotProductAttention()

    max_decoding_steps = 4  # TODO: make this variable

    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=TGT_EMBEDDING_DIM,
                          # target_namespace='target_tokens',
                          attention=attention,
                          beam_size=beamSize,
                          use_bleu=True,
                          extra_vocab=finalExtraVocab)

    # Can also specify lr=0.001
    optimizer = optim.Adam(model.parameters())

    # Data iterator that specifies how to batch the dataset:
    # takes the data, shuffles it, and creates fixed-size batches.
    # iterator = BasicIterator(batch_size=2)
    # iterator.index_with(vocab)

    # BucketIterator pads batches w.r.t. the max input length per batch and sorts the dataset
    # by the provided field names and padding keys for efficient computation.
    iterator = BucketIterator(batch_size=50,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      # patience=3,
                      num_epochs=numEpochs,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    predictor = SimpleSeq2SeqPredictor(model, reader)

    '''for i in range(2):
        print("Epoch: {}".format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
            """{'predictions': [[1, 4, 5, 92, 8, 6, 1, 8, 6, 26, 3]],
                'loss': 5.9835076332092285,
                'class_log_probabilities': [-20.10894012451172],
                'predicted_tokens': ['@@UNKNOWN@@', 'is', 'a', 'type', 'of', 'the', '@@UNKNOWN@@', 'of', 'the', 'sun']}"""
            print(predictor.predict_instance(instance))
    '''

    outFile = open("output_" + str(HIDDEN_DIM) + "_" + str(numEpochs) + "_" +
                   str(beamSize) + ".csv", "w")
    writer = csv.writer(outFile, delimiter="\t")
    for instance in itertools.islice(test_dataset, 500):
        src = instance.fields['source_tokens'].tokens
        gold = instance.fields['target_tokens'].tokens
        pred = predictor.predict_instance(instance)['predicted_tokens']
        writer.writerow([src, gold, pred])
    outFile.close()
def __init__(self, training=False):
    self.training = training

    config = conf['seq2seq_allen']
    self.model_path = config['model_path']
    self.vocab_path = config['vocab_path']
    prefix = config['processed_data_prefix']
    train_file = config['train_data']
    valid_file = config['test_data']
    src_embedding_dim = config['src_embedding_dim']
    trg_embedding_dim = config['trg_embedding_dim']
    hidden_dim = config['hidden_dim']
    epoch = config['epoch']
    patience = config['patience']

    if torch.cuda.is_available():
        self.cuda_device = 0
    else:
        self.cuda_device = -1

    self.reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer()})

    if self.training:
        self.train_dataset = self.reader.read(os.path.join(prefix, train_file))
        self.valid_dataset = self.reader.read(os.path.join(prefix, valid_file))
        self.vocab = Vocabulary.from_instances(self.train_dataset + self.valid_dataset,
                                               min_count={'tokens': 3})
    else:
        self.vocab = Vocabulary.from_files(self.vocab_path)

    src_embedding = Embedding(num_embeddings=self.vocab.get_vocab_size('tokens'),
                              embedding_dim=src_embedding_dim)

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(src_embedding_dim, hidden_dim, batch_first=True))

    source_embedder = BasicTextFieldEmbedder({"tokens": src_embedding})

    self.model = SimpleSeq2Seq(vocab=self.vocab,
                               source_embedder=source_embedder,
                               encoder=encoder,
                               max_decoding_steps=20,
                               target_embedding_dim=trg_embedding_dim,
                               use_bleu=True)

    optimizer = optim.Adam(self.model.parameters())
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    # The iterator needs the vocab so it can index the data at training time.
    iterator.index_with(self.vocab)

    if self.cuda_device >= 0:
        self.model.cuda(self.cuda_device)

    if training:
        self.trainer = Trainer(model=self.model,
                               optimizer=optimizer,
                               iterator=iterator,
                               patience=patience,
                               train_dataset=self.train_dataset,
                               validation_dataset=self.valid_dataset,
                               serialization_dir=self.model_path,
                               num_epochs=epoch,
                               cuda_device=self.cuda_device)

    if not self.training:
        with open(os.path.join(self.model_path, 'best.th'), 'rb') as f:
            self.model.load_state_dict(torch.load(f))
        if self.cuda_device >= 0:
            self.model.cuda(self.cuda_device)
        # Put the module into eval mode when not training.
        self.model.training = self.training

    self.predictor = Seq2SeqPredictor(self.model, dataset_reader=self.reader)
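# A minimal usage sketch for the class whose __init__ is shown above. The class name
# (Seq2SeqBot) and the input sentence are hypothetical, and saving the vocabulary after
# training is an assumption inferred from the Vocabulary.from_files() call in the
# inference branch; the predict() call uses the standard Seq2SeqPredictor API.
if __name__ == '__main__':
    bot = Seq2SeqBot(training=True)          # builds datasets, vocab, model, and trainer
    bot.trainer.train()                      # checkpoints go to model_path
    bot.vocab.save_to_files(bot.vocab_path)  # so the inference path can reload it

    inference = Seq2SeqBot(training=False)   # reloads best.th and the saved vocabulary
    print(inference.predictor.predict("hello there")['predicted_tokens'])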
def main():
    target_namespace = "target_tokens"

    if not USE_COPY:
        reader = Seq2SeqDatasetReader(
            source_tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={'tokens': SingleIdTokenIndexer(namespace=target_namespace)})
    else:
        reader = CopyNetDatasetReader(
            source_tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
            target_namespace=target_namespace)

    train_dataset = reader.read('./data/data_train.tsv')
    validation_dataset = reader.read('./data/data_val.tsv')

    vocab = Vocabulary.from_instances(train_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=SRC_EMBEDDING_DIM,
                             pretrained_file="../opennmt/glove_dir/glove.840B.300d.txt")
    assert en_embedding.weight.requires_grad
    datas = _read_pretrained_embeddings_file(en_embedding._pretrained_file,
                                             SRC_EMBEDDING_DIM, vocab)
    datas.requires_grad = True
    en_embedding.weight.data = datas
    print(en_embedding.weight.data)
    assert en_embedding.weight.requires_grad

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(SRC_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True,
                      bidirectional=True, dropout=0.3, num_layers=1))
    # encoder = StackedSelfAttentionEncoder(input_dim=SRC_EMBEDDING_DIM,
    #                                       hidden_dim=HIDDEN_DIM,
    #                                       projection_dim=128,
    #                                       feedforward_hidden_dim=128,
    #                                       num_layers=1,
    #                                       num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})
    attention = DotProductAttention()

    if not USE_COPY:
        model = SimpleSeq2Seq(vocab, source_embedder, encoder, MAX_DECODING_STEPS,
                              target_embedding_dim=TGT_EMBEDDING_DIM,
                              target_namespace='target_tokens',
                              attention=attention,
                              beam_size=8,
                              use_bleu=True)
    else:
        model = MyCopyNet(vocab, source_embedder, encoder,
                          max_decoding_steps=MAX_DECODING_STEPS,
                          target_embedding_dim=TGT_EMBEDDING_DIM,
                          target_namespace=target_namespace,
                          attention=attention,
                          beam_size=8,
                          tgt_embedder_pretrain_file="../opennmt/glove_dir/glove.840B.300d.txt")

    model.to(torch.device('cuda'))
    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=64,
                              sorting_keys=[("source_tokens", "num_tokens")],
                              padding_noise=0.2)
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=22,
                      patience=4,
                      serialization_dir="./checkpoints",
                      cuda_device=CUDA_DEVICE,
                      summary_interval=100)
    trainer.train()
    print(en_embedding.weight.data)

    predictor = Seq2SeqPredictor(model, reader)

    # Dump all predictions to a file
    # TODO (DNGros): Is there an automatic way in allennlp to do this??
    pred_toks = []
    with open("pred.txt", "w") as outfile:
        for instance in tqdm(validation_dataset):
            pred = predictor.predict_instance(instance)
            toks = pred['predicted_tokens']
            if toks:
                outfile.write(" ".join(toks[0]) + "\n")
            else:
                outfile.write("\n")
def __init__(self):
    config = conf['seq2seq_allen']
    prefix = config['processed_data_prefix']
    train_file = config['train_data']
    valid_file = config['valid_data']
    src_embedding_dim = config['src_embedding_dim']
    trg_embedding_dim = config['trg_embedding_dim']
    hidden_dim = config['hidden_dim']

    if torch.cuda.is_available():
        cuda_device = 0
    else:
        cuda_device = -1

    self.reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})

    self.train_dataset = self.reader.read(os.path.join(prefix, train_file))
    self.valid_dataset = self.reader.read(os.path.join(prefix, valid_file))

    vocab = Vocabulary.from_instances(self.train_dataset + self.valid_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    src_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                              embedding_dim=src_embedding_dim)

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(src_embedding_dim, hidden_dim, batch_first=True))

    source_embedder = BasicTextFieldEmbedder({"tokens": src_embedding})

    attention = LinearAttention(hidden_dim, hidden_dim,
                                activation=Activation.by_name('tanh')())

    self.model = SimpleSeq2Seq(vocab=vocab,
                               source_embedder=source_embedder,
                               encoder=encoder,
                               max_decoding_steps=20,
                               target_embedding_dim=trg_embedding_dim,
                               target_namespace='target_tokens',
                               attention=attention,  # pass attention
                               use_bleu=True)

    optimizer = optim.Adam(self.model.parameters())
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    # The iterator needs the vocab so it can index the data at training time.
    iterator.index_with(vocab)

    if cuda_device >= 0:
        self.model.cuda(cuda_device)

    self.trainer = Trainer(model=self.model,
                           optimizer=optimizer,
                           iterator=iterator,
                           patience=10,
                           validation_metric="+accuracy",
                           train_dataset=self.train_dataset,
                           validation_dataset=self.valid_dataset,
                           num_epochs=1,
                           cuda_device=cuda_device)
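# A hypothetical driver for the constructor above (the class name, Translator, is an
# assumption; the original only shows __init__). It runs the single configured epoch
# and then spot-checks a few validation instances with SimpleSeq2SeqPredictor, mirroring
# the prediction loops used in the earlier examples.
if __name__ == '__main__':
    translator = Translator()
    translator.trainer.train()

    predictor = SimpleSeq2SeqPredictor(translator.model, translator.reader)
    for instance in itertools.islice(translator.valid_dataset, 5):
        print('SOURCE:', instance.fields['source_tokens'].tokens)
        print('PRED:  ', predictor.predict_instance(instance)['predicted_tokens'])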