Example #1
 def _makeToyTaggerData(self):
     data_config = {}
     features_file = test_util.make_data_file(
         os.path.join(self.get_temp_dir(), "src.txt"),
         ["M . Smith went to Washington .", "I live in New Zealand ."],
     )
     labels_file = test_util.make_data_file(
         os.path.join(self.get_temp_dir(), "labels.txt"),
         ["B-PER I-PER E-PER O O S-LOC O", "O O O B-LOC E-LOC O"],
     )
     data_config["source_vocabulary"] = test_util.make_vocab_from_file(
         os.path.join(self.get_temp_dir(), "src_vocab.txt"), features_file)
     data_config["target_vocabulary"] = test_util.make_data_file(
         os.path.join(self.get_temp_dir(), "labels_vocab.txt"),
         [
             "O",
             "B-LOC",
             "I-LOC",
             "E-LOC",
             "S-LOC",
             "B-PER",
             "I-PER",
             "E-PER",
             "S-PER",
         ],
     )
     return features_file, labels_file, data_config
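The helper returns the two file paths plus a data_config that can be dropped into a training configuration. A minimal sketch of a consumer inside the same test case; the hyperparameters here are assumptions (mirroring the runner setup in Example #6), not the test's actual values:

    # Hypothetical consumer of the helper above (values illustrative).
    features_file, labels_file, data_config = self._makeToyTaggerData()
    config = {
        "data": dict(
            data_config,
            train_features_file=features_file,
            train_labels_file=labels_file,
        ),
        "params": {"optimizer": "Adam", "learning_rate": 0.0005},
        "train": {"batch_size": 2, "max_step": 2},
    }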
Example #2
 def _makeToyEnDeData(self, with_alignments=False):
   data_config = {}
   features_file = test_util.make_data_file(
       os.path.join(self.get_temp_dir(), "src.txt"),
       ["Parliament Does Not Support Amendment Freeing Tymoshenko",
        "Today , the Ukraine parliament dismissed , within the Code of Criminal Procedure "
        "amendment , the motion to revoke an article based on which the opposition leader , "
        "Yulia Tymoshenko , was sentenced .",
        "The amendment that would lead to freeing the imprisoned former Prime Minister was "
        "revoked during second reading of the proposal for mitigation of sentences for "
        "economic offences ."])
   labels_file = test_util.make_data_file(
       os.path.join(self.get_temp_dir(), "tgt.txt"),
       ["Keine befreiende Novelle für Tymoshenko durch das Parlament",
        "Das ukrainische Parlament verweigerte heute den Antrag , im Rahmen einer Novelle "
        "des Strafgesetzbuches denjenigen Paragrafen abzuschaffen , auf dessen Grundlage die "
        "Oppositionsführerin Yulia Timoshenko verurteilt worden war .",
        "Die Neuregelung , die den Weg zur Befreiung der inhaftierten Expremierministerin hätte "
        "ebnen können , lehnten die Abgeordneten bei der zweiten Lesung des Antrags auf Milderung "
        "der Strafen für wirtschaftliche Delikte ab ."])
   data_config["source_vocabulary"] = test_util.make_vocab_from_file(
       os.path.join(self.get_temp_dir(), "src_vocab.txt"), features_file)
   data_config["target_vocabulary"] = test_util.make_vocab_from_file(
       os.path.join(self.get_temp_dir(), "tgt_vocab.txt"), labels_file)
   if with_alignments:
     # Dummy and incomplete alignments.
     data_config["train_alignments"] = test_util.make_data_file(
         os.path.join(self.get_temp_dir(), "aligne.txt"),
         ["0-0 1-0 2-2 3-4 4-4 5-6",
          "0-1 1-1 1-3 2-3 4-4",
          "0-0 1-0 2-2 3-4 4-4 5-6"])
   return features_file, labels_file, data_config
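The alignment lines use the common "i-j" (Pharaoh) format: each pair links source position i to target position j. A minimal parser, illustrative only and not part of test_util:

    # Illustrative helper: parse one alignment line into index pairs.
    def parse_alignment_line(line):
        pairs = []
        for token in line.split():
            src, tgt = token.split("-")
            pairs.append((int(src), int(tgt)))
        return pairs

    parse_alignment_line("0-0 1-0 2-2")  # -> [(0, 0), (1, 0), (2, 2)]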
Example #3
 def testSequenceToSequenceInputter(self):
     source_vocabulary = test_util.make_data_file(
         os.path.join(self.get_temp_dir(), "src_vocab.txt"),
         ["<blank>", "<s>", "</s>", "a", "b", "c", "d"],
     )
     target_vocabulary = test_util.make_data_file(
         os.path.join(self.get_temp_dir(), "tgt_vocab.txt"),
         ["<blank>", "<s>", "</s>", "e", "f", "g", "h"],
     )
     source_file = test_util.make_data_file(
         os.path.join(self.get_temp_dir(), "src.txt"),
         ["a c c", "b d", "a e"])
     target_file = test_util.make_data_file(
         os.path.join(self.get_temp_dir(), "tgt.txt"),
         ["f h g", "e h", "a e"])
     inputter = sequence_to_sequence.SequenceToSequenceInputter(
         text_inputter.WordEmbedder(embedding_size=20),
         text_inputter.WordEmbedder(embedding_size=20),
     )
     inputter.initialize(
         dict(source_vocabulary=source_vocabulary,
              target_vocabulary=target_vocabulary))
     dataset = inputter.make_dataset([source_file, target_file])
     element = next(iter(dataset))
     features, labels = inputter.make_features(element)
     self.assertIn("ids_out", labels)
     self.assertAllEqual(labels["ids"], [1, 4, 6, 5])
     self.assertAllEqual(labels["ids_out"], [4, 6, 5, 2])
     self.assertEqual(labels["length"], 4)
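The expected values follow directly from the target vocabulary order (<blank>=0, <s>=1, </s>=2, e=3, f=4, g=5, h=6): the first target line "f h g" maps to [4, 6, 5], ids prepends the start token, and ids_out appends the end token, giving the shifted pair used for teacher forcing. A pure-Python check of that arithmetic:

    # Worked check of the assertions above (no TensorFlow needed).
    tgt_vocab = ["<blank>", "<s>", "</s>", "e", "f", "g", "h"]
    ids = [tgt_vocab.index(t) for t in "f h g".split()]  # [4, 6, 5]
    assert [1] + ids == [1, 4, 6, 5]  # labels["ids"]: <s> prepended
    assert ids + [2] == [4, 6, 5, 2]  # labels["ids_out"]: </s> appended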
Example #4
 def _makeToyClassifierData(self):
   data_config = {}
   features_file = test_util.make_data_file(
       os.path.join(self.get_temp_dir(), "src.txt"),
       ["This product was not good at all , it broke on the first use !",
        "Perfect , it does everything I need .",
        "How do I change the battery ?"])
   labels_file = test_util.make_data_file(
       os.path.join(self.get_temp_dir(), "labels.txt"), ["negative", "positive", "neutral"])
   data_config["source_vocabulary"] = test_util.make_vocab_from_file(
       os.path.join(self.get_temp_dir(), "src_vocab.txt"), features_file)
   data_config["target_vocabulary"] = test_util.make_data_file(
       os.path.join(self.get_temp_dir(), "labels_vocab.txt"), ["negative", "positive", "neutral"])
   return features_file, labels_file, data_config
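Note the two different helpers: make_vocab_from_file derives a vocabulary from a corpus, while make_data_file just writes the given lines verbatim, which is enough for a small fixed label set. A plausible reimplementation of make_vocab_from_file, for illustration; the real test_util helper may differ (for example in token ordering or special-token handling):

    # Plausible sketch only, not OpenNMT-tf's actual implementation.
    def make_vocab_from_file(path, data_file):
        words = set()
        with open(data_file) as data:
            for line in data:
                words.update(line.split())
        with open(path, "w") as vocab:
            for word in sorted(words):
                vocab.write(word + "\n")
        return path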
Example #5
 def testDatasetSize(self):
     path = test_util.make_data_file(
         os.path.join(self.get_temp_dir(), "file.txt"),
         list(map(str, range(15))))
     dataset = tf.data.TextLineDataset(path)
     size = dataset_util.get_dataset_size(dataset)
     self.assertEqual(self.evaluate(size), 15)
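Computing the size requires a full pass over the dataset. One way such an op could be built from tf.data primitives; a sketch under that assumption, not necessarily the actual dataset_util code:

    # Sketch: count elements by folding over the dataset.
    import tensorflow as tf

    def get_dataset_size(dataset):
        return dataset.reduce(
            tf.constant(0, dtype=tf.int64), lambda count, _: count + 1)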
Example #6
 def testTrainLanguageModel(self):
     src = test_util.make_data_file(
         os.path.join(self.get_temp_dir(), "src.txt"),
         ["1 2 3 4", "5 6 7 8 9", "3 2"])
     vocab = test_util.make_vocab(
         os.path.join(self.get_temp_dir(), "vocab.txt"),
         list(map(str, range(10))))
     config = {
         "data": {
             "train_features_file": src,
             "vocabulary": vocab,
         },
         "params": {
             "learning_rate": 0.0005,
             "optimizer": "Adam"
         },
         "train": {
             "batch_size": 10,
             "max_step": 2,
         },
     }
     model = models.LanguageModel(
         decoders.SelfAttentionDecoder(2, num_units=32, ffn_inner_dim=32),
         embedding_size=16,
         reuse_embedding=False)
     runner = Runner(model, config)
     runner.train()
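Because a language model consumes a single text stream, the data block uses one vocabulary key instead of separate source and target vocabularies. A hedged follow-up showing how the trained runner could be queried; the predictions path is hypothetical, assuming the Runner.infer signature of OpenNMT-tf 2.x:

    # Hypothetical follow-up call on the trained runner.
    runner.infer(src, predictions_file=os.path.join(
        self.get_temp_dir(), "predictions.txt"))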
Example #7
 def _makeTransliterationData(self):
     ar = [
         "آ ت ز م و ن",
         "آ ت ش ي س و ن",
         "آ ر ب ا ك ه",
         "آ ر ث ر",
         "آ ز ا",
     ]
     en = ["a t z m o n", "a c h e s o n", "a a r b a k k e", "a r t h u r", "a s a"]
     ar_file = test_util.make_data_file(
         os.path.join(self.get_temp_dir(), "ar.txt"), ar
     )
     en_file = test_util.make_data_file(
         os.path.join(self.get_temp_dir(), "en.txt"), en
     )
     return ar_file, en_file
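The two files form a line-aligned parallel corpus: Arabic characters on one side, space-separated romanizations on the other. An illustrative sanity check one could add:

    # Illustrative check: parallel files must have equal line counts.
    ar_file, en_file = self._makeTransliterationData()
    with open(ar_file) as ar_f, open(en_file) as en_f:
        assert sum(1 for _ in ar_f) == sum(1 for _ in en_f)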
Example #8
 def _make_model(name, src_vocab, tgt_vocab, random_slots=False):
   # Presumably nested in a test method: "self" below is captured
   # from the enclosing tf.test.TestCase instance.
   model, _ = _seq2seq_model(training=True)
   optimizer = tf.keras.optimizers.Adam()
   data = {}
   data["source_vocabulary"] = test_util.make_data_file(
       os.path.join(self.get_temp_dir(), "%s-src-vocab.txt" % name),
       src_vocab)
   data["target_vocabulary"] = test_util.make_data_file(
       os.path.join(self.get_temp_dir(), "%s-tgt-vocab.txt" % name),
       tgt_vocab)
   model.initialize(data)
   model.create_variables(optimizer=optimizer)
   if random_slots:
     for variable in model.trainable_variables:
       for slot_name in optimizer.get_slot_names():
         slot = optimizer.get_slot(variable, slot_name)
         slot.assign(tf.random.uniform(slot.shape))
   return model, optimizer
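Randomizing the optimizer slots (Adam's "m" and "v" accumulators) gives each model distinct optimizer state, which is useful when a test needs to compare or average checkpoints. A hypothetical call site, with placeholder vocabularies:

    # Hypothetical call from the enclosing test method.
    model_a, optimizer_a = _make_model(
        "a", ["a", "b", "c"], ["d", "e", "f"], random_slots=True)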
Example #9
    def testLoadSentencePieceVocab(self):
        vocab_path = test_util.make_data_file(
            os.path.join(self.get_temp_dir(), "vocab_sp"), [
                "<unk>	0", "<s>	0", "</s>	0", ",	-3.0326", ".	-3.41093",
                "▁the	-3.85169", "s	-4.05468", "▁die	-4.15914", "▁in	-4.2419",
                "▁der	-4.36135"
            ])

        vocab = Vocab(from_file=vocab_path, from_format="sentencepiece")
        self.assertEqual(len(vocab), 7)
        self.assertNotIn("<unk>", vocab)
        self.assertNotIn("<s>", vocab)
        self.assertNotIn("</s>", vocab)
        self.assertIn("▁the", vocab)
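Each line of a SentencePiece vocabulary is a token and its log probability separated by a tab; the loader skips <unk>, <s>, and </s>, which is why 7 of the 10 entries remain. A minimal parser for the file format, illustrative only and unrelated to the Vocab class:

    # Illustrative parser for "token<TAB>score" lines.
    def read_sentencepiece_tokens(path):
        with open(path) as f:
            return [line.rstrip("\n").split("\t")[0] for line in f]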
Example #10
def _create_dataset(model, temp_dir):
    data_path = os.path.join(temp_dir, "data.txt")
    test_util.make_data_file(data_path, ["a a a b b d", "a b b b", "c c"])
    dataset = model.examples_inputter.make_inference_dataset(data_path, 1)
    return dataset
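With a batch size of 1, iterating the returned dataset yields one features dict per input line. A usage sketch; the feature keys assume a word-level text inputter:

    # Usage sketch; "ids" and "length" are the usual text feature keys.
    dataset = _create_dataset(model, temp_dir)
    for features in dataset:
        print(features["length"], features["ids"])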
Example #11
 def _run_scorer(self, scorer, refs, hyps):
     ref_path = test_util.make_data_file(
         os.path.join(self.get_temp_dir(), "ref.txt"), refs)
     hyp_path = test_util.make_data_file(
         os.path.join(self.get_temp_dir(), "hyp.txt"), hyps)
     return scorer(ref_path, hyp_path)
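The scorer argument is any callable taking (ref_path, hyp_path). A hedged example invocation; the import path is an assumption about OpenNMT-tf's scorers module:

    # Assumed import path; the scorer is called as scorer(ref, hyp).
    from opennmt.utils.scorers import BLEUScorer

    score = self._run_scorer(BLEUScorer(), ["a b c d"], ["a b c d"])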