Example no. 1
    def test_from_params(self):
        # Save a vocab to check we can load it from_params.
        vocab_dir = self.TEST_DIR / "vocab_save"
        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_tokens_to_namespace(
            ["a0", "a1", "a2"], namespace="a"
        )  # non-padded, should start at 0
        vocab.add_tokens_to_namespace(["b2", "b3"], namespace="b")  # padded, should start at 2
        vocab.save_to_files(vocab_dir)

        params = Params({"type": "from_files", "directory": vocab_dir})
        vocab2 = Vocabulary.from_params(params)
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

        # Test case where we build a vocab from a dataset.
        vocab2 = Vocabulary.from_params(Params({}), instances=self.dataset)
        assert vocab2.get_index_to_token_vocabulary("tokens") == {
            0: "@@PADDING@@",
            1: "@@UNKNOWN@@",
            2: "a",
            3: "c",
            4: "b",
        }
        # Test from_params raises when we have neither a dataset nor a vocab_directory.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(Params({}))

        # Test from_params raises when there are any other dict keys
        # present apart from 'directory' and we aren't calling from_dataset.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(
                Params({"type": "from_files", "directory": vocab_dir, "min_count": {"tokens": 2}})
            )
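
For quick reference outside the test harness, a minimal standalone sketch of the same `from_params` construction path exercised above (assuming an allennlp version with the registered "from_files" constructor; the temporary directory is only illustrative):

import tempfile

from allennlp.common import Params
from allennlp.data import Vocabulary

# Save a tiny vocabulary, then reload it through the "from_files" constructor,
# mirroring the first half of the test above.
vocab_dir = tempfile.mkdtemp()
Vocabulary(non_padded_namespaces=["a"]).save_to_files(vocab_dir)
vocab = Vocabulary.from_params(Params({"type": "from_files", "directory": vocab_dir}))

# Alternatively, an empty Params plus `instances` (an iterable of allennlp
# Instance objects) builds the vocabulary from data; passing neither a
# directory nor instances raises ConfigurationError, as asserted above.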
Example no. 2
    def test_extend_from_vocab(self):
        vocab1 = Vocabulary(non_padded_namespaces={"1", "2"})
        vocab2 = Vocabulary(non_padded_namespaces={"3"})

        vocab1.add_tokens_to_namespace(["a", "b", "c"], namespace="1")
        vocab1.add_tokens_to_namespace(["d", "e", "f"], namespace="2")

        vocab2.add_tokens_to_namespace(["c", "d", "e"], namespace="1")
        vocab2.add_tokens_to_namespace(["g", "h", "i"], namespace="3")

        vocab1.extend_from_vocab(vocab2)
        assert vocab1.get_namespaces() == {"1", "2", "3"}
        assert vocab1._non_padded_namespaces == {"1", "2", "3"}
        assert vocab1.get_token_to_index_vocabulary("1") == {
            "a": 0,
            "b": 1,
            "c": 2,
            "@@PADDING@@": 3,
            "@@UNKNOWN@@": 4,
            "d": 5,
            "e": 6,
        }
        assert vocab1.get_token_to_index_vocabulary("2") == {
            "d": 0,
            "e": 1,
            "f": 2,
        }
        assert vocab1.get_token_to_index_vocabulary("3") == {
            "g": 0,
            "h": 1,
            "i": 2,
        }
Example no. 3
    def test_custom_padding_oov_tokens(self):
        vocab = Vocabulary(oov_token="[UNK]")
        assert vocab._oov_token == "[UNK]"
        assert vocab._padding_token == "@@PADDING@@"

        vocab = Vocabulary(padding_token="[PAD]")
        assert vocab._oov_token == "@@UNKNOWN@@"
        assert vocab._padding_token == "[PAD]"

        vocab_dir = self.TEST_DIR / "vocab_save"
        vocab = Vocabulary(oov_token="<UNK>")
        vocab.add_tokens_to_namespace(["a0", "a1", "a2"], namespace="a")
        vocab.save_to_files(vocab_dir)

        params = Params({
            "type": "from_files",
            "directory": vocab_dir,
            "oov_token": "<UNK>"
        })
        vocab = Vocabulary.from_params(params)

        with pytest.raises(AssertionError) as excinfo:
            vocab = Vocabulary.from_params(
                Params({
                    "type": "from_files",
                    "directory": vocab_dir
                }))

        assert "OOV token not found!" in str(excinfo.value)
Example no. 4
    def test_interpret_fails_when_embedding_layer_not_found(self):
        inputs = {"sentence": "It was the ending that I hated"}
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace([w for w in inputs["sentence"].split(" ")])
        model = FakeModelForTestingInterpret(vocab, max_tokens=len(inputs["sentence"].split(" ")))
        predictor = TextClassifierPredictor(model, TextClassificationJsonReader())

        interpreter = SmoothGradient(predictor)
        with raises(RuntimeError):
            interpreter.saliency_interpret_from_json(inputs)
Example no. 5
    def test_invalid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / "vocab_save"
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
        original_vocab.add_tokens_to_namespace(["a", "b"], namespace="tokens1")
        original_vocab.add_token_to_namespace("p", namespace="tokens2")
        original_vocab.save_to_files(vocab_dir)
        text_field1 = TextField([Token(t) for t in ["a", "c"]],
                                {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                                {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch(
            [Instance({
                "text1": text_field1,
                "text2": text_field2
            })])

        # Following 2 should give error: tokens1 is non-padded in original_vocab but not in instances
        params = Params({
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": [],
            "tokens_to_add": {
                "tokens1": ["a"],
                "tokens2": ["p"]
            },
        })
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances=instances)

        # Following 2 should not give error: overlapping namespaces have same padding setting
        params = Params({
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": ["tokens1"],
            "tokens_to_add": {
                "tokens1": ["a"],
                "tokens2": ["p"]
            },
        })
        Vocabulary.from_params(params, instances=instances)

        # Following 2 should give error: tokens2 is padded in instances but not in original_vocab
        params = Params({
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": ["tokens1", "tokens2"],
            "tokens_to_add": {
                "tokens1": ["a"],
                "tokens2": ["p"]
            },
        })
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances=instances)
Example no. 6
def create_vocab_decoder_net_and_criterion(decoder_input_dim,
                                           symbols=["A", "B"]):
    vocab = Vocabulary()
    vocab.add_tokens_to_namespace(symbols + [START_SYMBOL, END_SYMBOL])

    decoder_net = LstmCellDecoderNet(
        decoding_dim=decoder_input_dim,
        target_embedding_dim=decoder_input_dim,
    )

    loss_criterion = MaximumLikelihoodLossCriterion()

    return vocab, decoder_net, loss_criterion
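
For context, the decoder tests later in this collection pair this helper with a target Embedding whose embedding_dim equals the decoder input dim; a minimal sketch of that wiring (assuming the helper above is in scope and that Embedding is imported from allennlp.modules.token_embedders):

from allennlp.modules.token_embedders import Embedding

decoder_input_dim = 4
vocab, decoder_net, loss_criterion = create_vocab_decoder_net_and_criterion(decoder_input_dim)

# num_embeddings covers every token in the default namespace; embedding_dim
# must match the decoder's target_embedding_dim, or construction will fail.
embedder = Embedding(num_embeddings=vocab.get_vocab_size(),
                     embedding_dim=decoder_input_dim)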
Example no. 7
    def test_interpret_fails_when_embedding_layer_not_found(self):
        inputs = {"sentence": "I always write unit tests for my code."}
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(
            [w for w in inputs["sentence"].split(" ")])
        model = FakeModelForTestingInterpret(
            vocab, max_tokens=len(inputs["sentence"].split(" ")))
        predictor = TextClassifierPredictor(model,
                                            TextClassificationJsonReader())

        hotflipper = Hotflip(predictor)
        with raises(RuntimeError):
            hotflipper.initialize()
Example no. 8
def create_vocab_and_decoder_net(decoder_inout_dim):
    vocab = Vocabulary()
    vocab.add_tokens_to_namespace(["A", "B", START_SYMBOL, END_SYMBOL])

    decoder_net = StackedSelfAttentionDecoderNet(
        decoding_dim=decoder_inout_dim,
        target_embedding_dim=decoder_inout_dim,
        feedforward_hidden_dim=20,
        num_layers=2,
        num_attention_heads=4,
    )

    return vocab, decoder_net
Example no. 9
    def test_vanilla_text_to_instance(self):
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(
            [
                'This',
                'is',
                'not',
                'a',
                'difficult',
                'test'
            ],
            namespace='tokens'
        )
        reader = PrefixReader()
        instance = reader.text_to_instance(
            prefix_a='This is a',
            prefix_b='This is not a',
            suffix='difficult test'
        )
        instance.index_fields(vocab)
        tensor_dict = instance.as_tensor_dict(instance.get_padding_lengths())

        tokens_a = instance['tokens_a']
        self.assertListEqual(
            [t.text for t in tokens_a],
            ['This', 'is', 'a', 'difficult', 'test']
        )

        token_ids_a = tensor_dict['tokens_a']['tokens']['tokens']
        self.assertListEqual(token_ids_a.tolist(), [2, 3, 5, 6, 7])

        eval_mask_a = tensor_dict['eval_mask_a']
        self.assertListEqual(eval_mask_a.tolist(), [0, 0, 0, 1, 1])

        tokens_b = instance['tokens_b']
        self.assertListEqual(
            [t.text for t in tokens_b],
            ['This', 'is', 'not', 'a', 'difficult', 'test']
        )

        token_ids_b = tensor_dict['tokens_b']['tokens']['tokens']
        self.assertListEqual(token_ids_b.tolist(), [2, 3, 4, 5, 6, 7])

        eval_mask_b = tensor_dict['eval_mask_b']
        self.assertListEqual(eval_mask_b.tolist(), [0, 0, 0, 0, 1, 1])

        metadata = tensor_dict['metadata']
        self.assertListEqual(metadata['prefix_a'],  ['This', 'is', 'a'])
        self.assertListEqual(metadata['prefix_b'],  ['This', 'is', 'not', 'a'])
        self.assertListEqual(metadata['suffix'],  ['difficult', 'test'])
Example no. 10
    def test_interpret_works_with_custom_embedding_layer(self):
        inputs = {"sentence": "It was the ending that I hated"}
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace([w for w in inputs["sentence"].split(" ")])
        model = FakeModelForTestingInterpret(vocab, max_tokens=len(inputs["sentence"].split(" ")))
        predictor = FakePredictorForTestingInterpret(model, TextClassificationJsonReader())
        interpreter = SmoothGradient(predictor)

        interpretation = interpreter.saliency_interpret_from_json(inputs)

        assert interpretation is not None
        assert "instance_1" in interpretation
        assert "grad_input_1" in interpretation["instance_1"]
        grad_input_1 = interpretation["instance_1"]["grad_input_1"]
        assert len(grad_input_1) == 7  # 7 words in input
Example no. 11
    def test_saving_and_loading(self):

        vocab_dir = self.TEST_DIR / "vocab_save"

        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_tokens_to_namespace(
            ["a0", "a1", "a2"], namespace="a")  # non-padded, should start at 0
        vocab.add_tokens_to_namespace(
            ["b2", "b3"], namespace="b")  # padded, should start at 2

        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)

        assert vocab2._non_padded_namespaces == {"a", "c"}

        # Check namespace a.
        assert vocab2.get_vocab_size(namespace="a") == 3
        assert vocab2.get_token_from_index(0, namespace="a") == "a0"
        assert vocab2.get_token_from_index(1, namespace="a") == "a1"
        assert vocab2.get_token_from_index(2, namespace="a") == "a2"
        assert vocab2.get_token_index("a0", namespace="a") == 0
        assert vocab2.get_token_index("a1", namespace="a") == 1
        assert vocab2.get_token_index("a2", namespace="a") == 2

        # Check namespace b.
        assert vocab2.get_vocab_size(
            namespace="b") == 4  # (unk + padding + two tokens)
        assert vocab2.get_token_from_index(
            0, namespace="b") == vocab._padding_token
        assert vocab2.get_token_from_index(1,
                                           namespace="b") == vocab._oov_token
        assert vocab2.get_token_from_index(2, namespace="b") == "b2"
        assert vocab2.get_token_from_index(3, namespace="b") == "b3"
        assert vocab2.get_token_index(vocab._padding_token, namespace="b") == 0
        assert vocab2.get_token_index(vocab._oov_token, namespace="b") == 1
        assert vocab2.get_token_index("b2", namespace="b") == 2
        assert vocab2.get_token_index("b3", namespace="b") == 3

        # Check the dictionaries containing the reverse mapping are identical.
        assert vocab.get_index_to_token_vocabulary(
            "a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary(
            "b") == vocab2.get_index_to_token_vocabulary("b")
Example no. 12
    def test_interpret_works_with_custom_embedding_layer(self):
        inputs = {"sentence": "I always write unit tests for my code"}
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(
            [w for w in inputs["sentence"].split(" ")])
        model = FakeModelForTestingInterpret(
            vocab, max_tokens=len(inputs["sentence"].split(" ")))
        predictor = FakePredictorForTestingInterpret(
            model, TextClassificationJsonReader())

        hotflipper = Hotflip(predictor)
        hotflipper.initialize()
        attack = hotflipper.attack_from_json(inputs, "tokens", "grad_input_1")
        assert attack is not None
        assert "final" in attack
        assert "original" in attack
        assert "outputs" in attack
        assert len(attack["final"][0]) == len(
            attack["original"])  # hotflip replaces words without removing
Example no. 13
    def test_vocab_can_print(self):
        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_tokens_to_namespace(["a0", "a1", "a2"], namespace="a")
        vocab.add_tokens_to_namespace(["b2", "b3"], namespace="b")
        print(vocab)
Example no. 14
    def test_valid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / "vocab_save"
        # Test: padded/non-padded common namespaces are extended appropriately
        non_padded_namespaces_list = [[], ["tokens"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(
                non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_tokens_to_namespace(["d", "a", "b"],
                                                   namespace="tokens")
            text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                                   {"tokens": SingleIdTokenIndexer("tokens")})
            vocab_dir = self.TEST_DIR / "vocab_save"
            shutil.rmtree(vocab_dir, ignore_errors=True)
            original_vocab.save_to_files(vocab_dir)
            instances = Batch([Instance({"text": text_field})])
            params = Params({
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": non_padded_namespaces,
            })
            extended_vocab = Vocabulary.from_params(params,
                                                    instances=instances)

            extra_count = 2 if extended_vocab.is_padded("tokens") else 0
            assert extended_vocab.get_token_index("d",
                                                  "tokens") == 0 + extra_count
            assert extended_vocab.get_token_index("a",
                                                  "tokens") == 1 + extra_count
            assert extended_vocab.get_token_index("b",
                                                  "tokens") == 2 + extra_count

            assert extended_vocab.get_token_index(
                "c", "tokens")  # should be present
            assert extended_vocab.get_token_index(
                "e", "tokens")  # should be present

            assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

        # Test: padded/non-padded non-common namespaces are extended appropriately
        non_padded_namespaces_list = [[], ["tokens1"], ["tokens1", "tokens2"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(
                non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_token_to_namespace(
                "a", namespace="tokens1")  # index2
            text_field = TextField(
                [Token(t) for t in ["b"]],
                {"tokens2": SingleIdTokenIndexer("tokens2")})
            instances = Batch([Instance({"text": text_field})])
            vocab_dir = self.TEST_DIR / "vocab_save"
            shutil.rmtree(vocab_dir, ignore_errors=True)
            original_vocab.save_to_files(vocab_dir)

            params = Params({
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": non_padded_namespaces,
            })
            extended_vocab = Vocabulary.from_params(params,
                                                    instances=instances)

            # Should have two namespaces
            assert len(extended_vocab._token_to_index) == 2

            extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
            assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count

            extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
            assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count
Example no. 15
def train_lstm(train_dataset,
               batch_size,
               num_layers,
               use_elmo=False,
               epochs=15,
               bidirectional=True,
               learning_rate=3e-4,
               hidden_size=64,
               num_classes=2,
               use_gpu=False):
    """
    Trains an LSTM and its variants (vanilla, bidirectional, stacked BiLSTM) on train_dataset.
    Initialises word embeddings with pre-trained GloVe OR uses a pre-trained ELMo model to
    compute embeddings dynamically.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    batch_size: int
        number of Instances to process in a batch
    num_layers: int
        number of BiLSTM layers: 2 or higher for Stacked BiLSTMs
    use_elmo: bool
        use elmo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train on (default=15)
    bidirectional: bool
        True for a bidirectional LSTM
    learning_rate: float
        learning rate for Adam Optimizer
    hidden_size: int
        size of the hidden layer in the encoder
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(tokens=['fic', 'non'],
                                      namespace="labels")
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings()
    else:
        vocab = Vocabulary.from_instances(train_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)  # numericalize the data

    assert vocab.get_token_from_index(index=0, namespace='labels') == 'fic'
    assert vocab.get_token_from_index(index=1, namespace='labels') == 'non'
    print("\n\nThe ordering of labels is ['fic', 'non']\n\n")

    encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(
        nn.LSTM(word_embeddings.get_output_dim(),
                hidden_size,
                num_layers=num_layers,
                bidirectional=bidirectional,
                batch_first=True))

    classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(),
                                                    num_classes)
    model = models.Classifier(vocab=vocab,
                              word_embeddings=word_embeddings,
                              encoder=encoder,
                              classifier_feedforward=classifier_feedforward)

    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      cuda_device=0 if use_gpu else -1,
                      num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
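
A hedged usage sketch of train_lstm; the training data variable below is an illustrative placeholder (it would come from whatever DatasetReader the surrounding project uses), not part of the example above:

# Hypothetical call site for the helper defined above.
model, vocab, n_epochs = train_lstm(
    train_dataset=train_instances,  # placeholder: a list of allennlp Instances
    batch_size=32,
    num_layers=2,        # 2 or higher gives a stacked BiLSTM
    use_elmo=False,      # GloVe embeddings
    bidirectional=True,
    use_gpu=False,
)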
Example no. 16
def train_cnn(train_dataset,
              batch_size,
              num_filters,
              filter_sizes,
              use_elmo=False,
              epochs=15,
              learning_rate=3e-4,
              num_classes=2,
              use_gpu=False):
    """
    Trains a CNN on train_dataset. Initialises word embeddings with pre-trained GloVe OR uses a
    pre-trained ELMo model to compute embeddings dynamically.
    The CNN has one convolution layer for each ngram filter size.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    batch_size: int
        number of Instances to process in a batch
    num_filters: int
        output dim for each convolutional layer, which is the number of 'filters' learned by that layer
    filter_sizes: Tuple[int]
        specifies the number of convolutional layers and their sizes
    use_elmo: bool
        use ELMo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train on (default=15)
    learning_rate: float
        learning rate for Adam Optimizer
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(tokens=['fic', 'non'],
                                      namespace="labels")
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings()
    else:
        vocab = Vocabulary.from_instances(train_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)  # numericalize the data

    assert vocab.get_token_from_index(index=0, namespace='labels') == 'fic'
    assert vocab.get_token_from_index(index=1, namespace='labels') == 'non'
    print("\n\nThe ordering of labels is ['fic', 'non']\n\n")

    encoder: Seq2VecEncoder = CnnEncoder(
        embedding_dim=word_embeddings.get_output_dim(),
        num_filters=num_filters,
        ngram_filter_sizes=filter_sizes)

    classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(),
                                                    num_classes)
    model = models.Classifier(vocab=vocab,
                              word_embeddings=word_embeddings,
                              encoder=encoder,
                              classifier_feedforward=classifier_feedforward)

    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      cuda_device=0 if use_gpu else -1,
                      num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
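
Similarly, a sketch of calling train_cnn; filter_sizes takes one n-gram width per convolutional layer, and train_instances is again a placeholder rather than something defined in the example:

# Hypothetical call site, mirroring the LSTM sketch above.
model, vocab, n_epochs = train_cnn(
    train_dataset=train_instances,  # placeholder: a list of allennlp Instances
    batch_size=32,
    num_filters=100,         # feature maps learned per filter size
    filter_sizes=(2, 3, 4),  # bigram, trigram and 4-gram convolutions
    use_elmo=True,           # ELMo instead of GloVe
)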
Example no. 17
def main():
	###############################################################################################
	prepare_global_logging(serialization_dir=args.serialization_dir, file_friendly_logging=False)
	#DATA
	reader = MathDatasetReader(source_tokenizer=CharacterTokenizer(),
	                        target_tokenizer=CharacterTokenizer(),
	                        source_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
	                        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
	                        target=False,
	                        label=True,
	                        lazy=False)
	# train_data = reader.read("../../datasets/math/label-data/train-all")
	# val_data = reader.read("../../datasets/math/label-data/interpolate")
	val_data = reader.read("./generate_files")


	vocab = Vocabulary()
	vocab.add_tokens_to_namespace([START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-', '.', '/',
	                                    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?',
	                                    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
	                                    'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b',
	                                    'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
	                                    'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}'], namespace='tokens')
	vocab.add_tokens_to_namespace(['algebra', 'arithmetic', 'calculus', 'comparison',
	  								 'measurement', 'numbers', 'polynomials', 'probability'], namespace='labels')



	# MODEL
	embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
	                             embedding_dim=EMBEDDING_DIM)
	source_embedder = BasicTextFieldEmbedder({"tokens": embedding})

	if args.model == 'lstm':
		encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, 
											num_layers=NUM_LAYERS, batch_first=True))
	elif args.model == 'cnn':
		encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM, num_filters=NUM_FILTERS, output_dim=HIDDEN_DIM)
	else:
		raise NotImplementedError("The classifier model should be LSTM or CNN")


	model = TextClassifier(vocab=vocab,
				source_text_embedder=source_embedder,
	            encoder=encoder,
	            )
	model.to(device)


	if not Path(args.serialization_dir).exists() or not Path(args.serialization_dir).is_dir():
		raise FileNotFoundError("Model serialization directory does not exist")
	with open(Path(args.serialization_dir) / "best.th", "rb") as model_path:
		model_state = torch.load(model_path, map_location=nn_util.device_mapping(-1))
		model.load_state_dict(model_state)
	model.eval()

	predictor = TextClassifierPredictor(model, dataset_reader=reader)

	# TEST
	correct = 0
	total = 0

	pbar = tqdm(val_data)
	batch_instance = list()
	batch_gt = list()

	for idx, instance in enumerate(pbar):
		batch_instance.append(instance)
		batch_gt.append(instance.fields["labels"].label)  # gold label (str)
		if len(batch_instance) == BATCH_SIZE:
			# Score the full batch, then reset the buffers for the next one.
			outputs = predictor.predict(batch_instance)
			for i, output in enumerate(outputs):
				if batch_gt[i] == output['predict_labels']:
					correct += 1
				total += 1
			batch_instance = list()
			batch_gt = list()
			pbar.set_description("correct/total %.3f" % (correct / total))
Example no. 18
def main():
    ###############################################################################################
    prepare_global_logging(serialization_dir=args.serialization_dir,
                           file_friendly_logging=False)
    #DATA
    reader = MathDatasetReader(source_tokenizer=CharacterTokenizer(),
                               target_tokenizer=CharacterTokenizer(),
                               source_token_indexers={
                                   'tokens':
                                   SingleIdTokenIndexer(namespace='tokens')
                               },
                               target_token_indexers={
                                   'tokens':
                                   SingleIdTokenIndexer(namespace='tokens')
                               },
                               target=False,
                               label=True,
                               lazy=True)
    train_data = reader.read("../../datasets/math/label-data/train-all")
    # val_data = reader.read("../../datasets/math/label-data/interpolate")

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace([
        START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-',
        '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<',
        '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
        'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y',
        'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{',
        '}'
    ],
                                  namespace='tokens')
    vocab.add_tokens_to_namespace([
        'algebra', 'arithmetic', 'calculus', 'comparison', 'measurement',
        'numbers', 'polynomials', 'probability'
    ],
                                  namespace='labels')

    # MODEL
    embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                          embedding_dim=EMBEDDING_DIM)
    source_embedder = BasicTextFieldEmbedder({"tokens": embedding})

    if args.model == 'lstm':
        encoder = PytorchSeq2VecWrapper(
            torch.nn.LSTM(EMBEDDING_DIM,
                          HIDDEN_DIM,
                          num_layers=NUM_LAYERS,
                          batch_first=True))
    elif args.model == 'cnn':
        encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM,
                             num_filters=NUM_FILTERS,
                             output_dim=HIDDEN_DIM)
    else:
        raise NotImplementedError("The classifier model should be LSTM or CNN")

    model = TextClassifier(
        vocab=vocab,
        source_text_embedder=source_embedder,
        encoder=encoder,
    )
    model.to(device)

    optimizer = optim.Adam(model.parameters(),
                           lr=1e-3,
                           betas=(0.9, 0.995),
                           eps=1e-6)

    train_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                    max_instances_in_memory=1024,
                                    sorting_keys=[("source_tokens",
                                                   "num_tokens")])
    train_iterator = MultiprocessIterator(train_iterator, num_workers=16)
    train_iterator.index_with(vocab)

    val_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                  max_instances_in_memory=1024,
                                  sorting_keys=[("source_tokens", "num_tokens")
                                                ])
    val_iterator = MultiprocessIterator(val_iterator, num_workers=16)
    val_iterator.index_with(vocab)
    #pdb.set_trace()

    LR_SCHEDULER = {"type": "exponential", "gamma": 0.5, "last_epoch": -1}
    lr_scheduler = LearningRateScheduler.from_params(optimizer,
                                                     Params(LR_SCHEDULER))

    # TRAIN
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=train_iterator,
                      validation_iterator=None,
                      train_dataset=train_data,
                      validation_dataset=None,
                      patience=None,
                      shuffle=True,
                      num_epochs=1,
                      summary_interval=100,
                      learning_rate_scheduler=lr_scheduler,
                      cuda_device=CUDA_DEVICES,
                      grad_norm=5,
                      grad_clipping=5,
                      model_save_interval=600,
                      serialization_dir=args.serialization_dir,
                      keep_serialized_model_every_num_seconds=3600,
                      should_log_parameter_statistics=True,
                      should_log_learning_rate=True)
    trainer.train()
Example no. 19
    def test_rollin_rollout_decoder_init(self):
        decoder_input_dim = 4

        # Test you can build BaseRollinRolloutDecoder object.
        build_decoder(decoder_input_dim)

        # Test that init raises an error when the decoder input dim and the
        # embedding dim are not the same.
        with pytest.raises(ConfigurationError):
            vocab, _, _ = create_vocab_decoder_net_and_criterion(
                decoder_input_dim)
            embedder = Embedding(num_embeddings=vocab.get_vocab_size(),
                                 embedding_dim=decoder_input_dim + 1)
            build_decoder(decoder_input_dim, embedder)

        # Test that init raises an error when output embeddings are tied and the
        # projection layer's size does not match the (transposed) embedding layer.

        # The embedding dim should be the same as the decoder output dim
        # (i.e. the output projection layer's input dim).
        with pytest.raises(ConfigurationError):
            vocab = Vocabulary()
            vocab.add_tokens_to_namespace(["A", "B", START_SYMBOL, END_SYMBOL])

            decoder_net = LstmCellDecoderNet(
                decoding_dim=decoder_input_dim,
                target_embedding_dim=decoder_input_dim + 1,
            )

            loss_criterion = MaximumLikelihoodLossCriterion()

            embedder = Embedding(num_embeddings=vocab.get_vocab_size(),
                                 embedding_dim=decoder_input_dim)

            BaseRollinRolloutDecoder(vocab,
                                     10,
                                     decoder_net,
                                     embedder,
                                     loss_criterion,
                                     tie_output_embedding=True)

        # Test that the output projection layer's output dim matches the
        # vocab size (i.e. the embedder's input dim).
        with pytest.raises(ConfigurationError):
            vocab = Vocabulary()
            vocab.add_tokens_to_namespace(["A", "B", START_SYMBOL, END_SYMBOL])

            decoder_net = LstmCellDecoderNet(
                decoding_dim=decoder_input_dim,
                target_embedding_dim=decoder_input_dim,
            )

            loss_criterion = MaximumLikelihoodLossCriterion()

            embedder = Embedding(num_embeddings=vocab.get_vocab_size() + 1,
                                 embedding_dim=decoder_input_dim)

            BaseRollinRolloutDecoder(vocab,
                                     10,
                                     decoder_net,
                                     embedder,
                                     loss_criterion,
                                     tie_output_embedding=True)
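
For contrast with the failure cases above, a sketch of a configuration those checks accept, reusing the helpers and classes from these examples (build_decoder, Embedding and create_vocab_decoder_net_and_criterion as used in the test module):

decoder_input_dim = 4
vocab, decoder_net, loss_criterion = create_vocab_decoder_net_and_criterion(decoder_input_dim)

# Matching dims: embedding_dim equals the decoder input dim and num_embeddings
# equals the vocabulary size, so construction should not raise.
embedder = Embedding(num_embeddings=vocab.get_vocab_size(),
                     embedding_dim=decoder_input_dim)
build_decoder(decoder_input_dim, embedder)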