def test_from_params(self):
    # Save a vocab to check we can load it from_params.
    vocab_dir = self.TEST_DIR / "vocab_save"
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_tokens_to_namespace(
        ["a0", "a1", "a2"], namespace="a"
    )  # non-padded, should start at 0
    vocab.add_tokens_to_namespace(["b2", "b3"], namespace="b")  # padded, should start at 2
    vocab.save_to_files(vocab_dir)

    params = Params({"type": "from_files", "directory": vocab_dir})
    vocab2 = Vocabulary.from_params(params)
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

    # Test the case where we build a vocab from a dataset.
    vocab2 = Vocabulary.from_params(Params({}), instances=self.dataset)
    assert vocab2.get_index_to_token_vocabulary("tokens") == {
        0: "@@PADDING@@",
        1: "@@UNKNOWN@@",
        2: "a",
        3: "c",
        4: "b",
    }

    # Test that from_params raises when we have neither a dataset nor a vocab directory.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({}))

    # Test that from_params raises when there are any other dict keys present
    # apart from "directory" and we aren't calling from_dataset.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(
            Params({"type": "from_files", "directory": vocab_dir, "min_count": {"tokens": 2}})
        )
def test_extend_from_vocab(self):
    vocab1 = Vocabulary(non_padded_namespaces={"1", "2"})
    vocab2 = Vocabulary(non_padded_namespaces={"3"})

    vocab1.add_tokens_to_namespace(["a", "b", "c"], namespace="1")
    vocab1.add_tokens_to_namespace(["d", "e", "f"], namespace="2")

    vocab2.add_tokens_to_namespace(["c", "d", "e"], namespace="1")
    vocab2.add_tokens_to_namespace(["g", "h", "i"], namespace="3")

    vocab1.extend_from_vocab(vocab2)
    assert vocab1.get_namespaces() == {"1", "2", "3"}
    assert vocab1._non_padded_namespaces == {"1", "2", "3"}
    assert vocab1.get_token_to_index_vocabulary("1") == {
        "a": 0,
        "b": 1,
        "c": 2,
        "@@PADDING@@": 3,
        "@@UNKNOWN@@": 4,
        "d": 5,
        "e": 6,
    }
    assert vocab1.get_token_to_index_vocabulary("2") == {
        "d": 0,
        "e": 1,
        "f": 2,
    }
    assert vocab1.get_token_to_index_vocabulary("3") == {
        "g": 0,
        "h": 1,
        "i": 2,
    }
def test_custom_padding_oov_tokens(self):
    vocab = Vocabulary(oov_token="[UNK]")
    assert vocab._oov_token == "[UNK]"
    assert vocab._padding_token == "@@PADDING@@"

    vocab = Vocabulary(padding_token="[PAD]")
    assert vocab._oov_token == "@@UNKNOWN@@"
    assert vocab._padding_token == "[PAD]"

    vocab_dir = self.TEST_DIR / "vocab_save"
    vocab = Vocabulary(oov_token="<UNK>")
    vocab.add_tokens_to_namespace(["a0", "a1", "a2"], namespace="a")
    vocab.save_to_files(vocab_dir)

    params = Params({"type": "from_files", "directory": vocab_dir, "oov_token": "<UNK>"})
    vocab = Vocabulary.from_params(params)

    with pytest.raises(AssertionError) as excinfo:
        vocab = Vocabulary.from_params(Params({"type": "from_files", "directory": vocab_dir}))
    assert "OOV token not found!" in str(excinfo.value)
def test_interpret_fails_when_embedding_layer_not_found(self):
    inputs = {"sentence": "It was the ending that I hated"}

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace([w for w in inputs["sentence"].split(" ")])
    model = FakeModelForTestingInterpret(vocab, max_tokens=len(inputs["sentence"].split(" ")))
    predictor = TextClassifierPredictor(model, TextClassificationJsonReader())

    interpreter = SmoothGradient(predictor)
    with raises(RuntimeError):
        interpreter.saliency_interpret_from_json(inputs)
def test_invalid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / "vocab_save"
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
    original_vocab.add_tokens_to_namespace(["a", "b"], namespace="tokens1")
    original_vocab.add_token_to_namespace("p", namespace="tokens2")
    original_vocab.save_to_files(vocab_dir)

    text_field1 = TextField(
        [Token(t) for t in ["a", "c"]], {"tokens1": SingleIdTokenIndexer("tokens1")}
    )
    text_field2 = TextField(
        [Token(t) for t in ["p", "q", "r"]], {"tokens2": SingleIdTokenIndexer("tokens2")}
    )
    instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

    # The following should raise an error: tokens1 is non-padded in original_vocab
    # but padded in the extension parameters.
    params = Params(
        {
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": [],
            "tokens_to_add": {"tokens1": ["a"], "tokens2": ["p"]},
        }
    )
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances=instances)

    # The following should not raise an error: overlapping namespaces have the
    # same padding setting.
    params = Params(
        {
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": ["tokens1"],
            "tokens_to_add": {"tokens1": ["a"], "tokens2": ["p"]},
        }
    )
    Vocabulary.from_params(params, instances=instances)

    # The following should raise an error: tokens2 is padded in original_vocab
    # but non-padded in the extension parameters.
    params = Params(
        {
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": ["tokens1", "tokens2"],
            "tokens_to_add": {"tokens1": ["a"], "tokens2": ["p"]},
        }
    )
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances=instances)
def create_vocab_decoder_net_and_criterion(decoder_input_dim, symbols=["A", "B"]):
    vocab = Vocabulary()
    vocab.add_tokens_to_namespace(symbols + [START_SYMBOL, END_SYMBOL])

    decoder_net = LstmCellDecoderNet(
        decoding_dim=decoder_input_dim,
        target_embedding_dim=decoder_input_dim,
    )
    loss_criterion = MaximumLikelihoodLossCriterion()

    return vocab, decoder_net, loss_criterion
def test_interpret_fails_when_embedding_layer_not_found(self):
    inputs = {"sentence": "I always write unit tests for my code."}

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace([w for w in inputs["sentence"].split(" ")])
    model = FakeModelForTestingInterpret(vocab, max_tokens=len(inputs["sentence"].split(" ")))
    predictor = TextClassifierPredictor(model, TextClassificationJsonReader())

    hotflipper = Hotflip(predictor)
    with raises(RuntimeError):
        hotflipper.initialize()
def create_vocab_and_decoder_net(decoder_input_dim):
    vocab = Vocabulary()
    vocab.add_tokens_to_namespace(["A", "B", START_SYMBOL, END_SYMBOL])

    decoder_net = StackedSelfAttentionDecoderNet(
        decoding_dim=decoder_input_dim,
        target_embedding_dim=decoder_input_dim,
        feedforward_hidden_dim=20,
        num_layers=2,
        num_attention_heads=4,
    )
    return vocab, decoder_net
def test_vanilla_text_to_instance(self):
    vocab = Vocabulary()
    vocab.add_tokens_to_namespace(
        ['This', 'is', 'not', 'a', 'difficult', 'test'],
        namespace='tokens'
    )
    reader = PrefixReader()
    instance = reader.text_to_instance(
        prefix_a='This is a',
        prefix_b='This is not a',
        suffix='difficult test'
    )
    instance.index_fields(vocab)
    tensor_dict = instance.as_tensor_dict(instance.get_padding_lengths())

    tokens_a = instance['tokens_a']
    self.assertListEqual(
        [t.text for t in tokens_a],
        ['This', 'is', 'a', 'difficult', 'test']
    )
    token_ids_a = tensor_dict['tokens_a']['tokens']['tokens']
    self.assertListEqual(token_ids_a.tolist(), [2, 3, 5, 6, 7])
    eval_mask_a = tensor_dict['eval_mask_a']
    self.assertListEqual(eval_mask_a.tolist(), [0, 0, 0, 1, 1])

    tokens_b = instance['tokens_b']
    self.assertListEqual(
        [t.text for t in tokens_b],
        ['This', 'is', 'not', 'a', 'difficult', 'test']
    )
    token_ids_b = tensor_dict['tokens_b']['tokens']['tokens']
    self.assertListEqual(token_ids_b.tolist(), [2, 3, 4, 5, 6, 7])
    eval_mask_b = tensor_dict['eval_mask_b']
    self.assertListEqual(eval_mask_b.tolist(), [0, 0, 0, 0, 1, 1])

    metadata = tensor_dict['metadata']
    self.assertListEqual(metadata['prefix_a'], ['This', 'is', 'a'])
    self.assertListEqual(metadata['prefix_b'], ['This', 'is', 'not', 'a'])
    self.assertListEqual(metadata['suffix'], ['difficult', 'test'])
def test_interpret_works_with_custom_embedding_layer(self):
    inputs = {"sentence": "It was the ending that I hated"}

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace([w for w in inputs["sentence"].split(" ")])
    model = FakeModelForTestingInterpret(vocab, max_tokens=len(inputs["sentence"].split(" ")))
    predictor = FakePredictorForTestingInterpret(model, TextClassificationJsonReader())
    interpreter = SmoothGradient(predictor)

    interpretation = interpreter.saliency_interpret_from_json(inputs)

    assert interpretation is not None
    assert "instance_1" in interpretation
    assert "grad_input_1" in interpretation["instance_1"]
    grad_input_1 = interpretation["instance_1"]["grad_input_1"]
    assert len(grad_input_1) == 7  # 7 words in input
def test_saving_and_loading(self):
    vocab_dir = self.TEST_DIR / "vocab_save"

    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_tokens_to_namespace(["a0", "a1", "a2"], namespace="a")  # non-padded, should start at 0
    vocab.add_tokens_to_namespace(["b2", "b3"], namespace="b")  # padded, should start at 2

    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    assert vocab2._non_padded_namespaces == {"a", "c"}

    # Check namespace a.
    assert vocab2.get_vocab_size(namespace="a") == 3
    assert vocab2.get_token_from_index(0, namespace="a") == "a0"
    assert vocab2.get_token_from_index(1, namespace="a") == "a1"
    assert vocab2.get_token_from_index(2, namespace="a") == "a2"
    assert vocab2.get_token_index("a0", namespace="a") == 0
    assert vocab2.get_token_index("a1", namespace="a") == 1
    assert vocab2.get_token_index("a2", namespace="a") == 2

    # Check namespace b.
    assert vocab2.get_vocab_size(namespace="b") == 4  # (unk + padding + two tokens)
    assert vocab2.get_token_from_index(0, namespace="b") == vocab._padding_token
    assert vocab2.get_token_from_index(1, namespace="b") == vocab._oov_token
    assert vocab2.get_token_from_index(2, namespace="b") == "b2"
    assert vocab2.get_token_from_index(3, namespace="b") == "b3"
    assert vocab2.get_token_index(vocab._padding_token, namespace="b") == 0
    assert vocab2.get_token_index(vocab._oov_token, namespace="b") == 1
    assert vocab2.get_token_index("b2", namespace="b") == 2
    assert vocab2.get_token_index("b3", namespace="b") == 3

    # Check the dictionaries containing the reverse mapping are identical.
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
def test_interpret_works_with_custom_embedding_layer(self):
    inputs = {"sentence": "I always write unit tests for my code"}

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace([w for w in inputs["sentence"].split(" ")])
    model = FakeModelForTestingInterpret(vocab, max_tokens=len(inputs["sentence"].split(" ")))
    predictor = FakePredictorForTestingInterpret(model, TextClassificationJsonReader())

    hotflipper = Hotflip(predictor)
    hotflipper.initialize()
    attack = hotflipper.attack_from_json(inputs, "tokens", "grad_input_1")
    assert attack is not None
    assert "final" in attack
    assert "original" in attack
    assert "outputs" in attack
    assert len(attack["final"][0]) == len(attack["original"])  # hotflip replaces words without removing them
def test_vocab_can_print(self):
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_tokens_to_namespace(["a0", "a1", "a2"], namespace="a")
    vocab.add_tokens_to_namespace(["b2", "b3"], namespace="b")
    print(vocab)
def test_valid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / "vocab_save"

    # Test that common namespaces, padded or non-padded, are extended appropriately.
    non_padded_namespaces_list = [[], ["tokens"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_tokens_to_namespace(["d", "a", "b"], namespace="tokens")
        text_field = TextField(
            [Token(t) for t in ["a", "d", "c", "e"]],
            {"tokens": SingleIdTokenIndexer("tokens")},
        )
        vocab_dir = self.TEST_DIR / "vocab_save"
        shutil.rmtree(vocab_dir, ignore_errors=True)
        original_vocab.save_to_files(vocab_dir)

        instances = Batch([Instance({"text": text_field})])
        params = Params(
            {
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": non_padded_namespaces,
            }
        )
        extended_vocab = Vocabulary.from_params(params, instances=instances)

        extra_count = 2 if extended_vocab.is_padded("tokens") else 0
        assert extended_vocab.get_token_index("d", "tokens") == 0 + extra_count
        assert extended_vocab.get_token_index("a", "tokens") == 1 + extra_count
        assert extended_vocab.get_token_index("b", "tokens") == 2 + extra_count

        assert extended_vocab.get_token_index("c", "tokens")  # should be present
        assert extended_vocab.get_token_index("e", "tokens")  # should be present

        assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

    # Test that non-common namespaces, padded or non-padded, are extended appropriately.
    non_padded_namespaces_list = [[], ["tokens1"], ["tokens1", "tokens2"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("a", namespace="tokens1")  # index 2

        text_field = TextField(
            [Token(t) for t in ["b"]], {"tokens2": SingleIdTokenIndexer("tokens2")}
        )
        instances = Batch([Instance({"text": text_field})])
        vocab_dir = self.TEST_DIR / "vocab_save"
        shutil.rmtree(vocab_dir, ignore_errors=True)
        original_vocab.save_to_files(vocab_dir)

        params = Params(
            {
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": non_padded_namespaces,
            }
        )
        extended_vocab = Vocabulary.from_params(params, instances=instances)

        # There should be two namespaces.
        assert len(extended_vocab._token_to_index) == 2

        extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
        assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count

        extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
        assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count
def train_lstm(train_dataset, batch_size, num_layers, use_elmo=False, epochs=15,
               bidirectional=True, learning_rate=3e-4, hidden_size=64, num_classes=2,
               use_gpu=False):
    """
    Trains an LSTM and its variants (vanilla, bidirectional, stacked BiLSTM) on train_dataset.
    Initialises word embeddings with pre-trained GloVe OR uses a pre-trained ELMo model to
    compute embeddings dynamically.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for the training set
    batch_size: int
        number of Instances to process in a batch
    num_layers: int
        number of LSTM layers; 2 or higher for stacked BiLSTMs
    use_elmo: bool
        use ELMo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train for (default=15)
    bidirectional: bool
        True for a bidirectional LSTM
    learning_rate: float
        learning rate for the Adam optimizer
    hidden_size: int
        size of the hidden layer in the encoder
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained model, vocabulary, number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(tokens=['fic', 'non'], namespace="labels")
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings()
    else:
        vocab = Vocabulary.from_instances(train_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)  # numericalize the data

    assert vocab.get_token_from_index(index=0, namespace='labels') == 'fic'
    assert vocab.get_token_from_index(index=1, namespace='labels') == 'non'
    print("\n\nThe ordering of labels is ['fic', 'non']\n\n")

    encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(
        nn.LSTM(word_embeddings.get_output_dim(),
                hidden_size,
                num_layers=num_layers,
                bidirectional=bidirectional,
                batch_first=True))
    classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(), num_classes)

    model = models.Classifier(vocab=vocab,
                              word_embeddings=word_embeddings,
                              encoder=encoder,
                              classifier_feedforward=classifier_feedforward)
    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      cuda_device=0 if use_gpu else -1,
                      num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
def train_cnn(train_dataset, batch_size, num_filters, filter_sizes, use_elmo=False,
              epochs=15, learning_rate=3e-4, num_classes=2, use_gpu=False):
    """
    Trains a CNN on train_dataset. Initialises word embeddings with pre-trained GloVe OR
    uses a pre-trained ELMo model to compute embeddings dynamically. The CNN has one
    convolution layer for each ngram filter size.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for the training set
    batch_size: int
        number of Instances to process in a batch
    num_filters: int
        output dim for each convolutional layer, which is the number of "filters"
        learned by that layer
    filter_sizes: Tuple[int]
        specifies the number of convolutional layers and their sizes
    use_elmo: bool
        use ELMo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train for (default=15)
    learning_rate: float
        learning rate for the Adam optimizer
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained model, vocabulary, number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(tokens=['fic', 'non'], namespace="labels")
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings()
    else:
        vocab = Vocabulary.from_instances(train_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)  # numericalize the data

    assert vocab.get_token_from_index(index=0, namespace='labels') == 'fic'
    assert vocab.get_token_from_index(index=1, namespace='labels') == 'non'
    print("\n\nThe ordering of labels is ['fic', 'non']\n\n")

    encoder: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                         num_filters=num_filters,
                                         ngram_filter_sizes=filter_sizes)
    classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(), num_classes)

    model = models.Classifier(vocab=vocab,
                              word_embeddings=word_embeddings,
                              encoder=encoder,
                              classifier_feedforward=classifier_feedforward)
    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      cuda_device=0 if use_gpu else -1,
                      num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
def main():
    ###############################################################################################
    prepare_global_logging(serialization_dir=args.serialization_dir, file_friendly_logging=False)

    # DATA
    reader = MathDatasetReader(source_tokenizer=CharacterTokenizer(),
                               target_tokenizer=CharacterTokenizer(),
                               source_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
                               target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
                               target=False,
                               label=True,
                               lazy=False)
    # train_data = reader.read("../../datasets/math/label-data/train-all")
    # val_data = reader.read("../../datasets/math/label-data/interpolate")
    val_data = reader.read("./generate_files")

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace(
        [START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-', '.', '/',
         '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?',
         'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
         'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
         'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
         'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}'],
        namespace='tokens')
    vocab.add_tokens_to_namespace(
        ['algebra', 'arithmetic', 'calculus', 'comparison', 'measurement', 'numbers',
         'polynomials', 'probability'],
        namespace='labels')

    # MODEL
    embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                          embedding_dim=EMBEDDING_DIM)
    source_embedder = BasicTextFieldEmbedder({"tokens": embedding})

    if args.model == 'lstm':
        encoder = PytorchSeq2VecWrapper(
            torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, num_layers=NUM_LAYERS, batch_first=True))
    elif args.model == 'cnn':
        encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM, num_filters=NUM_FILTERS,
                             output_dim=HIDDEN_DIM)
    else:
        raise NotImplementedError("The classifier model should be LSTM or CNN")

    model = TextClassifier(vocab=vocab,
                           source_text_embedder=source_embedder,
                           encoder=encoder,
                           )
    model.to(device)

    if not Path(args.serialization_dir).exists() or not Path(args.serialization_dir).is_dir():
        raise NotImplementedError("The model seems not to exist")
    with open(Path(args.serialization_dir) / "best.th", "rb") as model_path:
        model_state = torch.load(model_path, map_location=nn_util.device_mapping(-1))
        model.load_state_dict(model_state)
    model.eval()
    predictor = TextClassifierPredictor(model, dataset_reader=reader)

    # TEST
    correct = 0
    total = 0

    pbar = tqdm(val_data)
    batch_instance = list()
    batch_gt = list()
    idx_last = 0
    for idx, instance in enumerate(pbar):
        if idx != (idx_last + BATCH_SIZE):
            batch_instance.append(instance)
            batch_gt.append(instance.fields["labels"].label)  # str
        else:
            idx_last = idx
            outputs = predictor.predict(batch_instance)
            for i, output in enumerate(outputs):
                if batch_gt[i] == output['predict_labels']:
                    correct += 1
                total += 1
            batch_instance = list()
            batch_gt = list()
            pbar.set_description("correct/total %.3f" % (correct / total))
def main():
    ###############################################################################################
    prepare_global_logging(serialization_dir=args.serialization_dir, file_friendly_logging=False)

    # DATA
    reader = MathDatasetReader(source_tokenizer=CharacterTokenizer(),
                               target_tokenizer=CharacterTokenizer(),
                               source_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
                               target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
                               target=False,
                               label=True,
                               lazy=True)
    train_data = reader.read("../../datasets/math/label-data/train-all")
    # val_data = reader.read("../../datasets/math/label-data/interpolate")

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace(
        [START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-', '.', '/',
         '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?',
         'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
         'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
         'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
         'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}'],
        namespace='tokens')
    vocab.add_tokens_to_namespace(
        ['algebra', 'arithmetic', 'calculus', 'comparison', 'measurement', 'numbers',
         'polynomials', 'probability'],
        namespace='labels')

    # MODEL
    embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                          embedding_dim=EMBEDDING_DIM)
    source_embedder = BasicTextFieldEmbedder({"tokens": embedding})

    if args.model == 'lstm':
        encoder = PytorchSeq2VecWrapper(
            torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, num_layers=NUM_LAYERS, batch_first=True))
    elif args.model == 'cnn':
        encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM, num_filters=NUM_FILTERS,
                             output_dim=HIDDEN_DIM)
    else:
        raise NotImplementedError("The classifier model should be LSTM or CNN")

    model = TextClassifier(vocab=vocab,
                           source_text_embedder=source_embedder,
                           encoder=encoder,
                           )
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.995), eps=1e-6)

    train_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                    max_instances_in_memory=1024,
                                    sorting_keys=[("source_tokens", "num_tokens")])
    train_iterator = MultiprocessIterator(train_iterator, num_workers=16)
    train_iterator.index_with(vocab)

    val_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                  max_instances_in_memory=1024,
                                  sorting_keys=[("source_tokens", "num_tokens")])
    val_iterator = MultiprocessIterator(val_iterator, num_workers=16)
    val_iterator.index_with(vocab)

    LR_SCHEDULER = {"type": "exponential", "gamma": 0.5, "last_epoch": -1}
    lr_scheduler = LearningRateScheduler.from_params(optimizer, Params(LR_SCHEDULER))

    # TRAIN
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=train_iterator,
                      validation_iterator=None,
                      train_dataset=train_data,
                      validation_dataset=None,
                      patience=None,
                      shuffle=True,
                      num_epochs=1,
                      summary_interval=100,
                      learning_rate_scheduler=lr_scheduler,
                      cuda_device=CUDA_DEVICES,
                      grad_norm=5,
                      grad_clipping=5,
                      model_save_interval=600,
                      serialization_dir=args.serialization_dir,
                      keep_serialized_model_every_num_seconds=3600,
                      should_log_parameter_statistics=True,
                      should_log_learning_rate=True)
    trainer.train()
def test_rollin_rollout_decoder_init(self):
    decoder_input_dim = 4

    # Test that you can build a BaseRollinRolloutDecoder object.
    build_decoder(decoder_input_dim)

    # Test that init raises an error when the decoder input and embedding dims
    # are not the same.
    with pytest.raises(ConfigurationError):
        vocab, _, _ = create_vocab_decoder_net_and_criterion(decoder_input_dim)
        embedder = Embedding(num_embeddings=vocab.get_vocab_size(),
                             embedding_dim=decoder_input_dim + 1)
        build_decoder(decoder_input_dim, embedder)

    # Test that init raises an error when the embeddings are tied and the output
    # projection layer's size is not the same as the embedding layer (transposed).
    # The embedding hidden dim should match the decoder output dim
    # (i.e. the output projection layer's input dim).
    with pytest.raises(ConfigurationError):
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(["A", "B", START_SYMBOL, END_SYMBOL])
        decoder_net = LstmCellDecoderNet(
            decoding_dim=decoder_input_dim,
            target_embedding_dim=decoder_input_dim + 1,
        )
        loss_criterion = MaximumLikelihoodLossCriterion()
        embedder = Embedding(num_embeddings=vocab.get_vocab_size(),
                             embedding_dim=decoder_input_dim)
        BaseRollinRolloutDecoder(vocab, 10, decoder_net, embedder, loss_criterion,
                                 tie_output_embedding=True)

    # Test that the output projection layer's output dim must match the vocab size
    # (i.e. the embedder's input dim).
    with pytest.raises(ConfigurationError):
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(["A", "B", START_SYMBOL, END_SYMBOL])
        decoder_net = LstmCellDecoderNet(
            decoding_dim=decoder_input_dim,
            target_embedding_dim=decoder_input_dim,
        )
        loss_criterion = MaximumLikelihoodLossCriterion()
        embedder = Embedding(num_embeddings=vocab.get_vocab_size() + 1,
                             embedding_dim=decoder_input_dim)
        BaseRollinRolloutDecoder(vocab, 10, decoder_net, embedder, loss_criterion,
                                 tie_output_embedding=True)