Example #1
 def from_params(self, params: Params) -> PytorchSeq2VecWrapper:
     if not params.pop('batch_first', True):
         raise ConfigurationError("Our encoder semantics assumes batch is always first!")
     if self._module_class in self.PYTORCH_MODELS:
         params['batch_first'] = True
     module = self._module_class(**params.as_dict())
     return PytorchSeq2VecWrapper(module)
Example #2
 def from_params(self, params: Params, **extras) -> PytorchSeq2VecWrapper:
     if not params.pop("batch_first", True):
         raise ConfigurationError(
             "Our encoder semantics assumes batch is always first!")
     if self._module_class in self.PYTORCH_MODELS:
         params["batch_first"] = True
     module = self._module_class(**params.as_dict(infer_type_and_cast=True))
     return PytorchSeq2VecWrapper(module)
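
Both from_params variants above just build a PyTorch RNN and hand it to the wrapper. A minimal standalone sketch of what the wrapper does with it (the sizes and tensors here are illustrative, not taken from the examples):

# Minimal sketch: wrap a batch-first LSTM so it returns one vector per
# sequence (the final hidden state of the top layer, per direction).
import torch
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper

encoder = PytorchSeq2VecWrapper(
    torch.nn.LSTM(input_size=16, hidden_size=32, batch_first=True))

inputs = torch.randn(4, 10, 16)   # (batch, timesteps, features)
mask = torch.ones(4, 10).bool()   # every position is a real token in this toy batch
vectors = encoder(inputs, mask)   # shape: (4, 32) == (batch, encoder.get_output_dim())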
Example #3
def train_on(dataset, params):
    print("Using hyperparameter configuration:", params)

    losses = []
    state_dicts = []
    kfold = StratifiedKFold(dataset, k=10, grouping=origin_of)

    for train, val in kfold:
        # TODO: Figure out how much of the following code we can put outside the loop

        vocab = Vocabulary.from_instances(dataset)
        # TODO: Figure out the best parameters here
        elmo = Elmo(cached_path(OPTIONS_FILE),
                    cached_path(WEIGHTS_FILE),
                    num_output_representations=2,
                    dropout=params["dropout"]
                    )  # TODO: Does dropout refer to the LSTM or ELMo?
        word_embeddings = ELMoTextFieldEmbedder({"tokens": elmo})
        # TODO: Figure out the best parameters here
        lstm = PytorchSeq2VecWrapper(
            torch.nn.LSTM(input_size=elmo.get_output_dim(),
                          hidden_size=64,
                          num_layers=params["num_layers"],
                          batch_first=True))

        model = RuseModel(word_embeddings, lstm, vocab)
        optimizer = optim.Adam(model.parameters())
        # TODO: What kind of iterator should be used?
        iterator = BucketIterator(batch_size=params["batch_size"],
                                  sorting_keys=[("mt_sent", "num_tokens"),
                                                ("ref_sent", "num_tokens")])
        iterator.index_with(vocab)

        # TODO: Figure out best hyperparameters
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          cuda_device=0,
                          train_dataset=train,
                          validation_dataset=val,
                          patience=5,
                          num_epochs=100)
        trainer.train()
        # TODO: Better way to access the validation loss?
        loss, _ = trainer._validation_loss()
        losses.append(loss)
        state_dicts.append(model.state_dict())

    mean_loss = np.mean(losses)
    print("Mean validation loss was:", mean_loss)

    return TrainResults(cv_loss=mean_loss, state_dicts=state_dicts)
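
On the TODO about `trainer._validation_loss()`: the classic AllenNLP `Trainer.train()` call already returns a metrics dictionary that includes the best validation loss when a validation dataset is supplied, so the private call can usually be avoided. A hedged sketch of that variant of the loop body:

        # Sketch of an alternative (assumes the classic AllenNLP Trainer, whose
        # train() returns a metrics dict when validation_dataset is provided):
        metrics = trainer.train()
        losses.append(metrics["best_validation_loss"])
        state_dicts.append(model.state_dict())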
Example #4
def main():
	# Initializing the embeddings (ELMo)
	elmo_token_indexer = ELMoTokenCharactersIndexer()

	reader = AnalogyDatasetReader(token_indexers={'tokens':elmo_token_indexer})

	train_dataset, test_dataset, dev_dataset = (reader.read(DATA_ROOT + "/" + fname) for fname in ["train_all.txt", "test_all.txt", "val_all.txt"])

	# elmo_embedder = Elmo(options_file, weight_file, 2, dropout=0.5)
	elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
	
	vocab = Vocabulary.from_instances(train_dataset + test_dataset + dev_dataset)
	word_embeddings = BasicTextFieldEmbedder({'tokens': elmo_embedder})
	# Initializing the model.
	# PytorchSeq2VecWrapper returns the LSTM's final hidden state (forward and
	# backward directions concatenated) as a single output vector.
	lstm_encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(elmo_embedding_dim, hidden_dim, batch_first=True, bidirectional=True))
	model = LstmModel(word_embeddings, lstm_encoder, vocab)

	if USE_GPU:
		model.cuda()

	# Training the model 
	optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
	iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
	iterator.index_with(vocab)

	trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=dev_dataset,
                  patience=10,
                  cuda_device=0 if USE_GPU else -1,
                  num_epochs=20)

	trainer.train()

	#Saving the model
	with open("model.th", 'wb') as f:
		torch.save(model.state_dict(), f)

	vocab.save_to_files("vocabulary")
Example #5
def _load_embedder(config, vocab, bert_max_length):
    embedders = {}
    for embedder_config in config.embedder.models:
        if embedder_config.name == 'elmo':
            embedders[embedder_config.name] = ElmoTokenEmbedder(
                options_file=os.path.join(config.data.pretrained_models_dir,
                                          'elmo/options.json'),
                weight_file=os.path.join(config.data.pretrained_models_dir,
                                         'elmo/model.hdf5'),
                requires_grad=embedder_config.params['requires_grad'],
                dropout=0.)
            embedders[embedder_config.name].eval()
        elif embedder_config.name.endswith('bert'):
            embedders[embedder_config.name] = PretrainedTransformerMismatchedEmbedder(
                model_name=os.path.join(config.data.pretrained_models_dir,
                                        embedder_config.name),
                max_length=bert_max_length,
                requires_grad=embedder_config.params['requires_grad'])
        elif embedder_config.name == 'char_bilstm':
            embedders[embedder_config.name] = TokenCharactersEncoder(
                embedding=Embedding(
                    num_embeddings=vocab.get_vocab_size('token_characters'),
                    embedding_dim=embedder_config.params['char_embedding_dim']
                ),
                encoder=PytorchSeq2VecWrapper(
                    torch.nn.LSTM(
                        embedder_config.params['char_embedding_dim'],
                        embedder_config.params['lstm_dim'],
                        num_layers=embedder_config.params['lstm_num_layers'],
                        dropout=embedder_config.params['lstm_dropout'],
                        bidirectional=True,
                        batch_first=True)),
                dropout=embedder_config.params['dropout'])
        else:
            assert False, 'Unknown embedder {}'.format(embedder_config.name)

    return BasicTextFieldEmbedder(embedders)
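
A sizing note on the char_bilstm branch above, as a standalone sketch (the dimensions are illustrative, not from the original config): a bidirectional wrapper reports 2 * lstm_dim, and BasicTextFieldEmbedder concatenates all of its embedders' outputs, so a downstream encoder's input size must be the sum of the individual get_output_dim() values.

# Illustrative check of the bidirectional output dimension:
import torch
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper

char_encoder = PytorchSeq2VecWrapper(
    torch.nn.LSTM(64, 128, num_layers=1, bidirectional=True, batch_first=True))
assert char_encoder.get_output_dim() == 256  # 2 * hidden size for a BiLSTM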
Example #6
    def _init_from_archive(self, pretrained_model: Model):
        """ Given a TopicRNN instance, take its weights. """
        self.text_field_embedder = pretrained_model.text_field_embedder
        self.vocab_size = pretrained_model.vocab_size
        self.text_encoder = pretrained_model.text_encoder

        # This function is only to be invoked when we need to classify.
        # To avoid manually dealing with padding, instantiate a Seq2Vec instead.
        self.text_to_vec = PytorchSeq2VecWrapper(
            self.text_encoder._modules['_module'])

        self.topic_dim = pretrained_model.topic_dim
        self.vocabulary_projection_layer = pretrained_model.vocabulary_projection_layer
        self.stopword_projection_layer = pretrained_model.stopword_projection_layer
        self.tokens_to_index = pretrained_model.tokens_to_index
        self.stop_indices = pretrained_model.stop_indices
        self.beta = pretrained_model.beta
        self.mu_linear = pretrained_model.mu_linear
        self.sigma_linear = pretrained_model.sigma_linear
        self.noise = pretrained_model.noise
        self.variational_autoencoder = pretrained_model.variational_autoencoder
        self.sentiment_classifier = pretrained_model.sentiment_classifier
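
An aside on the private access above (an observation about AllenNLP's wrappers, not part of the original model): the Pytorch Seq2Seq/Seq2Vec wrappers keep the wrapped RNN on the `_module` attribute, so `_modules['_module']` and `._module` reach the same object.

# Equivalent (still private) access, assuming text_encoder is one of
# AllenNLP's PytorchSeq2SeqWrapper encoders:
text_to_vec = PytorchSeq2VecWrapper(pretrained_model.text_encoder._module)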
Example #7
def predict():
	elmo_token_indexer = ELMoTokenCharactersIndexer()

	reader = AnalogyDatasetReader(token_indexers={'tokens':elmo_token_indexer})

	train_dataset, test_dataset, dev_dataset = (reader.read(DATA_ROOT + "/" + fname) for fname in ["train_all.txt", "test_all.txt", "val_all.txt"])

	# elmo_embedder = Elmo(options_file, weight_file, 2, dropout=0.5)
	elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
	
	word_embeddings = BasicTextFieldEmbedder({'tokens': elmo_embedder})
	lstm_encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(elmo_embedding_dim, hidden_dim, batch_first=True, bidirectional=True))

	vocab2 = Vocabulary.from_files("./vocabulary")
	model2 = LstmModel(word_embeddings, lstm_encoder, vocab2)

	if USE_GPU:
		model2.cuda()

	with open("./model.th", 'rb') as f:
		model2.load_state_dict(torch.load(f))
	
	predictor2 = SentenceClassifierPredictor(model2, dataset_reader=reader)
	with open('test.txt', 'w+') as f:
		top_10_words_list = []
		for analogy_test in test_dataset:
			logits = predictor2.predict_instance(analogy_test)['logits']
			label_id = np.argmax(logits)
			label_predict = model2.vocab.get_token_from_index(label_id, 'labels')

			top_10_ids = np.argsort(logits)[-10:]
			top_10_words = [model2.vocab.get_token_from_index(idx, 'labels') for idx in top_10_ids]
			top_10_words_list.append(top_10_words)
			f.write(label_predict + "\n")

	top_10_words_list = np.array(top_10_words_list)
	print(top_10_words_list.shape)
	np.save('elmo_top_10_words_list.npy', top_10_words_list)
Example #8

# In[1149]:

# embedded_parse_label = word_embedder(parse_label)

# In[1150]:

# embedded_parse_label.shape

# In[1151]:

seq2vec_encoder = PytorchSeq2VecWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
seq2seq_encoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

# In[1152]:

classifier_params = Params({
    "input_dim": HIDDEN_DIM * 2,
    "num_layers": 2,
    "hidden_dims": [50, 3],
    "activations": ["sigmoid", "linear"],
    "dropout": [0.2, 0.0]
})

# In[1153]:
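
The classifier_params block above matches the constructor arguments of AllenNLP's FeedForward module; a plausible consumer (hypothetical here, since the notebook fragment does not show it) would be:

# Hypothetical consumer of classifier_params: in classic AllenNLP versions,
# FeedForward.from_params accepts exactly these keys and builds a two-layer
# MLP (50 -> 3); input_dim of HIDDEN_DIM * 2 suggests two encoder outputs
# are concatenated before classification.
from allennlp.modules import FeedForward

classifier = FeedForward.from_params(classifier_params)
assert classifier.get_output_dim() == 3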
Example #9
def main():
	###############################################################################################
	prepare_global_logging(serialization_dir=args.serialization_dir, file_friendly_logging=False)
	#DATA
	reader = MathDatasetReader(source_tokenizer=CharacterTokenizer(),
	                        target_tokenizer=CharacterTokenizer(),
	                        source_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
	                        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
	                        target=False,
	                        label=True,
	                        lazy=False)
	# train_data = reader.read("../../datasets/math/label-data/train-all")
	# val_data = reader.read("../../datasets/math/label-data/interpolate")
	val_data = reader.read("./generate_files")


	vocab = Vocabulary()
	vocab.add_tokens_to_namespace([START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-', '.', '/',
	                                    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?',
	                                    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
	                                    'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b',
	                                    'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
	                                    'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}'], namespace='tokens')
	vocab.add_tokens_to_namespace(['algebra', 'arithmetic', 'calculus', 'comparison',
	  								 'measurement', 'numbers', 'polynomials', 'probability'], namespace='labels')



	# MODEL
	embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
	                             embedding_dim=EMBEDDING_DIM)
	source_embedder = BasicTextFieldEmbedder({"tokens": embedding})

	if args.model == 'lstm':
		encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, 
											num_layers=NUM_LAYERS, batch_first=True))
	elif args.model == 'cnn':
		encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM, num_filters=NUM_FILTERS, output_dim=HIDDEN_DIM)
	else:
		raise NotImplementedError("The classifier model should be LSTM or CNN")


	model = TextClassifier(vocab=vocab,
				source_text_embedder=source_embedder,
	            encoder=encoder,
	            )
	model.to(device)


	if not Path(args.serialization_dir).exists() or not Path(args.serialization_dir).is_dir():
		raise NotImplementedError("The serialization directory does not exist")
	with open(Path(args.serialization_dir) / "best.th", "rb") as model_path:
		model_state = torch.load(model_path, map_location=nn_util.device_mapping(-1))
		model.load_state_dict(model_state)
	model.eval()

	predictor = TextClassifierPredictor(model, dataset_reader=reader)

	# TEST
	correct = 0
	total = 0

	pbar = tqdm(val_data)
	batch_instance = list()
	batch_gt = list()

	for instance in pbar:
		batch_instance.append(instance)
		batch_gt.append(instance.fields["labels"].label)  # str
		if len(batch_instance) < BATCH_SIZE:
			continue
		outputs = predictor.predict(batch_instance)
		for i, output in enumerate(outputs):
			if batch_gt[i] == output['predict_labels']:
				correct += 1
			total += 1
		batch_instance = list()
		batch_gt = list()
		pbar.set_description("correct/total %.3f" % (correct / total))

	# Evaluate the final, partial batch as well.
	if batch_instance:
		outputs = predictor.predict(batch_instance)
		for i, output in enumerate(outputs):
			if batch_gt[i] == output['predict_labels']:
				correct += 1
			total += 1
		pbar.set_description("correct/total %.3f" % (correct / total))
Example #10
def main():
    ###############################################################################################
    prepare_global_logging(serialization_dir=args.serialization_dir,
                           file_friendly_logging=False)
    #DATA
    reader = MathDatasetReader(source_tokenizer=CharacterTokenizer(),
                               target_tokenizer=CharacterTokenizer(),
                               source_token_indexers={
                                   'tokens': SingleIdTokenIndexer(namespace='tokens')
                               },
                               target_token_indexers={
                                   'tokens': SingleIdTokenIndexer(namespace='tokens')
                               },
                               target=False,
                               label=True,
                               lazy=True)
    train_data = reader.read("../../datasets/math/label-data/train-all")
    # val_data = reader.read("../../datasets/math/label-data/interpolate")

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace([
        START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-',
        '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<',
        '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
        'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y',
        'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{',
        '}'
    ], namespace='tokens')
    vocab.add_tokens_to_namespace([
        'algebra', 'arithmetic', 'calculus', 'comparison', 'measurement',
        'numbers', 'polynomials', 'probability'
    ], namespace='labels')

    # MODEL
    embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                          embedding_dim=EMBEDDING_DIM)
    source_embedder = BasicTextFieldEmbedder({"tokens": embedding})

    if args.model == 'lstm':
        encoder = PytorchSeq2VecWrapper(
            torch.nn.LSTM(EMBEDDING_DIM,
                          HIDDEN_DIM,
                          num_layers=NUM_LAYERS,
                          batch_first=True))
    elif args.model == 'cnn':
        encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM,
                             num_filters=NUM_FILTERS,
                             output_dim=HIDDEN_DIM)
    else:
        raise NotImplementedError("The classifier model should be LSTM or CNN")

    model = TextClassifier(
        vocab=vocab,
        source_text_embedder=source_embedder,
        encoder=encoder,
    )
    model.to(device)

    optimizer = optim.Adam(model.parameters(),
                           lr=1e-3,
                           betas=(0.9, 0.995),
                           eps=1e-6)

    train_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                    max_instances_in_memory=1024,
                                    sorting_keys=[("source_tokens", "num_tokens")])
    train_iterator = MultiprocessIterator(train_iterator, num_workers=16)
    train_iterator.index_with(vocab)

    val_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                  max_instances_in_memory=1024,
                                  sorting_keys=[("source_tokens", "num_tokens")])
    val_iterator = MultiprocessIterator(val_iterator, num_workers=16)
    val_iterator.index_with(vocab)

    LR_SCHEDULER = {"type": "exponential", "gamma": 0.5, "last_epoch": -1}
    lr_scheduler = LearningRateScheduler.from_params(optimizer,
                                                     Params(LR_SCHEDULER))

    # TRAIN
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=train_iterator,
                      validation_iterator=None,
                      train_dataset=train_data,
                      validation_dataset=None,
                      patience=None,
                      shuffle=True,
                      num_epochs=1,
                      summary_interval=100,
                      learning_rate_scheduler=lr_scheduler,
                      cuda_device=CUDA_DEVICES,
                      grad_norm=5,
                      grad_clipping=5,
                      model_save_interval=600,
                      serialization_dir=args.serialization_dir,
                      keep_serialized_model_every_num_seconds=3600,
                      should_log_parameter_statistics=True,
                      should_log_learning_rate=True)
    trainer.train()