def test_get_dimensions_is_correct(self):
    lstm = LSTM(bidirectional=True, num_layers=3, input_size=2, hidden_size=7, batch_first=True)
    encoder = PytorchSeq2VecWrapper(lstm)
    assert encoder.get_output_dim() == 14
    assert encoder.get_input_dim() == 2

    lstm = LSTM(bidirectional=False, num_layers=3, input_size=2, hidden_size=7, batch_first=True)
    encoder = PytorchSeq2VecWrapper(lstm)
    assert encoder.get_output_dim() == 7
    assert encoder.get_input_dim() == 2
def __init__(self, vocab_size: int, embedding_size: int, hidden_size: int,
             label_size: int = 9, dropout: float = 0.2, user_feats_dim: int = 20):
    super().__init__()
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.label_size = label_size
    self.dropout = dropout
    self.user_feats_dim = user_feats_dim
    self.embedding = nn.Embedding(vocab_size, embedding_size)
    self.sentence_rnn = PytorchSeq2VecWrapper(
        nn.GRU(embedding_size, hidden_size, batch_first=True, bidirectional=True))
    self.review_rnn = PytorchSeq2VecWrapper(
        nn.GRU(hidden_size * 2, hidden_size, batch_first=True, bidirectional=True))
    self.product_rnn = nn.GRU(hidden_size * 2 + self.user_feats_dim, hidden_size,
                              batch_first=True, bidirectional=True)
    self.review_feedforward = nn.Sequential(
        nn.Linear(hidden_size * 2 + self.user_feats_dim, hidden_size // 2),
        nn.ELU(),
        nn.Dropout(p=dropout),
        nn.Linear(hidden_size // 2, 1))
    self.product_feedforward = nn.Sequential(
        nn.Linear(hidden_size * 2, hidden_size // 2),
        nn.ELU(),
        nn.Dropout(p=dropout),
        nn.Linear(hidden_size // 2, self.label_size))
    if self.user_feats_dim > 0:
        self.user_feats_weights = nn.Parameter(torch.ones(self.user_feats_dim), requires_grad=True)
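# A minimal, hedged shape sketch for the hierarchy above: each bidirectional GRU
# doubles hidden_size, which is why review_rnn consumes hidden_size * 2 and the
# feedforward heads take hidden_size * 2 (plus user_feats_dim for the review head).
# Sizes below are illustrative assumptions, not values from the original model.
import torch
import torch.nn as nn
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper

hidden_size = 64
sentence_rnn = PytorchSeq2VecWrapper(
    nn.GRU(100, hidden_size, batch_first=True, bidirectional=True))
sentence_vectors = sentence_rnn(torch.rand(3, 12, 100), torch.ones(3, 12).bool())
assert sentence_vectors.shape == (3, 2 * hidden_size)  # matches review_rnn's input size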
def test_forward_pulls_out_correct_tensor_with_sequence_lengths(self):
    lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
    encoder = PytorchSeq2VecWrapper(lstm)

    input_tensor = torch.rand([5, 7, 3])
    input_tensor[1, 6:, :] = 0
    input_tensor[2, 4:, :] = 0
    input_tensor[3, 2:, :] = 0
    input_tensor[4, 1:, :] = 0
    mask = torch.ones(5, 7).bool()
    mask[1, 6:] = False
    mask[2, 4:] = False
    mask[3, 2:] = False
    mask[4, 1:] = False

    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    packed_sequence = pack_padded_sequence(input_tensor, sequence_lengths.tolist(), batch_first=True)
    _, state = lstm(packed_sequence)
    # Transpose output state, extract the last forward and backward states and
    # reshape to be of dimension (batch_size, 2 * hidden_size).
    reshaped_state = state[0].transpose(0, 1)[:, -2:, :].contiguous()
    explicitly_concatenated_state = torch.cat(
        [reshaped_state[:, 0, :].squeeze(1), reshaped_state[:, 1, :].squeeze(1)], -1)

    encoder_output = encoder(input_tensor, mask)
    assert_almost_equal(encoder_output.data.numpy(), explicitly_concatenated_state.data.numpy())
def test_forward_works_even_with_empty_sequences(self):
    lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=11, batch_first=True)
    encoder = PytorchSeq2VecWrapper(lstm)

    tensor = torch.rand([5, 7, 3])
    tensor[1, 6:, :] = 0
    tensor[2, :, :] = 0
    tensor[3, 2:, :] = 0
    tensor[4, :, :] = 0
    mask = torch.ones(5, 7).bool()
    mask[1, 6:] = False
    mask[2, :] = False
    mask[3, 2:] = False
    mask[4, :] = False

    results = encoder(tensor, mask)

    for i in (0, 1, 3):
        assert not (results[i] == 0.0).data.all()
    for i in (2, 4):
        assert (results[i] == 0.0).data.all()
def get_encoder(input_dim, output_dim, encoder_type, args):
    if encoder_type == "bag":
        return BagOfEmbeddingsEncoder(input_dim)
    if encoder_type == "bilstm":
        return PytorchSeq2VecWrapper(
            AllenNLPSequential(
                torch.nn.ModuleList([get_encoder(input_dim, output_dim, "bilstm-unwrapped", args)]),
                input_dim,
                output_dim,
                bidirectional=True,
                residual_connection=args.residual_connection,
                dropout=args.dropout))
    if encoder_type == "bilstm-unwrapped":
        return torch.nn.LSTM(
            input_dim,
            output_dim,
            batch_first=True,
            bidirectional=True,
            dropout=args.dropout,
        )
    if encoder_type == "cnn":
        return CnnEncoder(embedding_dim=input_dim, num_filters=output_dim)
    if encoder_type == "cnn_highway":
        filter_size: int = output_dim // 4
        return CnnHighwayEncoder(
            embedding_dim=input_dim,
            filters=[(2, filter_size), (3, filter_size), (4, filter_size), (5, filter_size)],
            projection_dim=output_dim,
            num_highway=3,
            do_layer_norm=True,
        )
    raise RuntimeError(f"Unknown encoder type={encoder_type}")
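# A hedged usage sketch for get_encoder() above. The `args` object is assumed to
# be argparse-style, carrying the `residual_connection` and `dropout` attributes
# the factory reads; the dimensions are illustrative. The "cnn" branch is used
# here because it has no dependency on the custom AllenNLPSequential class.
from types import SimpleNamespace

args = SimpleNamespace(residual_connection=False, dropout=0.1)
encoder = get_encoder(input_dim=300, output_dim=128, encoder_type="cnn", args=args)
assert encoder.get_input_dim() == 300  # Seq2VecEncoders expose their dimensions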
def test_wrapper_raises_if_batch_first_is_false(self):
    with pytest.raises(ConfigurationError):
        lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7)
        _ = PytorchSeq2VecWrapper(lstm)
def __init__(self, e_dim, h_dim, num_layers=1, dropout=0.0, base_rnn=nn.LSTM,
             dropout_p=0.1, bidirectional=False, batch_first=True,
             memory_embed=None, use_memory=False, mem_size=None,
             mem_context_size=None, inv_temp=None, use_binary=False):
    super(HashedMemoryRNN, self).__init__()
    self.acc_slots = 10
    self.memory_embeddings = memory_embed
    self.e_dim = e_dim
    self.hidden_size = h_dim
    # self.hh = [Hash(self.memory_embeddings.get_output_dim(), self.mem_size)
    #            for _ in range(self.acc_slots)]
    self.memory = KeyValueMemory(use_memory=use_memory, emb_dim=self.e_dim,
                                 mem_size=mem_size, mem_context_size=mem_context_size,
                                 inv_temp=inv_temp, use_binary=use_binary)
    self.lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(self.memory.get_input_size(), h_dim, num_layers=num_layers,
                      dropout=dropout, bidirectional=bidirectional,
                      batch_first=batch_first))
    # An explicit dim avoids PyTorch's implicit-dim Softmax deprecation warning.
    self.softmax = torch.nn.Softmax(dim=-1)

    if USE_CUDA:
        self.lstm = self.lstm.cuda()
        self.memory = self.memory.cuda()
def test_forward_pulls_out_correct_tensor_with_sequence_lengths(self):
    lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
    encoder = PytorchSeq2VecWrapper(lstm)

    tensor = torch.rand([5, 7, 3])
    tensor[1, 6:, :] = 0
    tensor[2, 4:, :] = 0
    tensor[3, 2:, :] = 0
    tensor[4, 1:, :] = 0
    input_tensor = Variable(tensor)
    sequence_lengths = Variable(torch.LongTensor([7, 6, 4, 2, 1]))

    packed_sequence = pack_padded_sequence(input_tensor, list(sequence_lengths.data), batch_first=True)
    _, state = lstm(packed_sequence)
    # Transpose output state, extract the last forward and backward states and
    # reshape to be of dimension (batch_size, 2 * hidden_size).
    reshaped_state = state[0].transpose(0, 1)[:, -2:, :].contiguous()
    explicitly_concatenated_state = torch.cat(
        [reshaped_state[:, 0, :].squeeze(1), reshaped_state[:, 1, :].squeeze(1)], -1)

    encoder_output = encoder(input_tensor, sequence_lengths)
    assert_almost_equal(encoder_output.data.numpy(), explicitly_concatenated_state.data.numpy())
def test_forward_pulls_out_correct_tensor_without_sequence_lengths(self):
    lstm = LSTM(bidirectional=True, num_layers=3, input_size=2, hidden_size=7, batch_first=True)
    encoder = PytorchSeq2VecWrapper(lstm)

    input_tensor = Variable(torch.FloatTensor([[[.7, .8], [.1, 1.5]]]))
    lstm_output = lstm(input_tensor)
    encoder_output = encoder(input_tensor, None)
    assert_almost_equal(encoder_output.data.numpy(), lstm_output[0].data.numpy()[:, -1, :])
def test_forward_pulls_out_correct_tensor_with_unsorted_batches(self):
    lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
    encoder = PytorchSeq2VecWrapper(lstm)

    tensor = torch.rand([5, 7, 3])
    tensor[0, 3:, :] = 0
    tensor[1, 4:, :] = 0
    tensor[2, 2:, :] = 0
    tensor[3, 6:, :] = 0
    mask = torch.ones(5, 7)
    mask[0, 3:] = 0
    mask[1, 4:] = 0
    mask[2, 2:] = 0
    mask[3, 6:] = 0

    input_tensor = Variable(tensor)
    mask = Variable(mask)
    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    sorted_inputs, sorted_sequence_lengths, restoration_indices = sort_batch_by_length(
        input_tensor, sequence_lengths)
    packed_sequence = pack_padded_sequence(sorted_inputs,
                                           sorted_sequence_lengths.data.tolist(),
                                           batch_first=True)
    _, state = lstm(packed_sequence)
    # Transpose output state, extract the last forward and backward states and
    # reshape to be of dimension (batch_size, 2 * hidden_size).
    sorted_transposed_state = state[0].transpose(0, 1).index_select(0, restoration_indices)
    reshaped_state = sorted_transposed_state[:, -2:, :].contiguous()
    explicitly_concatenated_state = torch.cat(
        [reshaped_state[:, 0, :].squeeze(1), reshaped_state[:, 1, :].squeeze(1)], -1)

    encoder_output = encoder(input_tensor, mask)
    assert_almost_equal(encoder_output.data.numpy(), explicitly_concatenated_state.data.numpy())
def main():
    reader = TatoebaSentenceReader()
    train_set = reader.read('data/mt/sentences.top10langs.train.tsv')
    dev_set = reader.read('data/mt/sentences.top10langs.dev.tsv')

    vocab = Vocabulary.from_instances(train_set, min_count={'tokens': 3})
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    positive_label = vocab.get_token_index('eng', namespace='labels')
    model = LstmClassifier(word_embeddings, encoder, vocab, positive_label=positive_label)

    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_set,
                      validation_dataset=dev_set,
                      num_epochs=3)
    trainer.train()
def get_wrapped_encoder(encoder_list):
    return PytorchSeq2VecWrapper(
        AllenNLPSequential(torch.nn.ModuleList(encoder_list),
                           elmo_embedding_dim,
                           hidden_dim,
                           bidirectional=True,
                           residual_connection=residual_connection,
                           dropout=dropout))
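# Hedged usage sketch: get_wrapped_encoder() reads elmo_embedding_dim, hidden_dim,
# residual_connection, and dropout from its enclosing scope, so a caller would
# bind those first. All values below, and the single-layer encoder list, are
# illustrative assumptions.
elmo_embedding_dim = 1024
hidden_dim = 300
residual_connection = False
dropout = 0.2
encoder_list = [torch.nn.LSTM(elmo_embedding_dim, hidden_dim,
                              batch_first=True, bidirectional=True)]
seq2vec_encoder = get_wrapped_encoder(encoder_list)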
def test_wrapper_works_with_alternating_lstm(self):
    model = PytorchSeq2VecWrapper(
        StackedAlternatingLstm(input_size=4, hidden_size=5, num_layers=3))

    input_tensor = torch.randn(2, 3, 4)
    mask = torch.ones(2, 3).bool()
    output = model(input_tensor, mask)
    assert tuple(output.size()) == (2, 5)
def multitask_learning():
    # Load the dataset readers, save logging to a local file, and set up multitasking.
    log.getLogger().addHandler(log.FileHandler(directory + "/log.log"))

    lr = 0.00001
    batch_size = 2
    epochs = 10
    max_seq_len = 512
    max_span_width = 30

    # token_indexer = BertIndexer(pretrained_model="bert-base-uncased", max_pieces=max_seq_len, do_lowercase=True)
    token_indexer = PretrainedBertIndexer("bert-base-cased", do_lowercase=False)
    conll_reader = ConllCorefBertReader(max_span_width=max_span_width,
                                        token_indexers={"tokens": token_indexer})
    swag_reader = SWAGDatasetReader(tokenizer=token_indexer.wordpiece_tokenizer,
                                    lazy=True, token_indexers=token_indexer)
    EMBEDDING_DIM = 1024
    HIDDEN_DIM = 200
    conll_datasets, swag_datasets = load_datasets(conll_reader, swag_reader, directory)
    conll_vocab = Vocabulary()
    swag_vocab = Vocabulary()
    conll_iterator = BasicIterator(batch_size=batch_size)
    conll_iterator.index_with(conll_vocab)
    swag_iterator = BasicIterator(batch_size=batch_size)
    swag_iterator.index_with(swag_vocab)

    from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
    bert_embedder = PretrainedBertEmbedder(pretrained_model="bert-base-cased",
                                           top_layer_only=True, requires_grad=True)
    word_embedding = BasicTextFieldEmbedder({"tokens": bert_embedder}, allow_unmatched_keys=True)
    BERT_DIM = word_embedding.get_output_dim()

    seq2seq = PytorchSeq2SeqWrapper(torch.nn.LSTM(BERT_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True))
    seq2vec = PytorchSeq2VecWrapper(torch.nn.LSTM(BERT_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True))
    mention_feedforward = FeedForward(input_dim=2336, num_layers=2, hidden_dims=150,
                                      activations=torch.nn.ReLU())
    antecedent_feedforward = FeedForward(input_dim=7776, num_layers=2, hidden_dims=150,
                                         activations=torch.nn.ReLU())
    model1 = CoreferenceResolver(vocab=conll_vocab,
                                 text_field_embedder=word_embedding,
                                 context_layer=seq2seq,
                                 mention_feedforward=mention_feedforward,
                                 antecedent_feedforward=antecedent_feedforward,
                                 feature_size=768,
                                 max_span_width=max_span_width,
                                 spans_per_word=0.4,
                                 max_antecedents=250,
                                 lexical_dropout=0.2)
    model2 = SWAGExampleModel(vocab=swag_vocab, text_field_embedder=word_embedding,
                              phrase_encoder=seq2vec)
    optimizer1 = optim.Adam(model1.parameters(), lr=lr)
    optimizer2 = optim.Adam(model2.parameters(), lr=lr)

    swag_train_iterator = swag_iterator(swag_datasets[0], num_epochs=1, shuffle=True)
    conll_train_iterator = conll_iterator(conll_datasets[0], num_epochs=1, shuffle=True)
    swag_val_iterator = swag_iterator(swag_datasets[1], num_epochs=1, shuffle=True)
    conll_val_iterator = conll_iterator(conll_datasets[1], num_epochs=1, shuffle=True)

    task_infos = {"swag": {"model": model2, "optimizer": optimizer2, "loss": 0.0,
                           "iterator": swag_iterator, "train_data": swag_datasets[0],
                           "val_data": swag_datasets[1], "num_train": len(swag_datasets[0]),
                           "num_val": len(swag_datasets[1]), "lr": lr,
                           "score": {"accuracy": 0.0}},
                  "conll": {"model": model1, "iterator": conll_iterator, "loss": 0.0,
                            "val_data": conll_datasets[1], "train_data": conll_datasets[0],
                            "optimizer": optimizer1, "num_train": len(conll_datasets[0]),
                            "num_val": len(conll_datasets[1]), "lr": lr,
                            "score": {"coref_prediction": 0.0, "coref_recall": 0.0,
                                      "coref_f1": 0.0, "mention_recall": 0.0}}}
    USE_GPU = 1
    trainer = MultiTaskTrainer(
        task_infos=task_infos,
        num_epochs=epochs,
        serialization_dir=directory + "saved_models/multitask/"
    )
    metrics = trainer.train()
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = StanfordSentimentTreeBankDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})

    train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    # Use the 'Small' pre-trained model.
    options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                    '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json')
    weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                   '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification.
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)
    trainer.train()
def __init__(self, config, device, vocab_size, pad_idx=0):
    super().__init__()
    self.emb_dim = config.pop("embedding_dim")
    self.hidden_size = config.pop("hidden_size")
    self.d = numpy.sqrt(self.hidden_size)
    self.vocab_size = vocab_size
    self.pad_idx = pad_idx
    self.embedding = Embedding(self.vocab_size, self.emb_dim, padding_idx=self.pad_idx)
    self.state_embedder = PytorchSeq2SeqWrapper(
        LSTM(batch_first=True, input_size=self.emb_dim, hidden_size=self.hidden_size))
    self.state_recurrence = PytorchSeq2VecWrapper(
        GRU(batch_first=True, input_size=self.hidden_size, hidden_size=self.hidden_size))
    self.action_embedder = PytorchSeq2VecWrapper(
        GRU(batch_first=True, input_size=self.emb_dim, hidden_size=self.hidden_size))
    self.recipe_embedder = PytorchSeq2VecWrapper(
        LSTM(batch_first=True, input_size=self.emb_dim, hidden_size=self.hidden_size))
    self.state_to_hidden = Linear(self.hidden_size, self.hidden_size)
    self.state_to_hidden2 = Linear(self.hidden_size, self.hidden_size // 2)
    self.action_to_hidden = Linear(self.hidden_size, self.hidden_size)
    self.action_to_hidden2 = Linear(self.hidden_size, self.hidden_size // 2)
    self.elu = ELU()
    self.device = device
def init_gru(vocab, d_embedding, hidden_rnn_sz, rnn_num_layers, rnn_dropout, all_code_types,
             feedforward_num_layers, feedforward_hidden_dims, feedforward_activations,
             feedforward_dropout, leadlag, add_time, t_max, t_scale, use_timestamps, split_paths):
    """Construct and train GRU"""
    # Init feedforward params
    feedforward_hidden_dims = [feedforward_hidden_dims] * feedforward_num_layers
    feedforward_activations = [Activation.by_name(feedforward_activations)()] * feedforward_num_layers
    feedforward_dropout = [feedforward_dropout] * feedforward_num_layers

    # Needed for final layer
    feedforward_num_layers += 1
    feedforward_hidden_dims.append(1)
    feedforward_activations.append(Activation.by_name('linear')())
    feedforward_dropout.append(0)

    # Handle augmentations
    augmentations = []
    if add_time:
        augmentations.append('add_time')
    if leadlag:
        augmentations.append('leadlag')
    d_embedding_updated = update_dims(augmentations, d_embedding)
    i_augmentations = init_augmentations(augmentations, use_timestamps=use_timestamps,
                                         t_max=t_max, t_scale=t_scale)

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size(),
                                embedding_dim=d_embedding)

    # Embedder maps the input tokens to the appropriate embedding matrix
    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Encoder takes path of (N, L, C) and encodes into state vector
    # encoder = BagOfEmbeddingsEncoder(embedding_dim=d_embedding)
    encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(
        nn.GRU(d_embedding_updated, hidden_rnn_sz, num_layers=rnn_num_layers,
               batch_first=True, dropout=rnn_dropout))

    classifier_feedforward: FeedForward = FeedForward(
        input_dim=encoder.get_output_dim() * 3 if (all_code_types and split_paths)
        else encoder.get_output_dim(),
        num_layers=feedforward_num_layers,
        hidden_dims=feedforward_hidden_dims,
        activations=feedforward_activations,
        dropout=feedforward_dropout
    )

    model = BaseModel(
        vocab,
        word_embeddings,
        encoder,
        classifier_feedforward,
        augmentations=i_augmentations
    )
    return model
def main():
    reader = StanfordSentimentTreeBankDatasetReader()
    train_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt'
    dev_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt'

    sampler = BucketBatchSampler(batch_size=32, sorting_keys=["tokens"])
    train_data_loader = MultiProcessDataLoader(reader, train_path, batch_sampler=sampler)
    dev_data_loader = MultiProcessDataLoader(reader, dev_path, batch_sampler=sampler)

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(
        chain(train_data_loader.iter_instances(), dev_data_loader.iter_instances()),
        min_count={'tokens': 3})
    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification.
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    trainer = GradientDescentTrainer(model=model,
                                     optimizer=optimizer,
                                     data_loader=train_data_loader,
                                     validation_data_loader=dev_data_loader,
                                     patience=10,
                                     num_epochs=20,
                                     cuda_device=-1)
    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict('This is the best movie ever!')['logits']
    label_id = np.argmax(logits)
    print(model.vocab.get_token_from_index(label_id, 'labels'))
def gru_encoder(input_dim: int,
                output_dim: int,
                num_layers: int = 1,
                bidirectional: bool = False,
                dropout: float = 0.0) -> Seq2VecEncoder:
    """
    Our encoder is going to be a GRU. We have to wrap it for AllenNLP, though.
    """
    return PytorchSeq2VecWrapper(torch.nn.GRU(
        input_dim,
        output_dim,
        batch_first=True,
        num_layers=num_layers,
        bidirectional=bidirectional,
        dropout=dropout))
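# Usage sketch for gru_encoder(): wrap a two-layer bidirectional GRU and run it
# over a toy batch of embedded sequences. All shapes are illustrative assumptions.
encoder = gru_encoder(input_dim=100, output_dim=50, num_layers=2,
                      bidirectional=True, dropout=0.1)
embedded = torch.rand(4, 9, 100)  # (batch_size, sequence_length, input_dim)
mask = torch.ones(4, 9).bool()    # no padding in this toy batch
vector = encoder(embedded, mask)  # last hidden states of both directions, concatenated
assert vector.shape == (4, 100)   # 2 * output_dim because bidirectional=True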
def train_only_swag():
    # Load the dataset reader and save logging to a local file.
    log.getLogger().addHandler(log.FileHandler(directory + "/log.log"))

    lr = 0.00001
    batch_size = 2
    epochs = 100
    max_seq_len = 512
    max_span_width = 30

    # token_indexer = BertIndexer(pretrained_model="bert-base-uncased", max_pieces=max_seq_len, do_lowercase=True)
    token_indexer = PretrainedBertIndexer("bert-base-cased", do_lowercase=False)
    swag_reader = SWAGDatasetReader(tokenizer=token_indexer.wordpiece_tokenizer,
                                    lazy=True, token_indexers=token_indexer)
    EMBEDDING_DIM = 1024
    HIDDEN_DIM = 200
    swag_datasets = load_swag(swag_reader, directory)
    swag_vocab = Vocabulary()
    swag_iterator = BasicIterator(batch_size=batch_size)
    swag_iterator.index_with(swag_vocab)

    from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
    bert_embedder = PretrainedBertEmbedder(pretrained_model="bert-base-cased",
                                           top_layer_only=True, requires_grad=True)
    word_embedding = BasicTextFieldEmbedder({"tokens": bert_embedder}, allow_unmatched_keys=True)
    BERT_DIM = word_embedding.get_output_dim()

    seq2vec = PytorchSeq2VecWrapper(torch.nn.LSTM(BERT_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True))
    mention_feedforward = FeedForward(input_dim=2336, num_layers=2, hidden_dims=150,
                                      activations=torch.nn.ReLU())
    antecedent_feedforward = FeedForward(input_dim=7776, num_layers=2, hidden_dims=150,
                                         activations=torch.nn.ReLU())
    model = SWAGExampleModel(vocab=swag_vocab, text_field_embedder=word_embedding,
                             phrase_encoder=seq2vec)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    USE_GPU = 1
    val_iterator = swag_iterator(swag_datasets[1], num_epochs=1, shuffle=True)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=swag_iterator,
        validation_iterator=swag_iterator,
        train_dataset=swag_datasets[0],
        validation_dataset=swag_datasets[1],
        validation_metric="+accuracy",
        cuda_device=0 if USE_GPU else -1,
        serialization_dir=directory + "saved_models/current_run_model_state_swag",
        num_epochs=epochs,
    )
    metrics = trainer.train()

    # Save the model.
    with open(directory + "saved_models/current_run_model_state", 'wb') as f:
        torch.save(model.state_dict(), f)
def build_model(options_file, weight_file):
    vocab = Vocabulary()
    iterator = BucketIterator(batch_size=config.batch_size,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(
        nn.LSTM(word_embeddings.get_output_dim(), config.hidden_size,
                bidirectional=True, batch_first=True))
    model = BaselineModel(word_embeddings, encoder, vocab)
    return model, iterator, vocab
def __init__(self, config: Config):
    super().__init__()
    for k, v in vars(config).items():
        setattr(self, k, v)
    self.bert = pt.BertModel.from_pretrained(self.pretrained_data_dir)
    self.embedding_dim = self.bert.config.to_dict()['hidden_size']
    self.dropout = nn.Dropout(self.dropout)
    self.sentence_rnn = PytorchSeq2VecWrapper(
        nn.GRU(self.embedding_dim, self.hidden_dim, batch_first=True, bidirectional=True))
    self.review_rnn = PytorchSeq2VecWrapper(
        nn.GRU(self.sentence_rnn.get_output_dim(), self.hidden_dim,
               batch_first=True, bidirectional=True))
    self.product_rnn = nn.GRU(self.hidden_dim * 2 + self.user_feats_dim, self.hidden_dim,
                              batch_first=True, bidirectional=True)
    self.review_feedforward = nn.Sequential(
        nn.Linear(self.hidden_dim * 2 + self.user_feats_dim, self.hidden_dim // 2),
        self.dropout,
        nn.ELU(),
        nn.Linear(self.hidden_dim // 2, 1))
    self.product_feedforward = nn.Sequential(
        nn.Linear(self.hidden_dim * 2, self.hidden_dim // 2),
        self.dropout,
        nn.ELU(),
        nn.Linear(self.hidden_dim // 2, self.output_dim))
    if self.user_feats_dim > 0:
        self.user_feats_weights = nn.Parameter(torch.ones(self.user_feats_dim))
def get_encoder(voc: Vocabulary, embed_out_dim: int, name: str = config.embedder):
    if name == 'bert':
        bert = BertSentencePooler(voc)
        bert.out_dim = embed_out_dim
        return bert
    else:
        return PytorchSeq2VecWrapper(
            module=nn.GRU(embed_out_dim,
                          dropout=config.dropout,
                          hidden_size=config.lstm_hid_size,
                          bidirectional=True,
                          batch_first=True))
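# Hedged usage sketch for get_encoder() above, assuming the `config` module it
# reads (config.embedder, config.dropout, config.lstm_hid_size) is importable.
# Any name other than 'bert' falls through to the bidirectional-GRU branch.
vocab = Vocabulary()
encoder = get_encoder(vocab, embed_out_dim=300, name='gru')
assert encoder.get_output_dim() == 2 * config.lstm_hid_size  # doubled by bidirectionality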
def main():
    cuda_device = -1
    torch.manual_seed(SEED)

    elmo_embedder = ElmoTokenEmbedder(OPTION_FILE, WEIGHT_FILE)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(word_embeddings.get_output_dim(), HIDDEN_DIM,
                      bidirectional=True, batch_first=True))

    train_dataset, dev_dataset = dataset_reader(train=True, elmo=True)
    vocab = Vocabulary()
    model = BaseModel(word_embeddings=word_embeddings, encoder=lstm, vocabulary=vocab)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)

    iterator = data_iterator(vocab)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE,
                                 weight_decay=WEIGHT_DECAY)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      cuda_device=cuda_device,
                      num_epochs=EPOCHS,
                      patience=5)
    trainer.train()

    print("*******Save Model*******\n")
    output_elmo_model_file = os.path.join(PRETRAINED_ELMO, "lstm_elmo_model.bin")
    torch.save(model.state_dict(), output_elmo_model_file)
def __init__(self, conf_a):
    super(Generator, self).__init__()
    self.cf_a = conf_a
    self.LSTM_mode = 0
    if self.LSTM_mode == 0:
        self.encLinear1 = nn.Linear(conf_a.Z_dim, conf_a.H_enc1)
    else:
        self.encLinear1 = PytorchSeq2VecWrapper(
            torch.nn.LSTM(conf_a.Z_dim, hidden_size=conf_a.H_enc1,
                          batch_first=True, bidirectional=False,
                          num_layers=1, dropout=0.0))

    # GENERATOR
    self.activation_func_enc1 = conf_a.activation_func_enc1
    self.hidden_to_signal = nn.Linear(conf_a.H_enc1, conf_a.D_in)

    ## Optimizer
    self.optimizer = pytut.get_optimizers(self, self.cf_a)
def __init__(self, indexer: DocumentIndexer, embedding_matrix: torch.Tensor, dims=None):
    super(SampleEncoder, self).__init__()
    if dims is None:
        dims = default_dims
    self.dims = dims
    words_emb_size = embedding_matrix.size(1)
    self.word_embedder = nn.Embedding.from_pretrained(embedding_matrix)
    self.word_dropout = nn.Dropout(dims['dropout_input'])
    self.char_embedder = nn.Embedding(len(indexer.char_vocab), dims['char_emb_size'])
    self.case_embedder = nn.Embedding(len(indexer.case_vocab), dims['case_emb_size'])
    self.pos_embedder = nn.Embedding(len(indexer.pos_vocab), dims['pos_emb_size'])
    self.ner_embedder = nn.Embedding(len(indexer.ner_vocab), dims['ner_emb_size'])
    self.char_encoder = PytorchSeq2VecWrapper(
        nn.LSTM(dims['char_emb_size'], dims['chars_hidden'], batch_first=True, bidirectional=True))
    total_emb_size = (words_emb_size + dims['case_emb_size'] + 2 * dims['chars_hidden']
                      + dims['pos_emb_size'] + dims['ner_emb_size'])
    self.encoder = PytorchSeq2SeqWrapper(
        nn.LSTM(total_emb_size, dims['hidden'], batch_first=True,
                bidirectional=True, num_layers=2))
    self.sent_dropout = nn.Dropout(dims['dropout_lstm'])
    self.feedforward = FeedForward(2 * dims['hidden'], 1, dims['feedforward'],
                                   activations=nn.Tanh())
    self.attention = nn.Linear(2 * dims['hidden'], dims['attention_dim'])
    self.scores = nn.Linear(dims['attention_dim'], 1)
    self.hidden2tag = nn.Linear(2 * dims['hidden'], len(indexer.relation_type_vocab))
    self.out_dropout = nn.Dropout(dims['dropout_lstm'])
def main():
    reader = StanfordSentimentTreeBankDatasetReader()
    train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification.
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, lstm, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)
    trainer.train()

    tokens = ['This', 'is', 'the', 'best', 'movie', 'ever', '!']
    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict(tokens)['logits']
    label_id = np.argmax(logits)
    print(model.vocab.get_token_from_index(label_id, 'labels'))
def main():
    reader = StanfordSentimentTreeBankDatasetReader()
    train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification.
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)
    trainer.train()
def __init__(self, args, out_sz: int, vocab: Vocabulary):
    super().__init__(vocab)
    # Prepare embeddings.
    token_embedding = Embedding(num_embeddings=args.max_vocab_size + 2,
                                embedding_dim=300, padding_index=0)
    self.word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"tokens": token_embedding})
    self.encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(
        nn.LSTM(self.word_embeddings.get_output_dim(),
                hidden_size=64, bidirectional=True, batch_first=True))
    self.projection = nn.Linear(self.encoder.get_output_dim(), out_sz)
    self.loss = nn.CrossEntropyLoss()
def main():
    reader = TatoebaSentenceReader()
    train_set = reader.read('data/tatoeba/sentences.top10langs.train.tsv')
    dev_set = reader.read('data/tatoeba/sentences.top10langs.dev.tsv')

    vocab = Vocabulary.from_instances(train_set, min_count={'tokens': 3})
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    positive_label = vocab.get_token_index('eng', namespace='labels')
    model = LstmClassifier(word_embeddings, encoder, vocab, positive_label=positive_label)

    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_set,
                      validation_dataset=dev_set,
                      num_epochs=10)
    trainer.train()

    classify('Take your raincoat in case it rains.', model)
    classify('Tu me recuerdas a mi padre.', model)
    classify('Wie organisierst du das Essen am Mittag?', model)
    classify("Il est des cas où cette règle ne s'applique pas.", model)
    classify('Estou fazendo um passeio em um parque.', model)
    classify('Ve, postmorgaŭ jam estas la limdato.', model)
    classify('Credevo che sarebbe venuto.', model)
    classify('Nem tudja, hogy én egy macska vagyok.', model)
    classify('Nella ur nli qrib acemma deg tenwalt.', model)
    classify('Kurşun kalemin yok, değil mi?', model)