def _examples_to_batch(self, batch, max_n_turns):
    if self._support_batch_size == 0:
        return self._wrap_batch(
            textdata.Batch([self._dataset[i] for i in batch if i is not None],
                           self._dataset),
            max_n_turns)
    mb = MetaBatch(max_n_turns=max_n_turns)
    for domain_batch in batch:
        support_ex, target_ex, predict_turn = domain_batch
        support_ex = [self._dataset[i] for i in support_ex]
        target_ex = [self._dataset[i] for i in target_ex]
        if predict_turn is not None:
            # Cut off the dialogue after predict_turn;
            # downstream, only the last turn is predicted.
            target_ex[0] = deepcopy(target_ex[0])
            turns = getattr(target_ex[0], ModelInput.SEQ)[:predict_turn + 1]
            setattr(target_ex[0], ModelInput.SEQ, turns)
            turns = getattr(target_ex[0], ModelOutput.TOK)[:predict_turn + 1]
            setattr(target_ex[0], ModelOutput.TOK, turns)
        mb.append((self._wrap_batch(textdata.Batch(support_ex, self._dataset),
                                    max_n_turns),
                   self._wrap_batch(textdata.Batch(target_ex, self._dataset),
                                    max_n_turns)))
    return mb
def example_to_batch(self, example, device=None, multiple=False):
    """Convert a single InstanceDataset example into a Batch object that can
    be fed directly into the model. Useful for interactive testing of the
    model.
    """
    if not multiple:
        example = [example]
    if device is None:
        return ttdata.Batch(example, self)
    return send_instance_to(ttdata.Batch(example, self), device)
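# Usage sketch (not from the source): `dataset` and `model` are assumed
# names for the InstanceDataset instance and a trained model.
example = dataset[0]
batch = dataset.example_to_batch(example)   # wrap one example in a Batch
output = model(batch)

# Several examples at once, moved to the GPU in the same call.
batch = dataset.example_to_batch([dataset[0], dataset[1]],
                                 device="cuda:0", multiple=True)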
def print_samples():
    model.eval()
    num_samples = 2
    random_indices = [random.randrange(len(valid_dataset))
                      for _ in range(num_samples)]
    sample_data = [valid_dataset[i] for i in random_indices]
    # Sort by source length, longest first, as the batch expects.
    sample_data.sort(key=lambda s: len(s.src), reverse=True)
    sample_batch = data.Batch(data=sample_data, dataset=valid_dataset,
                              device=args.gpu, train=False)
    generated_sample = generate(sample_batch)
    for i in range(num_samples):
        print(f' - Sample #{i}')
        src_sentence = ' '.join(
            ids_to_words(ids=sample_batch.src[0][:, i].data,
                         vocab=src_field.vocab,
                         eos_id=src_field.vocab.stoi[src_field.pad_token],
                         remove_eos=True))
        tgt_sentence = ' '.join(
            ids_to_words(ids=sample_batch.tgt[0][:, i].data,
                         vocab=tgt_field.vocab,
                         eos_id=tgt_field.vocab.stoi[tgt_field.eos_token]))
        output_sentence = ' '.join(
            ids_to_words(ids=generated_sample[i],
                         vocab=tgt_field.vocab,
                         eos_id=tgt_field.vocab.stoi[tgt_field.eos_token]))
        print(f'   Source: {src_sentence}')
        print(f'   Target: {tgt_sentence}')
        print(f'   Output: {output_sentence}')
def train(self):
    # The model might have changed, so redefine `optimizer`.
    optimizer = self.optimizer(self.model.parameters())
    for epoch in range(1, self.n_epochs + 1):
        print(f"Epoch {epoch}/{self.n_epochs}")
        running_loss = 0.0
        self.model.train()  # turn on training mode
        for step, batch in enumerate(self.train_iterator):
            # Forward/backward pass; assumes the class defines
            # `self.criterion` and that batches expose `.text` and `.label`.
            optimizer.zero_grad()
            predictions = self.model(batch.text)
            loss = self.criterion(predictions, batch.label)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if step % 150 == 0:
                acc = accuracy(predictions, batch.label)
                print(f"Loss: {loss.item() / batch.batch_size}, Accuracy: {acc}")
        epoch_loss = running_loss / len(self.train_data)
        # Calculate the validation loss for this epoch.
        self.model.eval()  # turn on evaluation mode
        full_batch = data.Batch(self.valid_data, self.valid_data)
        # Define the accuracy `valid_acc` and the loss `valid_loss` on `full_batch`.
        predictions = ...
        valid_loss = ...
        valid_acc = ...
        print(f"Epoch: {epoch}, Training Loss: {epoch_loss:.4f}, "
              f"Validation Loss: {valid_loss:.4f}, "
              f"Validation accuracy: {valid_acc:.4f}")
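# A plausible sketch of the `accuracy` helper used above (an assumption,
# not from the source): fraction of argmax predictions matching the labels.
def accuracy(predictions, labels):
    return (predictions.argmax(dim=1) == labels).float().mean().item()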
import torch
import torch.nn.functional as F
from torchtext import data


def get_qid_batch(qid, qid_questions_dict, qid_answers_dict,
                  qid_ext_feat_dict, qid_label_dict):
    new_batch = data.Batch()
    question = qid_questions_dict[qid]
    answers = qid_answers_dict[qid]
    labels = qid_label_dict[qid]
    ext_feats = qid_ext_feat_dict[qid]
    size = len(answers)
    new_batch.batch_size = size
    new_batch.dataset = "trecqa"
    # Pad every answer to the length of the longest one (pad index 1).
    max_len_a = max(ans.size(0) for ans in answers)
    padding_answers = [F.pad(ans, (0, max_len_a - ans.size(0)), value=1)
                       for ans in answers]
    setattr(new_batch, "answer", torch.stack(padding_answers))
    # Repeat the single question once per candidate answer.
    setattr(new_batch, "question", question.repeat(size, 1))
    setattr(new_batch, "ext_feat", torch.stack(ext_feats))
    setattr(new_batch, "label", torch.stack(labels))
    return new_batch
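# Usage sketch with made-up inputs: one question, three candidate answers,
# all already numericalized as tensors of token ids.
qid = "Q1"
questions = {qid: torch.tensor([2, 3, 4])}
answers = {qid: [torch.tensor([5, 6]),
                 torch.tensor([7, 8, 9]),
                 torch.tensor([10])]}
ext_feats = {qid: [torch.zeros(4) for _ in range(3)]}
labels = {qid: [torch.tensor(1), torch.tensor(0), torch.tensor(0)]}

batch = get_qid_batch(qid, questions, answers, ext_feats, labels)
print(batch.answer.shape)    # torch.Size([3, 3]): padded to the longest answer
print(batch.question.shape)  # torch.Size([3, 3]): question repeated per answer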
def get_batch(question, answer, ext_feat, size):
    new_batch = data.Batch()
    new_batch.batch_size = size
    setattr(new_batch, "sentence_2", torch.stack(answer))
    setattr(new_batch, "sentence_1", torch.stack(question))
    setattr(new_batch, "ext_feats", torch.stack(ext_feat))
    return new_batch
def get_batch(question, answer, ext_feat, size, dataset):
    new_batch = data.Batch()
    new_batch.batch_size = size
    # The dataset the tensors were built from is passed in explicitly.
    new_batch.dataset = dataset
    setattr(new_batch, "answer", torch.stack(answer))
    setattr(new_batch, "question", torch.stack(question))
    setattr(new_batch, "ext_feat", torch.stack(ext_feat))
    return new_batch
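# Both `get_batch` variants rely on the same trick: create an empty
# `data.Batch()` (bypassing Dataset/Iterator entirely) and attach tensors
# as attributes. A minimal sketch of the pattern with made-up shapes:
import torch
from torchtext import data

questions = torch.randint(0, 100, (4, 10))  # 4 pre-padded sequences

batch = data.Batch()
batch.batch_size = 4
setattr(batch, "question", questions)
print(batch.question.shape)  # torch.Size([4, 10])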
def test(self):
    with torch.no_grad():
        self.model.eval()  # turn on evaluation mode
        full_batch = data.Batch(self.test_data, self.test_data)
        # Define the accuracy `test_acc` and the loss `test_loss` on `full_batch`.
        predictions = ...
        test_loss = ...
        test_acc = ...
        print(f"Test Loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}")
def transform_query(query_text, tags_text, name_textfield_t, name_tagfield_t,
                    dataset):
    # 1. Create an example from the query text.
    example = data.Example.fromlist([query_text, tags_text],
                                    [name_textfield_t, name_tagfield_t])
    examples = [example]
    # 2. Create a batch from the example and the dataset.
    device_type = None if use_cuda else -1
    batch = data.Batch(data=examples, dataset=dataset, device=device_type)
    # 3. Return the batch.
    return batch
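# Usage sketch (assumed names): TEXT and TAGS are torchtext Fields with
# built vocabs, `train_dataset` is the dataset they came from, and
# `use_cuda` is a module-level flag read by transform_query.
batch = transform_query("how to pad a tensor", "pytorch padding",
                        ('text', TEXT), ('tags', TAGS), train_dataset)
scores = model(batch.text)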
def input_processor_fn(inputs):
    if not isinstance(inputs, list):
        inputs = [inputs]
    examples = [data.Example.fromlist([line], fields) for line in inputs]
    dataset = data.Dataset(examples, fields)
    # Put the entire input into one batch.
    return data.Batch(
        data=dataset, dataset=dataset,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
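# Usage sketch: assumes a module-level `fields` list such as
# [('text', TEXT)] whose Field already has a vocab, plus a trained `model`.
batch = input_processor_fn(["the quick brown fox", "hello world"])
logits = model(batch.text)  # both sentences arrive in a single batch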
def __iter__(self):
    """This code is adapted from torchtext.data.Iterator."""
    while True:
        self.init_epoch()
        for idx, minibatch in enumerate(self.batches):
            # Fast-forward if loaded from state.
            if self._iterations_this_epoch > idx:
                continue
            self.iterations += 1
            self._iterations_this_epoch += 1
            if self.sort_within_batch:
                if self.sort:
                    minibatch.reverse()
                else:
                    minibatch.sort(key=self.sort_key, reverse=True)
            created_batch = data.Batch(minibatch, self.dataset, self.device)
            created_batch.data_args = {}
            if cfg.augment_input_with_bert_src_vectors:
                # Internal flag, not set through configuration. This exists
                # solely for efficiency, although mixing model logic into the
                # input reader is not ideal.
                max_len = max(created_batch.src[1]).item()
                tokenizer = self.dataset.src_tokenizer.tokenizer
                bert_input_sentences = [
                    tokenizer.convert_tokens_to_ids(mb.src)
                    + [tokenizer.pad_token_id] * (max_len - len(mb.src))
                    for mb in minibatch
                ]
                created_batch.data_args["bert_src"] = bert_input_sentences
            if cfg.augment_input_with_syntax_infusion_vectors:
                max_len = max(created_batch.src[1]).item()
                container = self.dataset.src_tokenizer.syntax_infused_container
                syntax_data = [
                    container.convert(
                        self.dataset.src_tokenizer.detokenize(mb.src), max_len)
                    for mb in minibatch
                ]
                for tag in container.features_list:
                    created_batch.data_args["si_" + tag] = [
                        s[tag] for s in syntax_data
                    ]
            yield created_batch
        if not self.repeat:
            return
def __init__(self, args, app_data=None):
    """Initialize the data loader, then create a dataset and a batch.

    Parameters
    ----------
    args : Args
        An object with all arguments for the BiMPM model.
    app_data : list, optional
        A Python list holding the two queries `q1` and `q2`
        (default is None).
    """
    super().__init__(args)
    self.fields = [('q1', self.TEXT), ('q2', self.TEXT)]
    self.example = [data.Example.fromlist(data=app_data, fields=self.fields)]
    self.dataset = data.Dataset(self.example, self.fields)
    self.batch = data.Batch(self.example, self.dataset, device=args.device)
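# Hypothetical instantiation; the loader's class name is assumed here.
loader = AppDataLoader(args, app_data=["How old are you?",
                                       "What is your age?"])
prediction = model(loader.batch)  # the batch exposes .q1 and .q2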
def predict(self, features):
    dict_list = [{'text': feature, 'label': None} for feature in features]
    self.model.eval()
    test_fields_list = [('label', self.LABEL), ('text', self.TEXT)]
    test_fields_json = {
        'label': ('label', self.LABEL),
        'text': ('text', self.TEXT),
    }
    test_examples = [
        data.Example.fromdict(data=dict_data, fields=test_fields_json)
        for dict_data in dict_list
    ]
    test_dataset = data.Dataset(examples=test_examples,
                                fields=test_fields_list)
    data_batch = data.Batch(test_examples, dataset=test_dataset,
                            device=self.args.gpu, train=False)
    scores = self.model(data_batch.text)
    predicted_results = (torch.max(scores, 1)[1]
                         .view(data_batch.label.size()).data)
    return list(predicted_results)
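# Usage sketch; `classifier` stands in for an instance of the surrounding
# class with a trained model and built TEXT/LABEL fields.
labels = classifier.predict(["great movie, loved it",
                             "terrible plot and acting"])
print(labels)  # e.g. [tensor(1), tensor(0)]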
def get_input_processor_words(inputs, type_model, vocab_word, vocab_char):
    if "one_sequence" in type_model:
        inputs_word = data.Field(init_token="<bos>", eos_token="<eos>",
                                 batch_first=True, include_lengths=True)
        inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>",
                                         eos_token="<eos>", batch_first=True)
        inputs_char = data.NestedField(inputs_char_nesting,
                                       init_token="<bos>", eos_token="<eos>")
        inputs_word.vocab = vocab_word
        inputs_char.vocab = inputs_char_nesting.vocab = vocab_char
        fields = [(('inputs_word', 'inputs_char'),
                   (inputs_word, inputs_char))]
        if not isinstance(inputs, list):
            inputs = [inputs]
        examples = [data.Example.fromlist([line], fields) for line in inputs]
        dataset = data.Dataset(examples, fields)
        batch = data.Batch(
            data=dataset, dataset=dataset,
            device=torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu"))
    else:
        inputs_word_query = data.Field(init_token="<bos>", eos_token="<eos>",
                                       batch_first=True, include_lengths=True)
        inputs_char_query_nesting = data.Field(
            tokenize=list, init_token="<bos>", eos_token="<eos>",
            batch_first=True)
        inputs_char_query = data.NestedField(
            inputs_char_query_nesting, init_token="<bos>", eos_token="<eos>")
        inputs_word_document = data.Field(
            init_token="<bos>", eos_token="<eos>", batch_first=True,
            include_lengths=True)
        inputs_char_document_nesting = data.Field(
            tokenize=list, init_token="<bos>", eos_token="<eos>",
            batch_first=True)
        inputs_char_document = data.NestedField(
            inputs_char_document_nesting, init_token="<bos>",
            eos_token="<eos>")
        fields = [(('inputs_word_query', 'inputs_char_query'),
                   (inputs_word_query, inputs_char_query)),
                  (('inputs_word_document', 'inputs_char_document'),
                   (inputs_word_document, inputs_char_document))]
        inputs_word_query.vocab = inputs_word_document.vocab = vocab_word
        inputs_char_query.vocab = inputs_char_query_nesting.vocab = \
            inputs_char_document_nesting.vocab = \
            inputs_char_document.vocab = vocab_char
        if not isinstance(inputs, list):
            inputs = [inputs]
        examples = []
        for line in inputs:
            # Query and document are separated by a tab.
            examples.append(data.Example.fromlist(line.split("\t"), fields))
        dataset = data.Dataset(examples, fields)
        batch = data.Batch(
            data=dataset, dataset=dataset,
            device=torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu"))
    # The entire input goes into one batch.
    return batch
def get_input_processor_words(inputs, type_model, vocab_word,
                              vocab_char=None):
    if type_model == "word_char_based":
        inputs_word = data.Field(init_token="<bos>", eos_token="<eos>",
                                 batch_first=True)
        inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>",
                                         eos_token="<eos>", batch_first=True)
        inputs_char = data.NestedField(inputs_char_nesting,
                                       init_token="<bos>", eos_token="<eos>")
        inputs_word.vocab = vocab_word
        if vocab_char is not None:
            inputs_char.vocab = inputs_char_nesting.vocab = vocab_char
            fields = [(('inputs_word', 'inputs_char'),
                       (inputs_word, inputs_char))]
        else:
            fields = [('inputs_word', inputs_word)]
    else:
        inputs_word = data.Field(tokenize=lambda x: x.split(),
                                 init_token="<bos>", eos_token="<eos>",
                                 batch_first=True)
        inputs_char = data.Field(tokenize=list, init_token="<bos>",
                                 eos_token="<eos>", batch_first=True)
        inputs_word.vocab = vocab_word
        if vocab_char is not None:
            inputs_char.vocab = vocab_char
            fields = [(('inputs_word', 'inputs_char'),
                       (inputs_word, inputs_char))]
        else:
            fields = [('inputs_word', inputs_word)]
    if not isinstance(inputs, list):
        inputs = [inputs]
    examples = [data.Example.fromlist([line], fields) for line in inputs]
    dataset = data.Dataset(examples, fields)
    # The entire input goes into one batch.
    return data.Batch(
        data=dataset, dataset=dataset,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
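# Usage sketch (assumed names): vocab_word/vocab_char were built during
# training; the returned batch can feed a tagger directly.
batch = get_input_processor_words("John lives in New York",
                                  "word_char_based", vocab_word, vocab_char)
word_ids = batch.inputs_word  # (1, seq_len) word ids
char_ids = batch.inputs_char  # (1, seq_len, max_word_len) character ids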