Example #1
  def _examples_to_batch(self, batch, max_n_turns):
    if self._support_batch_size == 0:
      return self._wrap_batch(
        textdata.Batch([self._dataset[i] for i in batch if i is not None], self._dataset),
        max_n_turns)

    mb = MetaBatch(max_n_turns=max_n_turns)
    for domain_batch in batch:
      support_ex, target_ex, predict_turn = domain_batch
      support_ex = [self._dataset[i] for i in support_ex]
      target_ex = [self._dataset[i] for i in target_ex]
      if predict_turn is not None:
        # cut off dialogue after predict_turn.
        # downstream, we'll just predict the last one.
        target_ex[0] = deepcopy(target_ex[0])

        turns = getattr(target_ex[0], ModelInput.SEQ)[:predict_turn + 1]
        setattr(target_ex[0], ModelInput.SEQ, turns)

        turns = getattr(target_ex[0], ModelOutput.TOK)[:predict_turn + 1]
        setattr(target_ex[0], ModelOutput.TOK, turns)

      mb.append((self._wrap_batch(textdata.Batch(support_ex, self._dataset), max_n_turns),
                 self._wrap_batch(textdata.Batch(target_ex, self._dataset), max_n_turns)))
    return mb
Example #2
    def example_to_batch(self, example, device=None, multiple=False):
        """
        Convert a single InstanceDataset example into a Batch object that can be directly 
        fed into the model. Useful for interactive testing of the model
        """
        if not multiple:
            example = [example]

        if device is None:
            return ttdata.Batch(example, self)
        else:
            return send_instance_to(ttdata.Batch(example, self), device)
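
A minimal usage sketch for such interactive testing; `dataset` is an illustrative name for an InstanceDataset instance, not taken from the original source:

# Hypothetical usage: `dataset` is an InstanceDataset; dataset[0] is one example.
import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
batch = dataset.example_to_batch(dataset[0], device=device)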
Example #3
def print_samples():
    model.eval()
    num_samples = 2
    random_indices = [
        random.randrange(len(valid_dataset)) for _ in range(num_samples)
    ]
    sample_data = [valid_dataset[i] for i in random_indices]
    sample_data.sort(key=lambda s: len(s.src), reverse=True)
    sample_batch = data.Batch(data=sample_data,
                              dataset=valid_dataset,
                              device=args.gpu,
                              train=False)
    generated_sample = generate(sample_batch)
    for i in range(num_samples):
        print(f'  - Sample #{i}')
        src_sentence = ' '.join(
            ids_to_words(ids=sample_batch.src[0][:, i].data,
                         vocab=src_field.vocab,
                         eos_id=src_field.vocab.stoi[src_field.pad_token],
                         remove_eos=True))
        tgt_sentence = ' '.join(
            ids_to_words(ids=sample_batch.tgt[0][:, i].data,
                         vocab=tgt_field.vocab,
                         eos_id=tgt_field.vocab.stoi[tgt_field.eos_token]))
        output_sentence = ' '.join(
            ids_to_words(ids=generated_sample[i],
                         vocab=tgt_field.vocab,
                         eos_id=tgt_field.vocab.stoi[tgt_field.eos_token]))
        print(f'    Source: {src_sentence}')
        print(f'    Target: {tgt_sentence}')
        print(f'    Output: {output_sentence}')
Example #4
    def train(self):
        # model might have changed so redefine `optimizer`
        optimizer = self.optimizer(self.model.parameters())

        for epoch in range(1, self.n_epochs + 1):
            print(f"Epoch {epoch}/{self.n_epochs}")

            running_loss = 0.0
            self.model.train()  # turn on training mode
            for step, batch in enumerate(self.train_iterator):
                # Forward/backward pass elided in the original template:
                # compute `predictions` and `loss` on `batch`, then step `optimizer`.
                predictions = ...
                loss = ...

                running_loss += loss.item()
                if step % 150 == 0:
                    acc = accuracy(predictions, batch.label)
                    print(f"Loss: {loss.item() / batch.batch_size}, Accuracy: {acc}")

            epoch_loss = running_loss / len(self.train_iterator.dataset)

            # Calculate the validation loss for this epoch
            self.model.eval()  # turn on evaluation mode
            full_batch = data.Batch(self.valid_data, self.valid_data)
            # Define the accuracy `valid_acc` and the loss `valid_loss` on `full_batch`
            predictions = ...
            valid_loss = ...
            valid_acc = ...

            print(
                f"Epoch: {epoch}, Training Loss: {epoch_loss:.4f}, Validation Loss: {valid_loss:.4f}, Validation accuracy: {valid_acc:.4f}"
            )
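
One hedged way to fill the elided validation step, as a minimal sketch: the helper below assumes the model consumes `full_batch.text` and that labels live in `full_batch.label`; those field names and the cross-entropy loss are assumptions, not the original exercise's solution.

import torch
import torch.nn.functional as F

def evaluate(model, full_batch):
    # Sketch: loss and accuracy on one torchtext Batch (field names assumed).
    with torch.no_grad():
        predictions = model(full_batch.text)
        loss = F.cross_entropy(predictions, full_batch.label).item()
        acc = (predictions.argmax(dim=1) == full_batch.label).float().mean().item()
    return loss, acc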
Example #5
def get_qid_batch(qid, qid_questions_dict, qid_answers_dict, qid_ext_feat_dict,
                  qid_label_dict):
    new_batch = data.Batch()
    question = qid_questions_dict[qid]
    answers = qid_answers_dict[qid]
    labels = qid_label_dict[qid]
    ext_feats = qid_ext_feat_dict[qid]

    size = len(qid_answers_dict[qid])
    new_batch.batch_size = size
    new_batch.dataset = "trecqa"

    max_len_a = max([ans.size()[0] for ans in answers])

    padding_answers = []

    for ans in answers:
        padding_answers.append(
            F.pad(ans, (0, max_len_a - ans.size()[0]), value=1))

    setattr(new_batch, "answer", torch.stack(padding_answers))
    setattr(new_batch, "question", torch.stack(question.repeat(size, 1)))
    setattr(new_batch, "ext_feat", torch.stack(ext_feats))
    setattr(new_batch, "label", torch.stack(labels))
    return new_batch
Example #6
def get_batch(question, answer, ext_feat, size):
    new_batch = data.Batch()
    new_batch.batch_size = size
    setattr(new_batch, "sentence_2", torch.stack(answer))
    setattr(new_batch, "sentence_1", torch.stack(question))
    setattr(new_batch, "ext_feats", torch.stack(ext_feat))
    return new_batch
Example #7
def get_batch(question, answer, ext_feat, size):
    new_batch = data.Batch()
    new_batch.batch_size = size
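    # NB: `batch` below is a free variable resolved in the enclosing scope of the original source.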
    new_batch.dataset = batch.dataset
    setattr(new_batch, "answer", torch.stack(answer))
    setattr(new_batch, "question", torch.stack(question))
    setattr(new_batch, "ext_feat", torch.stack(ext_feat))
    return new_batch
Example #8
    def test(self):
        with torch.no_grad():
            self.model.eval()  # turn on evaluation mode
            full_batch = data.Batch(self.test_data, self.test_data)
            # Define the accuracy `test_acc` and the loss `test_loss` on `full_batch`
            predictions = ...
            test_loss = ...
            test_acc = ...

            print(f"Test Loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}")
Example #9
def transform_query(query_text, tags_text, name_textfield_t, name_tagfield_t,
                    dataset):
    # 1. Create example from query text
    example = data.Example.fromlist([query_text, tags_text],
                                    [name_textfield_t, name_tagfield_t])
    examples = [example]

    # 2. Create batch using example and dataset
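    # Legacy torchtext device convention: None selects the current GPU and -1
    # the CPU; `use_cuda` is a free variable from the enclosing scope.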
    device_type = None if use_cuda else -1
    batch = data.Batch(data=examples, dataset=dataset, device=device_type)

    # 3. Return batch
    return batch
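
A hedged usage sketch; the field objects, their names, and `train_dataset`/`model` are illustrative assumptions, not from the original source:

# Hypothetical usage: TEXT and TAGS are trained fields with built vocabs.
batch = transform_query('how to sort a list', 'python sorting',
                        ('text', TEXT), ('tags', TAGS), train_dataset)
scores = model(batch.text)  # attribute name follows the field name above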
Example #10
    def input_processor_fn(inputs):
        if not isinstance(inputs, list):
            inputs = [inputs]

        examples = []
        for line in inputs:
            examples.append(data.Example.fromlist([line], fields))

        dataset = data.Dataset(examples, fields)
        # Entire input in one batch
        return data.Batch(
            data=dataset,
            dataset=dataset,
            device=torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu"))
Example #11
def __iter__(self):
    """This code is taken from torchtext.data.Iterator"""
    while True:
        self.init_epoch()
        for idx, minibatch in enumerate(self.batches):
            # fast-forward if loaded from state
            if self._iterations_this_epoch > idx:
                continue
            self.iterations += 1
            self._iterations_this_epoch += 1
            if self.sort_within_batch:
                if self.sort:
                    minibatch.reverse()
                else:
                    minibatch.sort(key=self.sort_key, reverse=True)
            created_batch = data.Batch(minibatch, self.dataset, self.device)
            created_batch.data_args = {}
            if cfg.augment_input_with_bert_src_vectors:  # internal flag, not set through configurations
                # Solely for efficiency purposes, although it's not a good idea
                # to combine model logic with the input reader!
                max_len = max(created_batch.src[1]).item()
                bert_input_sentences = [
                    self.dataset.src_tokenizer.tokenizer.
                    convert_tokens_to_ids(mb.src) +
                    [self.dataset.src_tokenizer.tokenizer.pad_token_id] *
                    (max_len - len(mb.src)) for mb in minibatch
                ]
                created_batch.data_args["bert_src"] = bert_input_sentences
            if cfg.augment_input_with_syntax_infusion_vectors:
                max_len = max(created_batch.src[1]).item()
                syntax_data = [
                    self.dataset.src_tokenizer.syntax_infused_container.
                    convert(self.dataset.src_tokenizer.detokenize(mb.src),
                            max_len) for mb in minibatch
                ]
                for tag in self.dataset.src_tokenizer.syntax_infused_container.features_list:
                    created_batch.data_args["si_" + tag] = [
                        s[tag] for s in syntax_data
                    ]
            yield created_batch
        if not self.repeat:
            return
Example #12
    def __init__(self, args, app_data=None):
        """Initialize the data loader, create a dataset and a batch.

        Parameters
        ----------
        args : Args
            An object with all arguments for BiMPM model.
        app_data : list, optional
            A Python list with the two query strings, mapped to the fields
            `q1` and `q2` (default is None).

        """
        super().__init__(args)

        self.fields = [('q1', self.TEXT), ('q2', self.TEXT)]

        self.example = [
            data.Example.fromlist(data=app_data, fields=self.fields)
        ]
        self.dataset = data.Dataset(self.example, self.fields)
        self.batch = data.Batch(self.example, self.dataset, device=args.device)
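
A hedged usage sketch; the class name `AppDataLoader` and the `model(...)` call are illustrative assumptions, not from the original source:

loader = AppDataLoader(args, app_data=['first query', 'second query'])
predictions = model(loader.batch)  # assumed: a trained BiMPM forward pass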
Example #13
def predict(self, features):
    dict_list = [{'text': feature, 'label': None} for feature in features]
    self.model.eval()
    test_fields_list = [('label', self.LABEL), ('text', self.TEXT)]
    test_fields_json = {
        'label': ('label', self.LABEL),
        'text': ('text', self.TEXT)
    }
    test_examples = [
        data.Example.fromdict(data=dict_data, fields=test_fields_json)
        for dict_data in dict_list
    ]
    test_dataset = data.Dataset(examples=test_examples,
                                fields=test_fields_list)
    data_batch = data.Batch(test_examples,
                            dataset=test_dataset,
                            device=self.args.gpu,
                            train=False)
    scores = self.model(data_batch.text)
    predicted_results = torch.max(scores,
                                  1)[1].view(data_batch.label.size()).data
    return list(predicted_results)
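
A hedged usage sketch (the `classifier` instance name is illustrative):

labels = classifier.predict(['great acting', 'terrible plot'])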
Example #14
def get_input_processor_words(inputs, type_model, vocab_word, vocab_char):
    if "one_sequence" in type_model:

        inputs_word = data.Field(init_token="<bos>",
                                 eos_token="<eos>",
                                 batch_first=True,
                                 include_lengths=True)

        inputs_char_nesting = data.Field(tokenize=list,
                                         init_token="<bos>",
                                         eos_token="<eos>",
                                         batch_first=True)

        inputs_char = data.NestedField(inputs_char_nesting,
                                       init_token="<bos>",
                                       eos_token="<eos>")

        inputs_word.vocab = vocab_word
        inputs_char.vocab = inputs_char_nesting.vocab = vocab_char
        fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))]

        if not isinstance(inputs, list):
            inputs = [inputs]

        examples = []

        for line in inputs:
            examples.append(data.Example.fromlist([line], fields))

        dataset = data.Dataset(examples, fields)
        batch = data.Batch(
            data=dataset,
            dataset=dataset,
            device=torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu"))

    else:
        inputs_word_query = data.Field(init_token="<bos>",
                                       eos_token="<eos>",
                                       batch_first=True,
                                       include_lengths=True)
        inputs_char_query_nesting = data.Field(tokenize=list,
                                               init_token="<bos>",
                                               eos_token="<eos>",
                                               batch_first=True)
        inputs_char_query = data.NestedField(inputs_char_query_nesting,
                                             init_token="<bos>",
                                             eos_token="<eos>")

        inputs_word_document = data.Field(init_token="<bos>",
                                          eos_token="<eos>",
                                          batch_first=True,
                                          include_lengths=True)
        inputs_char_document_nesting = data.Field(tokenize=list,
                                                  init_token="<bos>",
                                                  eos_token="<eos>",
                                                  batch_first=True)
        inputs_char_document = data.NestedField(inputs_char_document_nesting,
                                                init_token="<bos>",
                                                eos_token="<eos>")

        fields = ([(('inputs_word_query', 'inputs_char_query'),
                    (inputs_word_query, inputs_char_query)),
                   (('inputs_word_document', 'inputs_char_document'),
                    (inputs_word_document, inputs_char_document))])

        inputs_word_query.vocab = inputs_word_document.vocab = vocab_word
        inputs_char_query.vocab = inputs_char_query_nesting.vocab = \
            inputs_char_document_nesting.vocab = inputs_char_document.vocab = vocab_char

        # print(vocab_word.stoi)
        # print(vocab_char.stoi)

        if not isinstance(inputs, list):
            inputs = [inputs]

        examples = []

        for line in inputs:
            tuple_line = line.split("\t")
            example = data.Example.fromlist(tuple_line, fields)
            examples.append(example)

        dataset = data.Dataset(examples, fields)
        batch = data.Batch(
            data=dataset,
            dataset=dataset,
            device=torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu"))
    # Entire input in one batch
    return batch
Example #15
def get_input_processor_words(inputs, type_model, vocab_word, vocab_char=None):
    if type_model == "word_char_based":

        inputs_word = data.Field(init_token="<bos>",
                                 eos_token="<eos>",
                                 batch_first=True)

        inputs_char_nesting = data.Field(tokenize=list,
                                         init_token="<bos>",
                                         eos_token="<eos>",
                                         batch_first=True)

        inputs_char = data.NestedField(inputs_char_nesting,
                                       init_token="<bos>",
                                       eos_token="<eos>")

        inputs_word.vocab = vocab_word
        if vocab_char is not None:
            inputs_char.vocab = vocab_char
            fields = [(('inputs_word', 'inputs_char'), (inputs_word,
                                                        inputs_char))]
        else:
            fields = [('inputs_word', inputs_word)]

        if not isinstance(inputs, list):
            inputs = [inputs]

        examples = []

        for line in inputs:
            examples.append(data.Example.fromlist([line], fields))

        dataset = data.Dataset(examples, fields)
        batch = data.Batch(
            data=dataset,
            dataset=dataset,
            device=torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu"))

    else:
        tokenize_word = lambda x: x.split()
        inputs_word = data.Field(tokenize=tokenize_word,
                                 init_token="<bos>",
                                 eos_token="<eos>",
                                 batch_first=True)

        inputs_char = data.Field(tokenize=list,
                                 init_token="<bos>",
                                 eos_token="<eos>",
                                 batch_first=True)

        inputs_word.vocab = vocab_word
        if vocab_char is not None:
            inputs_char.vocab = vocab_char
            fields = [(('inputs_word', 'inputs_char'), (inputs_word,
                                                        inputs_char))]
        else:
            fields = [('inputs_word', inputs_word)]

        if not isinstance(inputs, list):
            inputs = [inputs]

        examples = []

        for line in inputs:
            examples.append(data.Example.fromlist([line], fields))

        dataset = data.Dataset(examples, fields)
        batch = data.Batch(
            data=dataset,
            dataset=dataset,
            device=torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu"))

    # Entire input in one batch
    return batch
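
A hedged usage sketch for the word/char variant; the vocabulary objects are assumed to come from training-time fields:

batch = get_input_processor_words('The quick brown fox', 'word_char_based',
                                  vocab_word, vocab_char)
# batch.inputs_word and batch.inputs_char (names set by `fields` above) feed the model.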