示例#1
0
 def load_data(self, prepared_data_file=None):
     """Load a prepared (torch-serialized) dataset into ``self.data``.

     Falls back to ``self.prepared_data_file`` when no path is supplied,
     builds one ``Dataset`` per split, and prints the split sizes.
     """
     prepared_data_file = prepared_data_file or self.prepared_data_file
     print(f"Loading prepared data from {prepared_data_file} ...")
     loaded = torch.load(prepared_data_file)
     self.data = {split: Dataset(loaded[split])
                  for split in ("train", "valid", "test")}
     counts = [f"{name.upper()}-{len(split)}" for name, split in self.data.items()]
     print("Number of examples:", " ".join(counts))
示例#2
0
 def transform(self, data_file, batch_size,
               data_type="test", shuffle=False, device=None):
     """
     Read raw text from ``data_file``, build a ``Dataset`` from it, and
     return a batched data loader over the examples.
     """
     raw = self.read_data(data_file, data_type=data_type)
     dataset = Dataset(self.build_examples(raw))
     return dataset.create_batches(batch_size, shuffle, device)
 def load_data(self, prepared_data_file=None):
     """
     Load a prepared (torch-serialized) data file and build the
     train/valid/test ``Dataset`` objects, then report their sizes.
     """
     prepared_data_file = prepared_data_file or self.prepared_data_file
     print("Loading prepared data from {} ...".format(prepared_data_file))
     loaded = torch.load(prepared_data_file)
     self.data = {}
     for split in ("train", "valid", "test"):
         self.data[split] = Dataset(loaded[split])
     summary = ["{}-{}".format(k.upper(), len(v)) for k, v in self.data.items()]
     print("Number of examples:", " ".join(summary))
示例#4
0
    def reload(self, data_type='test'):
        """Re-read the ``data_type`` split from disk and report split sizes."""
        data_file = os.path.join(self.data_dir,
                                 self.data_prefix + "." + data_type)
        # NOTE(review): read_data is always invoked with data_type="test"
        # even though the file path follows the requested split — presumably
        # intentional (reload regenerates test data); confirm with callers.
        raw = self.read_data(data_file, data_type="test")
        self.data[data_type] = Dataset(self.build_examples(raw))

        counts = (f"{name.upper()}-{len(split)}"
                  for name, split in self.data.items())
        print("Number of examples:", " ".join(counts))
    def reload(self, data_type='test'):
        """
        Rebuild ``self.data[data_type]`` from its raw data file.

        Training overwrites demo.test, so it does not need to be deleted
        before testing — reload regenerates it.  (Translated from the
        original Chinese note.)
        """
        data_file = os.path.join(self.data_dir,
                                 self.data_prefix + "." + data_type)
        raw = self.read_data(data_file, data_type="test")
        self.data[data_type] = Dataset(self.build_examples(raw))

        summary = ("{}-{}".format(name.upper(), len(split))
                   for name, split in self.data.items())
        print("Number of examples:", " ".join(summary))
示例#6
0
    def reload(self, data_type='test', data_file=None):
        """
        Rebuild ``self.data[data_type]`` from ``data_file`` when one is
        given, then print the number of examples in every split.
        """
        if data_file is not None:
            raw = self.read_data(data_file, data_type="test")
            self.data[data_type] = Dataset(self.build_examples(raw))

        counts = ("{}-{}".format(name.upper(), len(split))
                  for name, split in self.data.items())
        print("Number of examples:", " ".join(counts))
示例#7
0
    def gen_response(self, contexts):
        """
        Generate one response per input context.

        :param contexts: list of dicts; each dict describes one dialogue and
                         contains the keys:

                         - 'dialog': list of utterance strings (tokens in each
                           utterance separated by spaces)
                         - 'uid': list of int, indices into 'profile' for each
                           speaker
                         - 'profile': list of dicts, one personal profile per
                           speaker
                         - 'responder_profile': dict, the responder's profile

        :return: list, one response per context, each a list of tokens.

        e.g.
        contexts:
        [{ "dialog": [ ["How are you ?"], ["I am fine , thank you . And you ?"] ],
          "uid": [0, 1],
          "profile":[ { "loc":"Beijing", "gender":"male", "tag":"" },
                      { "loc":"Shanghai", "gender":"female", "tag":"" } ],
          "responder_profile":{ "loc":"Beijing", "gender":"male", "tag":"" }
        }]

        ==>  [['I', 'am', 'fine', 'too', '!']]
        """
        # NOTE(review): only contexts[0] is read here — presumably read_data
        # expands a single context into all test examples; confirm upstream.
        raw = self.read_data(contexts[0])
        examples = self.corpus.build_examples(raw, data_type='test')
        batches = Dataset(examples).create_batches(batch_size=1,
                                                   shuffle=False,
                                                   device=self.config.gpu)
        outputs = self.generator.generate(batch_iter=batches)
        return [out.preds[0].split(" ") for out in outputs]
示例#8
0
文件: corpus.py 项目: zwycodes/PEDNet
    def reload(self, data_type='test'):
        """
        Rebuild the ``data_type`` split from the two multitask test files
        (``<prefix>.test1`` and ``<prefix>.test2``), then report split sizes.
        """
        def _path(suffix):
            # Both test files share the same directory and prefix.
            return os.path.join(self.data_dir, self.data_prefix + "." + suffix)

        raw1, raw2 = self.read_data_multitask(_path('test1'),
                                              _path('test2'),
                                              data_type="test")
        paired = (self.build_examples(raw1), self.build_examples(raw2))
        self.data[data_type] = Dataset(paired)

        counts = ("{}-{}".format(name.upper(), len(split))
                  for name, split in self.data.items())
        print("Number of examples:", " ".join(counts))
示例#9
0
    def next_word_probability(self, context, partial_out):
        """
        Return probability distribution over next words given a partial true output.
        This is used to calculate the per-word perplexity.

        :param context: dict, contexts containing the dialogue history and personal
                        profile of each speaker
                        this dict contains following keys:

                        context['dialog']: a list of string, dialogue histories (tokens in each utterances
                                           are separated using spaces).
                        context['uid']: a list of int, indices to the profile of each speaker
                        context['profile']: a list of dict, personal profiles for each speaker
                        context['responder_profile']: dict, the personal profile of the responder

        :param partial_out: list, previous "true" words
        :return: a list, the first element is a dict, where each key is a word and each value is a probability
                         score for that word. Unset keys assume a probability of zero.
                         the second element is the probability for the EOS token

        e.g.
        context:
        { "dialog": [ ["How are you ?"], ["I am fine , thank you . And you ?"] ],
          "uid": [0, 1],
          "profile":[ { "loc":"Beijing", "gender":"male", "tag":"" },
                      { "loc":"Shanghai", "gender":"female", "tag":"" } ],
          "responder_profile":{ "loc":"Beijing", "gender":"male", "tag":"" }
        }

        partial_out:
        ['I', 'am']

        ==>  {'fine': 0.9}, 0.1
        """
        test_raw = self.read_data(context)
        test_data = self.corpus.build_examples(test_raw, data_type='test')
        dataset = Dataset(test_data)
        data_iter = dataset.create_batches(batch_size=1,
                                           shuffle=False,
                                           device=self.config.gpu)
        # Only a single context is encoded (batch_size=1), so take the first
        # batch from the iterator and stop.
        inputs = None
        for batch in data_iter:
            inputs = batch
            break

        # Map each previous gold token to its vocabulary id; out-of-vocabulary
        # tokens fall back to the '<unk>' id.
        partial_out_idx = [
            self.stoi[s] if s in self.stoi.keys() else self.stoi['<unk>']
            for s in partial_out
        ]

        # switch the model to evaluate mode
        self.model.eval()
        with torch.no_grad():
            enc_outputs, dec_init_state = self.model.encode(inputs)
            long_tensor_type = torch.cuda.LongTensor if self.config.use_gpu else torch.LongTensor

            # Initialize the input vector
            input_var = long_tensor_type([self.BOS] * 1)
            # Inflate the initial hidden states to be of size: (1, H)
            dec_state = dec_init_state.inflate(1)

            # Teacher-forced decoding: step the decoder once per gold-prefix
            # token so dec_state reflects the true partial output.
            for t in range(len(partial_out_idx)):
                # Run the RNN one step forward
                output, dec_state, attn = self.model.decode(
                    input_var, dec_state)
                input_var = long_tensor_type([partial_out_idx[t]])

            # One extra step yields the distribution over the NEXT word.
            output, dec_state, attn = self.model.decode(input_var, dec_state)
            log_softmax_output = output.squeeze(1)
        log_softmax_output = log_softmax_output.cpu().numpy()
        # Variable name suggests the decoder emits log-softmax scores, so
        # math.exp recovers probabilities — TODO confirm in model.decode.
        prob_output = [math.exp(i) for i in log_softmax_output[0]]

        # The first 4 tokens are: '<pad>' '<unk>' '<bos>' '<eos>'
        freq_dict = {}
        for i in range(4, len(self.itos)):
            freq_dict[self.itos[i]] = prob_output[i]
        # '<eos>' sits at vocabulary index 3 per the ordering above.
        eos_prob = prob_output[3]
        return freq_dict, eos_prob