示例#1
0
    def getNumpyBatch(self):
        """Build the next calibration batch as flat int32 NumPy arrays.

        Reads ``self.batch_size`` examples from ``self.data`` starting at
        ``self.current_index`` and converts each one with
        ``dp.convert_example_to_features``; only the first feature returned
        for an example is used.  The per-example ``input_ids``,
        ``segment_ids`` and ``input_mask`` sequences are joined end to end
        (concatenated, not stacked) into one array each.

        Returns:
            ``[input_ids, input_mask, segment_ids]`` (note the order), which
            is also stored on ``self.current_data``; or ``None`` when fewer
            than ``batch_size`` examples remain.  On success
            ``self.current_index`` is advanced by ``batch_size``.
        """
        if self.current_index + self.batch_size > self.num_inputs:
            print(
                "Calibrating index {:} batch size {:} exceed max input limit {:} sentences"
                .format(self.current_index, self.batch_size, self.num_inputs))
            return None

        # Floor division keeps the batch number an exact integer.
        current_batch = self.current_index // self.batch_size
        if current_batch % 10 == 0:
            print("Calibrating batch {:}, containing {:} sentences".format(
                current_batch, self.batch_size))

        input_ids = []
        segment_ids = []
        input_mask = []
        for offset in range(self.batch_size):
            example = self.data[self.current_index + offset]
            feature = dp.convert_example_to_features(
                example.doc_tokens, example.question_text, self.tokenizer,
                self.max_seq_length, self.doc_stride,
                self.max_query_length)[0]
            # Cast every example, not just the first one, so the batch dtype
            # is int32 regardless of batch size or of the feature's own type.
            input_ids.append(np.asarray(feature.input_ids, dtype=np.int32))
            segment_ids.append(
                np.asarray(feature.segment_ids, dtype=np.int32))
            input_mask.append(np.asarray(feature.input_mask, dtype=np.int32))

        self.current_index += self.batch_size
        # Join once at the end instead of re-copying the growing arrays on
        # every iteration.
        self.current_data = [
            np.concatenate(input_ids),
            np.concatenate(input_mask),
            np.concatenate(segment_ids),
        ]
        return self.current_data
 def question_features(tokens, question):
     """Extract features from the paragraph tokens and the question.

     Delegates to ``dp.convert_example_to_features``, passing the tokenizer,
     maximum sequence length, document stride and maximum query length read
     from ``self._model``.  ``self`` is not a parameter of this function; it
     is resolved from the surrounding scope.
     """
     model = self._model
     return dp.convert_example_to_features(
         tokens, question, model.tokenizer, model.max_seq_length,
         model.doc_stride, model.max_query_length)
示例#3
0
    def get_batch(self, names):
        """Copy the next calibration batch to the device buffers.

        Reads ``self.batch_size`` examples from ``self.data`` starting at
        ``self.current_index`` and converts each one with
        ``dp.convert_example_to_features``; only the first feature returned
        for an example is used.  The per-example sequences are concatenated
        end to end and copied with ``cuda.memcpy_htod`` into
        ``self.device_inputs``: slot 0 receives ``input_ids``, slot 1
        ``segment_ids`` and slot 2 ``input_mask``.

        Args:
            names: Not used by this implementation.

        Returns:
            ``self.device_inputs`` after the copies, or ``None`` when fewer
            than ``batch_size`` examples remain.  On success
            ``self.current_index`` is advanced by ``batch_size``.
        """
        if self.current_index + self.batch_size > self.num_inputs:
            print("Calibrating index {:} batch size {:} exceed max input limit {:} sentences".format(self.current_index, self.batch_size, self.num_inputs))
            return None

        # Floor division keeps the batch number an exact integer.
        current_batch = self.current_index // self.batch_size
        if current_batch % 10 == 0:
            print("Calibrating batch {:}, containing {:} sentences".format(current_batch, self.batch_size))

        input_ids = []
        segment_ids = []
        input_mask = []
        for offset in range(self.batch_size):
            example = self.data[self.current_index + offset]
            feature = dp.convert_example_to_features(example.doc_tokens, example.question_text, self.tokenizer, self.max_seq_length, self.doc_stride, self.max_query_length)[0]
            input_ids.append(feature.input_ids)
            segment_ids.append(feature.segment_ids)
            input_mask.append(feature.input_mask)

        # A single concatenate per field avoids re-copying the growing arrays
        # on every iteration and always yields an ndarray, so ``ravel`` works
        # for one-example batches too.  Dtypes are left as the features
        # provide them.
        cuda.memcpy_htod(self.device_inputs[0], np.concatenate(input_ids).ravel())
        cuda.memcpy_htod(self.device_inputs[1], np.concatenate(segment_ids).ravel())
        cuda.memcpy_htod(self.device_inputs[2], np.concatenate(input_mask).ravel())

        self.current_index += self.batch_size
        return self.device_inputs
示例#4
0
 def question_features(tokens, question):
     """Extract features from the paragraph tokens and the question.

     Delegates to ``dp.convert_example_to_features``.  ``tokenizer``,
     ``max_seq_length``, ``doc_stride`` and ``args`` are not parameters; they
     are resolved from the surrounding scope.
     """
     features = dp.convert_example_to_features(
         tokens, question, tokenizer, max_seq_length, doc_stride,
         args.max_query_length)
     return features
示例#5
0
 def question_features(tokens, question):
     """Extract features from the paragraph tokens and the question.

     Builds a lower-casing ``tokenization.FullTokenizer`` from
     ``args.vocab_file`` and delegates to ``dp.convert_example_to_features``
     with ``args.sequence_length``.  ``args`` is not a parameter; it is
     resolved from the surrounding scope.
     """
     # NOTE(review): the tokenizer is constructed on every call; consider
     # building it once in the enclosing scope if this is called repeatedly.
     vocab_tokenizer = tokenization.FullTokenizer(
         vocab_file=args.vocab_file, do_lower_case=True)
     # NOTE(review): 128 and 64 are presumably the document stride and the
     # maximum query length -- TODO confirm against
     # dp.convert_example_to_features.
     return dp.convert_example_to_features(
         tokens, question, vocab_tokenizer, args.sequence_length, 128, 64)