def getNumpyBatch(self):
    """Build the next calibration batch as flat int32 NumPy arrays.

    Reads ``self.batch_size`` examples from ``self.data`` starting at
    ``self.current_index``, converts each one with
    ``dp.convert_example_to_features`` and keeps only the first feature
    returned for every example.

    Returns:
        ``[input_ids, input_mask, segment_ids]`` (note the order), each the
        per-example arrays joined along axis 0 with dtype ``np.int32``, or
        ``None`` when fewer than ``self.batch_size`` examples remain.

    Side effects:
        On success, advances ``self.current_index`` by ``self.batch_size``
        and stores the returned list in ``self.current_data``. Neither is
        modified when ``None`` is returned.
    """
    if self.current_index + self.batch_size > self.num_inputs:
        print(
            "Calibrating index {:} batch size {:} exceed max input limit {:} sentences"
            .format(self.current_index, self.batch_size, self.num_inputs))
        return None

    # Integer division keeps the batch counter exact (no float round trip).
    current_batch = self.current_index // self.batch_size
    if current_batch % 10 == 0:
        print("Calibrating batch {:}, containing {:} sentences".format(
            current_batch, self.batch_size))

    input_ids = []
    segment_ids = []
    input_mask = []
    for offset in range(self.batch_size):
        example = self.data[self.current_index + offset]
        feature = dp.convert_example_to_features(
            example.doc_tokens, example.question_text, self.tokenizer,
            self.max_seq_length, self.doc_stride, self.max_query_length)[0]
        # Cast every example (not just the first) so the batch dtype does not
        # depend on the batch size or on the container type of the feature.
        input_ids.append(np.asarray(feature.input_ids, dtype=np.int32))
        segment_ids.append(np.asarray(feature.segment_ids, dtype=np.int32))
        input_mask.append(np.asarray(feature.input_mask, dtype=np.int32))

    # One concatenate per tensor instead of re-copying the growing array on
    # every loop iteration.
    batch = [
        np.concatenate(input_ids),
        np.concatenate(input_mask),
        np.concatenate(segment_ids),
    ]

    self.current_index += self.batch_size
    self.current_data = batch
    return self.current_data
def question_features(tokens, question):
    """Extract features from the paragraph tokens and the question.

    Forwards ``tokens`` and ``question`` to
    ``dp.convert_example_to_features`` together with the tokenizer,
    maximum sequence length, document stride and maximum query length read
    from ``self._model`` in the enclosing scope, and returns its result
    unchanged.
    """
    model = self._model
    call_args = (
        tokens,
        question,
        model.tokenizer,
        model.max_seq_length,
        model.doc_stride,
        model.max_query_length,
    )
    return dp.convert_example_to_features(*call_args)
def get_batch(self, names):
    """Copy the next calibration batch into the pre-allocated device inputs.

    Reads ``self.batch_size`` examples from ``self.data`` starting at
    ``self.current_index``, converts each one with
    ``dp.convert_example_to_features`` and keeps only the first feature
    returned for every example. The per-example arrays are joined along
    axis 0, flattened and copied with ``cuda.memcpy_htod`` into
    ``self.device_inputs[0]`` (input ids), ``[1]`` (segment ids) and
    ``[2]`` (input mask).

    Args:
        names: Not used by this implementation; kept for the caller's
            interface.

    Returns:
        ``self.device_inputs`` after the copies, or ``None`` when fewer than
        ``self.batch_size`` examples remain (in which case nothing is copied
        and ``self.current_index`` is left unchanged).
    """
    if self.current_index + self.batch_size > self.num_inputs:
        print("Calibrating index {:} batch size {:} exceed max input limit {:} sentences".format(
            self.current_index, self.batch_size, self.num_inputs))
        return None

    # Integer division keeps the batch counter exact (no float round trip).
    current_batch = self.current_index // self.batch_size
    if current_batch % 10 == 0:
        print("Calibrating batch {:}, containing {:} sentences".format(
            current_batch, self.batch_size))

    input_ids = []
    segment_ids = []
    input_mask = []
    for offset in range(self.batch_size):
        example = self.data[self.current_index + offset]
        feature = dp.convert_example_to_features(
            example.doc_tokens, example.question_text, self.tokenizer,
            self.max_seq_length, self.doc_stride, self.max_query_length)[0]
        input_ids.append(feature.input_ids)
        segment_ids.append(feature.segment_ids)
        input_mask.append(feature.input_mask)

    # One concatenate per tensor instead of re-copying the growing array on
    # every loop iteration. The dtype is whatever the features carry.
    # NOTE(review): presumably the device buffers are sized for that dtype;
    # confirm against the allocation code.
    host_input_ids = np.concatenate(input_ids).ravel()
    host_segment_ids = np.concatenate(segment_ids).ravel()
    host_input_mask = np.concatenate(input_mask).ravel()

    cuda.memcpy_htod(self.device_inputs[0], host_input_ids)
    cuda.memcpy_htod(self.device_inputs[1], host_segment_ids)
    cuda.memcpy_htod(self.device_inputs[2], host_input_mask)

    self.current_index += self.batch_size
    return self.device_inputs
def question_features(tokens, question):
    """Extract features from the paragraph tokens and the question.

    Forwards ``tokens`` and ``question`` to
    ``dp.convert_example_to_features`` along with ``tokenizer``,
    ``max_seq_length`` and ``doc_stride`` from the enclosing scope and
    ``args.max_query_length``, and returns its result unchanged.
    """
    call_args = (
        tokens,
        question,
        tokenizer,
        max_seq_length,
        doc_stride,
        args.max_query_length,
    )
    return dp.convert_example_to_features(*call_args)
def question_features(tokens, question, doc_stride=128, max_query_length=64):
    """Extract features from the paragraph tokens and the question.

    Builds a lower-casing ``tokenization.FullTokenizer`` from
    ``args.vocab_file`` and forwards everything to
    ``dp.convert_example_to_features`` with ``args.sequence_length`` as the
    maximum sequence length.

    Args:
        tokens: Paragraph tokens, passed through as the first argument.
        question: Question text, passed through as the second argument.
        doc_stride: Passed as the fifth positional argument. Defaults to
            128, the value that was previously hard-coded.
        max_query_length: Passed as the sixth positional argument. Defaults
            to 64, the value that was previously hard-coded.

    Returns:
        Whatever ``dp.convert_example_to_features`` returns.
    """
    # NOTE(review): a new FullTokenizer is constructed on every call;
    # presumably this re-reads the vocabulary file each time - consider
    # building it once in the enclosing scope if this is called repeatedly.
    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)
    return dp.convert_example_to_features(
        tokens, question, tokenizer, args.sequence_length,
        doc_stride, max_query_length)