def batch_generator():
    todo = [i for i in range(len(q_ids)) if is_eval or i not in not_allowed]
    if not is_eval:
        self._rng.shuffle(todo)
    while todo:
        support_lengths = list()
        question_lengths = list()
        wiq = list()
        spans = list()
        span2question = []
        offsets = []
        at_spans = []

        unique_words, unique_word_lengths, question2unique, support2unique = \
            unique_words_with_chars(q_tokenized, s_tokenized, self.char_vocab,
                                    todo[:self.batch_size])

        # we have to create batches here and cannot precompute them
        # because of the batch-specific wiq feature
        for i, j in enumerate(todo[:self.batch_size]):
            support = s_ids[j]
            for k in range(len(support)):
                emb_supports[i, k] = self._get_emb(support[k])
            question = q_ids[j]
            for k in range(len(question)):
                emb_questions[i, k] = self._get_emb(question[k])
            support_lengths.append(s_lengths[j])
            question_lengths.append(q_lengths[j])
            aps = [s for s in answer_spans[j]
                   if s[1] - s[0] <= _max_span_size or is_eval]
            spans.extend(aps)
            span2question.extend(i for _ in aps)
            wiq.append(word_in_question[j])
            offsets.append(token_offsets[j])
            at_spans.append(answertype_spans[j])

        batch_size = len(question_lengths)
        output = {
            XQAPorts.unique_word_chars: unique_words,
            XQAPorts.unique_word_char_length: unique_word_lengths,
            XQAPorts.question_words2unique: question2unique,
            XQAPorts.support_words2unique: support2unique,
            XQAPorts.emb_support: emb_supports[:batch_size, :max(support_lengths), :],
            XQAPorts.support_length: support_lengths,
            XQAPorts.emb_question: emb_questions[:batch_size, :max(question_lengths), :],
            XQAPorts.question_length: question_lengths,
            XQAPorts.word_in_question: wiq,
            XQAPorts.answer_span: spans,
            XQAPorts.correct_start_training: [] if is_eval else [s[0] for s in spans],
            XQAPorts.answer2question: span2question,
            XQAPorts.answer2question_training: [] if is_eval else span2question,
            XQAPorts.keep_prob: 1.0 if is_eval else 1 - self.dropout,
            XQAPorts.is_eval: is_eval,
            XQAPorts.token_char_offsets: offsets,
            CBOWXqaPorts.answer_type_span: at_spans
        }

        # we can only numpify here, because bucketing is not possible beforehand
        batch = numpify(output, keys=[
            XQAPorts.unique_word_chars, XQAPorts.question_words2unique,
            XQAPorts.support_words2unique, XQAPorts.word_in_question,
            XQAPorts.token_char_offsets])
        todo = todo[self.batch_size:]
        yield batch
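# A minimal, self-contained sketch of the shuffle-then-slice batching
# pattern used by batch_generator above (the names below are illustrative
# and not part of the surrounding class):
import random

def index_batches(num_instances, batch_size, shuffle=True, seed=0):
    # shuffle all instance indices once, then repeatedly slice off the
    # next `batch_size` indices until none are left
    todo = list(range(num_instances))
    if shuffle:
        random.Random(seed).shuffle(todo)
    while todo:
        yield todo[:batch_size]
        todo = todo[batch_size:]

# e.g. list(index_batches(5, 2, shuffle=False)) == [[0, 1], [2, 3], [4]]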
def __call__(self, qa_settings: List[QASetting]) -> Mapping[TensorPort, np.ndarray]:
    corpus = self.preprocess(qa_settings, test_time=True)
    x_dict = {
        Ports.Input.multiple_support: corpus["support"],
        Ports.Input.question: corpus["question"],
        Ports.Input.atomic_candidates: corpus["candidates"]
    }
    return numpify(x_dict)
def __call__(self, qa_settings: List[QASetting]) -> Mapping[TensorPort, np.ndarray]:
    corpus = self.preprocess(qa_settings, test_time=True)
    xy_dict = {
        Ports.Input.question: corpus["question"],
        Ports.Input.atomic_candidates: corpus["candidates"],
        Ports.Targets.target_index: corpus["answers"]
    }
    return numpify(xy_dict)
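# The two __call__ variants above differ only in the presence of gold
# answers: the first builds a test-time feed dict from inputs alone, while
# the second adds Ports.Targets.target_index for training. A toy
# illustration, with plain string keys standing in for TensorPorts:
corpus_example = {"question": [[3, 1, 4]], "candidates": [[0, 1, 2]], "answers": [1]}
x_dict_example = {k: corpus_example[k] for k in ("question", "candidates")}
xy_dict_example = dict(x_dict_example, answers=corpus_example["answers"])
assert "answers" not in x_dict_example and "answers" in xy_dict_example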
def test_numpify():
    # reference implementation: zero-pad a (possibly ragged) list of lists
    # into a 2D array; rows are built via a boolean mask over positions
    def _fillna(xs):
        lens = np.array([len(x) for x in xs])
        mask = np.arange(lens.max()) < lens[:, None]
        out = np.zeros(mask.shape, dtype=np.asarray(xs[0]).dtype)
        out[mask] = np.concatenate([np.asarray(x) for x in xs])
        return out

    data = [[1, 2, 3], [4, 5], [6, 7, 8]]
    data_np = map.numpify(data)
    for a, b in zip([np.array(x) for x in data], data_np):
        assert (a == b).all()

    data = {0: [[1, 2, 3]], 1: [[4, 5], [6, 7, 8]], 2: [[6, 7, 8]]}
    data_np = map.numpify(data)
    for ak, bk in zip(data.keys(), data_np.keys()):
        a, b = data[ak], data_np[bk]
        assert (_fillna(a) == b).all()
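# What the test above relies on, in isolation: ragged lists of lists are
# zero-padded to the length of the longest inner list. A minimal
# re-implementation of that padding (illustrative only; the real numpify
# also handles dicts, deeper nesting, and a configurable pad symbol):
import numpy as np

def pad_to_array(xs, pad=0):
    max_len = max(len(x) for x in xs)
    out = np.full((len(xs), max_len), pad, dtype=np.int64)
    for i, x in enumerate(xs):
        out[i, :len(x)] = x
    return out

# pad_to_array([[4, 5], [6, 7, 8]]) -> array([[4, 5, 0], [6, 7, 8]])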
def __call__(self, qa_settings: List[QASetting]) -> Mapping[TensorPort, np.ndarray]:
    q_tokenized, q_ids, q_lengths, s_tokenized, s_ids, s_lengths, \
        word_in_question, token_offsets, answer_spans, slot = \
        prepare_data(qa_settings, self.vocab,
                     self.config.get("lowercase", False),
                     with_answers=False)

    unique_words, unique_word_lengths, question2unique, support2unique = \
        unique_words_with_chars(q_tokenized, s_tokenized, self.char_vocab)

    batch_size = len(qa_settings)
    emb_supports = np.zeros([batch_size, max(s_lengths), self.emb_matrix.shape[1]])
    emb_questions = np.zeros([batch_size, max(q_lengths), self.emb_matrix.shape[1]])

    for i, q in enumerate(q_ids):
        for k, v in enumerate(s_ids[i]):
            emb_supports[i, k] = self._get_emb(v)
        for k, v in enumerate(q):
            emb_questions[i, k] = self._get_emb(v)

    output = {
        XQAPorts.unique_word_chars: unique_words,
        XQAPorts.unique_word_char_length: unique_word_lengths,
        XQAPorts.question_words2unique: question2unique,
        XQAPorts.support_words2unique: support2unique,
        XQAPorts.emb_support: emb_supports,
        XQAPorts.support_length: s_lengths,
        XQAPorts.emb_question: emb_questions,
        XQAPorts.question_length: q_lengths,
        XQAPorts.slot_list: slot,
        XQAPorts.word_in_question: word_in_question,
        XQAPorts.token_char_offsets: token_offsets,
        Ports.Input.question: q_ids
    }

    output = numpify(output, keys=[
        XQAPorts.unique_word_chars, XQAPorts.question_words2unique,
        XQAPorts.support_words2unique, XQAPorts.word_in_question,
        XQAPorts.token_char_offsets, XQAPorts.slot_list,
        Ports.Input.question])
    return output
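# The batchers above look up token embeddings through a helper
# self._get_emb whose implementation is not shown here. A plausible sketch
# (an assumption, not the actual code): in-vocabulary ids index rows of the
# embedding matrix, out-of-vocabulary ids fall back to a zero vector of the
# same dimensionality.
import numpy as np

def get_emb(emb_matrix, idx):
    if idx < emb_matrix.shape[0]:
        return emb_matrix[idx]
    return np.zeros(emb_matrix.shape[1])  # OOV fallback (assumed behavior)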
def get_batches(data, batch_size=32, pad=0, bucket_order=None,
                bucket_structure=None, exact_epoch=False):
    """Creates a generator that batches `data`.

    To avoid biases, it is advised to keep `bucket_order=None` and
    `bucket_structure=None` if computationally possible (batches are then
    sampled from all instances).

    Args:
        `data`: dict with (multi-dimensional) numpy arrays or (nested) lists;
            the first inner dimension (`num_instances`) should be the same
            over all data values.
        `batch_size`: the desired batch size.
        `pad`: padding symbol in case data contains lists of lists of
            different sizes.
        `bucket_order`: argument `order` in `get_buckets` (list with keys);
            `None` if no bucketing.
        `bucket_structure`: argument `structure` in `get_buckets`; `None`
            if no bucketing.
        `exact_epoch`: if set to `True`, the final batch per bucket may be
            smaller, but each instance is seen exactly once during an epoch.
            Default: `False`, to be certain during training that each
            instance per batch gets the same weight in the total loss (but
            not all instances are observed per epoch if bucket sizes are
            not a multiple of `batch_size`).

    Returns:
        A generator that yields dicts with the same keys as `data`, whose
        values are 2D numpy tensor batches; the first dimension is at most
        `batch_size`, but may be smaller to cover all instances exactly
        once per epoch if `exact_epoch=True`.
    """
    assert isinstance(data, dict)
    data0 = list(data.values())[0]
    if not isinstance(data0, np.ndarray):
        # still need the original data for length-based bucketing
        data_np = numpify(data, pad)
    else:
        data_np = data

    def get_bucket_probs(_buckets2instances):
        N = float(np.sum([len(ids) for ids in _buckets2instances.values()]))
        return {bid: len(ids) / N if N > 0. else 0.
                for bid, ids in _buckets2instances.items()}

    def shuffle_buckets(_buckets2instances):
        # sorted keys: to keep shuffling deterministic
        for bid in sorted(_buckets2instances.keys()):
            rs.shuffle(_buckets2instances[bid])

    buckets2instances, _ = get_buckets(data, bucket_order, bucket_structure)
    n_buckets = len(buckets2instances)
    # if the average number of instances per bucket is smaller than
    # batch_size, force exact_epoch = True to avoid empty batches during
    # debugging on small data samples
    if len(data0) < n_buckets * batch_size:
        exact_epoch = True

    def bucket_generator():
        buckets2instances, _ = get_buckets(data, bucket_order, bucket_structure)
        shuffle_buckets(buckets2instances)
        all_seen = False
        while not all_seen:
            # sorted keys: to keep sampling deterministic
            bids, probs = zip(*sorted(get_bucket_probs(buckets2instances).items(),
                                      key=lambda x: x[0]))
            if np.sum(probs) == 0.:
                all_seen = True
            else:
                # sample a bucket according to its remaining size
                bid = rs.choice(bids, replace=False, p=probs)
                batch_indices = buckets2instances[bid][:batch_size]
                buckets2instances[bid] = buckets2instances[bid][batch_size:]
                # if required by exact_epoch: also include the last batch
                # in a bucket even if it is too small
                if len(batch_indices) == batch_size or exact_epoch:
                    yield {k: data_np[k][batch_indices] for k in data_np}

    return GeneratorWithRestart(bucket_generator)
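# A minimal usage sketch for get_batches with no bucketing, so batches are
# sampled from all instances. The `data` keys and values are illustrative,
# and the loop assumes GeneratorWithRestart is iterable (it wraps the
# generator so iteration can be restarted for each epoch):
data = {"seq": [[1, 2, 3], [4, 5], [6]], "label": [0, 1, 0]}
batches = get_batches(data, batch_size=2, exact_epoch=True)
for batch in batches:
    # with exact_epoch=True the final batch may be smaller, e.g.
    # (2, 3) (2,) followed by (1, 3) (1,)
    print(batch["seq"].shape, batch["label"].shape)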