Exemplo n.º 1
0
        def batch_generator():
            """Yield one numpified feed dict per mini-batch of instances.

            When ``is_eval`` is set every instance index is used, in order. Otherwise
            indices contained in ``not_allowed`` are skipped and the remaining ones are
            shuffled with ``self._rng``. Answer spans longer than ``_max_span_size``
            are dropped unless ``is_eval`` is set.

            Batches are assembled here rather than precomputed because some inputs
            (the unique-word/character inputs and the wiq feature) are batch-specific.

            NOTE: ``emb_supports`` and ``emb_questions`` are buffers owned by the
            enclosing scope and reused for every batch. The yielded embedding tensors
            are slices of those buffers (presumably numpy views -- confirm), so a
            consumer should not expect them to survive the next iteration.
            """
            todo = [i for i in range(len(q_ids)) if is_eval or i not in not_allowed]
            if not is_eval:
                self._rng.shuffle(todo)
            while todo:
                # instance indices that make up the current batch
                batch_ids = todo[:self.batch_size]

                support_lengths = []
                question_lengths = []
                wiq = []
                spans = []
                span2question = []
                offsets = []
                at_spans = []

                unique_words, unique_word_lengths, question2unique, support2unique = \
                    unique_words_with_chars(q_tokenized, s_tokenized, self.char_vocab, batch_ids)

                # we have to create batches here and cannot precompute them because of the batch-specific wiq feature
                for i, j in enumerate(batch_ids):
                    support = s_ids[j]
                    for k, word_id in enumerate(support):
                        emb_supports[i, k] = self._get_emb(word_id)
                    # The buffer is shared between batches: clear the padding tail so
                    # embeddings of a longer support from an earlier batch do not leak
                    # into this batch's padding positions.
                    emb_supports[i, len(support):] = 0

                    question = q_ids[j]
                    for k, word_id in enumerate(question):
                        emb_questions[i, k] = self._get_emb(word_id)
                    # same reasoning as for the support buffer above
                    emb_questions[i, len(question):] = 0

                    support_lengths.append(s_lengths[j])
                    question_lengths.append(q_lengths[j])
                    # keep over-long spans only at evaluation time
                    aps = [s for s in answer_spans[j] if s[1] - s[0] <= _max_span_size or is_eval]
                    spans.extend(aps)
                    # every kept span points back to its row in this batch
                    span2question.extend([i] * len(aps))
                    wiq.append(word_in_question[j])
                    offsets.append(token_offsets[j])
                    at_spans.append(answertype_spans[j])

                # the last batch may hold fewer than self.batch_size instances
                batch_size = len(question_lengths)
                output = {
                    XQAPorts.unique_word_chars: unique_words,
                    XQAPorts.unique_word_char_length: unique_word_lengths,
                    XQAPorts.question_words2unique: question2unique,
                    XQAPorts.support_words2unique: support2unique,
                    # crop the shared buffers to the rows/length actually used by this batch
                    XQAPorts.emb_support: emb_supports[:batch_size, :max(support_lengths), :],
                    XQAPorts.support_length: support_lengths,
                    XQAPorts.emb_question: emb_questions[:batch_size, :max(question_lengths), :],
                    XQAPorts.question_length: question_lengths,
                    XQAPorts.word_in_question: wiq,
                    XQAPorts.answer_span: spans,
                    XQAPorts.correct_start_training: [] if is_eval else [s[0] for s in spans],
                    XQAPorts.answer2question: span2question,
                    XQAPorts.answer2question_training: [] if is_eval else span2question,
                    XQAPorts.keep_prob: 1.0 if is_eval else 1 - self.dropout,
                    XQAPorts.is_eval: is_eval,
                    XQAPorts.token_char_offsets: offsets,
                    CBOWXqaPorts.answer_type_span: at_spans
                }

                # we can only numpify in here, because bucketing is not possible prior
                batch = numpify(output, keys=[XQAPorts.unique_word_chars,
                                              XQAPorts.question_words2unique, XQAPorts.support_words2unique,
                                              XQAPorts.word_in_question, XQAPorts.token_char_offsets])
                todo = todo[self.batch_size:]
                yield batch
 def __call__(self, qa_settings: List[QASetting]) -> Mapping[TensorPort, np.ndarray]:
     """Preprocess `qa_settings` in test-time mode and return the numpified input ports.

     The result maps the multiple-support, question and atomic-candidates input
     ports to the "support", "question" and "candidates" entries of the
     preprocessed corpus.
     """
     preprocessed = self.preprocess(qa_settings, test_time=True)
     port_to_field = (
         (Ports.Input.multiple_support, "support"),
         (Ports.Input.question, "question"),
         (Ports.Input.atomic_candidates, "candidates"),
     )
     return numpify({port: preprocessed[field] for port, field in port_to_field})
Exemplo n.º 3
0
 def __call__(self, qa_settings: List[QASetting]) -> Mapping[TensorPort, np.ndarray]:
     """Preprocess `qa_settings` in test-time mode and return the numpified ports.

     The result maps the question and atomic-candidates input ports and the
     target-index port to the "question", "candidates" and "answers" entries of
     the preprocessed corpus.
     """
     preprocessed = self.preprocess(qa_settings, test_time=True)
     feed = dict()
     feed[Ports.Input.question] = preprocessed["question"]
     feed[Ports.Input.atomic_candidates] = preprocessed["candidates"]
     feed[Ports.Targets.target_index] = preprocessed["answers"]
     return numpify(feed)
Exemplo n.º 4
0
def test_numpify():
    """Check ``map.numpify`` on a list of lists and on a dict of (ragged) nested lists.

    NOTE(review): ``map`` here is presumably a project module imported under that
    name (it shadows the builtin) -- confirm against the file's imports.
    """

    def _fillna(xs):
        """Return the rows of ``xs`` right-padded with zeros as one 2D array.

        This is the reference result the dict case is compared against.
        """
        # Do not build np.array(xs) from ragged rows: NumPy >= 1.24 raises
        # ValueError for inhomogeneous shapes (deprecated since 1.19).
        lens = np.array([len(row) for row in xs])
        # mask[r, c] is True where row r has a real (non-padding) element
        mask = np.arange(lens.max()) < lens[:, None]
        flat = np.concatenate([np.asarray(row) for row in xs])
        out = np.zeros(mask.shape, dtype=flat.dtype)
        # boolean assignment fills row by row, matching the order of `flat`
        out[mask] = flat
        return out

    data = [[1, 2, 3], [4, 5], [6, 7, 8]]
    data_np = map.numpify(data)

    # list input: each element is compared with its own array, no padding expected
    for a, b in zip([np.array(x) for x in data], data_np):
        assert (a == b).all()

    data = {0: [[1, 2, 3]], 1: [[4, 5], [6, 7, 8]], 2: [[6, 7, 8]]}
    data_np = map.numpify(data)

    # dict input: each value is compared with its zero-padded reference
    for ak, bk in zip(data.keys(), data_np.keys()):
        a, b = data[ak], data_np[bk]
        assert (_fillna(a) == b).all()
Exemplo n.º 5
0
    def __call__(self, qa_settings: List[QASetting]) -> Mapping[TensorPort, np.ndarray]:
        """Convert `qa_settings` into a numpified feed dict, without answers.

        Data is prepared with ``with_answers=False``. Every question and support
        token id is looked up through ``self._get_emb`` and written into a
        zero-initialised embedding tensor sized for the longest sequence.
        """
        # the answer spans are part of the returned tuple but not used here
        (q_tokenized, q_ids, q_lengths,
         s_tokenized, s_ids, s_lengths,
         word_in_question, token_offsets, _answer_spans, slot) = prepare_data(
            qa_settings, self.vocab, self.config.get("lowercase", False), with_answers=False)

        char_inputs = unique_words_with_chars(q_tokenized, s_tokenized, self.char_vocab)
        unique_words, unique_word_lengths, question2unique, support2unique = char_inputs

        n_instances = len(qa_settings)

        def _blank_embeddings(longest):
            # one zero-filled slot per (instance, position); width follows emb_matrix
            return np.zeros((n_instances, longest, self.emb_matrix.shape[1]))

        emb_supports = _blank_embeddings(max(s_lengths))
        emb_questions = _blank_embeddings(max(q_lengths))

        for row, question_ids in enumerate(q_ids):
            for col, token_id in enumerate(s_ids[row]):
                emb_supports[row, col] = self._get_emb(token_id)
            for col, token_id in enumerate(question_ids):
                emb_questions[row, col] = self._get_emb(token_id)

        feed = {
            XQAPorts.unique_word_chars: unique_words,
            XQAPorts.unique_word_char_length: unique_word_lengths,
            XQAPorts.question_words2unique: question2unique,
            XQAPorts.support_words2unique: support2unique,
            XQAPorts.emb_support: emb_supports,
            XQAPorts.support_length: s_lengths,
            XQAPorts.emb_question: emb_questions,
            XQAPorts.question_length: q_lengths,
            XQAPorts.slot_list: slot,
            XQAPorts.word_in_question: word_in_question,
            XQAPorts.token_char_offsets: token_offsets,
            Ports.Input.question: q_ids
        }

        # only these ports are passed to numpify via `keys`
        ports_to_numpify = [
            XQAPorts.unique_word_chars,
            XQAPorts.question_words2unique,
            XQAPorts.support_words2unique,
            XQAPorts.word_in_question,
            XQAPorts.token_char_offsets,
            XQAPorts.slot_list,
            Ports.Input.question,
        ]
        return numpify(feed, keys=ports_to_numpify)
def get_batches(data,
                batch_size=32,
                pad=0,
                bucket_order=None,
                bucket_structure=None,
                exact_epoch=False):
    """
    Build a restartable generator that yields mini-batches of `data`.

    To avoid biases, keep `bucket_order=None` and `bucket_structure=None` when
    computationally possible (batches are then sampled from all instances).

    Args:
        `data`: dict with (multi-dimensional) numpy arrays or (nested) lists;
            the first inner dimension (`num_instances`) should be the same for all values.
        `batch_size`: the desired batch size
        `pad`: padding symbol used when `data` holds lists of lists of different sizes
        `bucket_order`: forwarded to get_buckets as `order` (list with keys); `None` for no bucketing
        `bucket_structure`: forwarded to get_buckets as `structure`; `None` for no bucketing
        `exact_epoch`: if `True`, the final batch of a bucket may be smaller than `batch_size`,
            so every instance is seen exactly once per epoch. If `False` (default), incomplete
            final batches are dropped, so all batches have the same size but some instances
            may be skipped in an epoch. It is forced to `True` when there are fewer than
            `batch_size` instances per bucket on average, to avoid yielding no batches on
            small data samples.

    Returns:
        a GeneratorWithRestart whose generator yields dicts with the same keys as `data`;
        each value is the corresponding (numpified) data indexed by the instance ids of
        one batch (at most `batch_size` of them).
    """
    assert isinstance(data, dict)

    first_value = list(data.values())[0]
    if isinstance(first_value, np.ndarray):
        data_np = data
    else:
        # the original `data` is still what get_buckets receives for bucketing
        data_np = numpify(data, pad)

    initial_buckets, _ = get_buckets(data, bucket_order, bucket_structure)
    # if average instances/bucket is smaller than batch_size: force exact_epoch
    # to avoid empty epochs during debugging on small data samples
    if len(first_value) < len(initial_buckets) * batch_size:
        exact_epoch = True

    def remaining_share(buckets):
        # fraction of all remaining instances held by each bucket (0 when none remain)
        total = float(np.sum([len(ids) for ids in buckets.values()]))
        shares = {}
        for bid, ids in buckets.items():
            shares[bid] = len(ids) / total if total > 0. else 0.
        return shares

    def bucket_generator():
        buckets, _ = get_buckets(data, bucket_order, bucket_structure)
        # shuffle in sorted bucket order to keep the run deterministic
        for bid in sorted(buckets.keys()):
            rs.shuffle(buckets[bid])

        while True:
            # sorted keys: to keep deterministic
            ranked = sorted(remaining_share(buckets).items(), key=lambda pair: pair[0])
            bids, probs = zip(*ranked)
            if np.sum(probs) == 0.:
                # every bucket has been emptied
                break
            # sample a bucket according to its remaining size
            chosen = rs.choice(bids, replace=False, p=probs)
            bucket = buckets[chosen]
            batch_indices = bucket[:batch_size]
            buckets[chosen] = bucket[batch_size:]
            # an undersized final batch is only emitted when exact_epoch is set
            if exact_epoch or len(batch_indices) == batch_size:
                yield {name: values[batch_indices] for name, values in data_np.items()}

    return GeneratorWithRestart(bucket_generator)