def generator():
    """Yield batches of samples grouped into length-based buckets.

    This is a closure: ``samples``, ``batch_size``, ``shuffle`` and ``self``
    are taken from the enclosing scope. The corpus is fully loaded into
    memory, sentence lengths are clustered with ``kmeans`` and every bucket
    is cut into chunks; each chunk is handed to
    ``self.batched_inputs_to_batches`` whose results are re-yielded.

    Yields:
        Whatever ``self.batched_inputs_to_batches`` yields for each chunk.
    """
    # custom bucketing, load corpus into memory
    source = samples() if callable(samples) else samples
    corpus = list(source)
    sent_lengths = [self.len_of_sent(sent) for sent in corpus]
    # Small corpora are kept in a single bucket.
    n_buckets = 1 if len(corpus) < 32 else min(self.config.n_buckets, len(corpus))
    clustered = dict(zip(*kmeans(sent_lengths, n_buckets)))
    sizes, buckets = zip(*clustered.items())

    # the number of chunks in each bucket, which is clipped by
    # range [1, len(bucket)]
    chunks = []
    for size, bucket in zip(sizes, buckets):
        wanted = round(size * len(bucket) / batch_size)
        chunks.append(min(len(bucket), max(wanted, 1)))

    range_fn = randperm if shuffle else arange
    max_samples_per_batch = self.config.get('max_samples_per_batch', None)
    for bucket_id in tolist(range_fn(len(buckets))):
        bucket = buckets[bucket_id]
        n_chunks = chunks[bucket_id]
        # how many sentences in each batch
        split_sizes = [(len(bucket) - k - 1) // n_chunks + 1 for k in range(n_chunks)]
        for batch_indices in tf.split(range_fn(len(bucket)), split_sizes):
            indices = [bucket[pos] for pos in tolist(batch_indices)]
            if not max_samples_per_batch:
                yield from self.batched_inputs_to_batches(corpus, indices, shuffle)
                continue
            # Cap the number of sentences per batch by slicing the chunk further.
            for start in range(0, len(indices), max_samples_per_batch):
                yield from self.batched_inputs_to_batches(
                    corpus, indices[start:start + max_samples_per_batch], shuffle)
def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
    """Convert batched arc/relation predictions into per-sentence lists.

    Args:
        Y: A triple ``(arc_preds, rel_preds, mask)``.
        gold: Not used by this implementation.
        inputs: Not used by this implementation.
        X: Not used by this implementation.

    Returns:
        A list with one entry per sentence; each entry is a list of
        ``(arc, relation_token)`` tuples, where the relation id is mapped
        through ``self.rel_vocab.idx_to_token``.
    """
    arc_preds, rel_preds, mask = Y
    # Number of non-zero mask entries per row decides how many positions to keep.
    lengths = tf.math.count_nonzero(mask, axis=-1)
    sents = []
    for arc_sent, rel_sent, length in zip(arc_preds, rel_preds, lengths):
        stop = length + 1
        # Position 0 is skipped; presumably it holds a root/BOS placeholder.
        arcs = tolist(arc_sent)[1:stop]
        rels = tolist(rel_sent)[1:stop]
        sent = []
        for a, r in zip(arcs, rels):
            sent.append((a, self.rel_vocab.idx_to_token[r]))
        sents.append(sent)
    return sents
def generator():
    """Yield bucketed batches with dense arc and relation matrices.

    This is a closure: ``samples``, ``batch_size``, ``shuffle`` and ``self``
    are taken from the enclosing scope. The corpus is fully loaded into
    memory, sentence lengths are clustered with ``kmeans`` and every bucket
    is cut into chunks. For each chunk the first two cells of every token
    are mapped through ``self.form_vocab`` / ``self.cpos_vocab`` and padded,
    while cells 2 and 3 (paired head positions and relation labels) fill a
    boolean arc matrix and an integer relation matrix of side ``max_len``.

    Yields:
        ``((forms, cposes), (arcs, rels))`` where ``forms`` and ``cposes``
        are padded ``int64`` arrays and ``arcs`` / ``rels`` are lists of
        ``(max_len, max_len)`` arrays, one per sentence in the batch.
    """
    # custom bucketing, load corpus into memory
    corpus = list(samples() if callable(samples) else samples)
    lengths = [1 + len(sent) for sent in corpus]
    # Small corpora are kept in a single bucket.
    if len(corpus) < 32:
        n_buckets = 1
    else:
        n_buckets = min(self.config.n_buckets, len(corpus))
    buckets = dict(zip(*kmeans(lengths, n_buckets)))
    sizes, buckets = zip(*buckets.items())
    # the number of chunks in each bucket, which is clipped by
    # range [1, len(bucket)]
    chunks = [min(len(bucket), max(round(size * len(bucket) / batch_size), 1))
              for size, bucket in zip(sizes, buckets)]
    range_fn = randperm if shuffle else arange
    # Only the first two columns (form, cpos) go through a vocabulary.
    vocabs = [self.form_vocab, self.cpos_vocab]
    for i in tolist(range_fn(len(buckets))):
        split_sizes = [(len(buckets[i]) - j - 1) // chunks[i] + 1 for j in range(chunks[i])]
        for batch_indices in tf.split(range_fn(len(buckets[i])), split_sizes):
            indices = [buckets[i][j] for j in tolist(batch_indices)]
            raw_batch = [[], [], [], []]
            # All matrices in a batch share the side length of its longest sentence.
            max_len = len(max([corpus[idx] for idx in indices], key=len))
            for idx in indices:
                # Builtin ``bool`` instead of ``np.bool``: the alias was deprecated
                # in NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
                arc = np.zeros((max_len, max_len), dtype=bool)
                rel = np.zeros((max_len, max_len), dtype=np.int64)
                for b in raw_batch[:2]:
                    b.append([])
                for m, cells in enumerate(corpus[idx]):
                    # zip stops after the two vocabularies, so only cells[0:2] are mapped.
                    for b, c, v in zip(raw_batch, cells, vocabs):
                        b[-1].append(v.get_idx_without_add(c))
                    for n, r in zip(cells[2], cells[3]):
                        arc[m, n] = True
                        rid = self.rel_vocab.get_idx_without_add(r)
                        if rid is None:
                            # The arc stays marked; its relation id is left at 0.
                            logger.warning(f'Relation OOV: {r} not exists in train')
                            continue
                        rel[m, n] = rid
                raw_batch[-2].append(arc)
                raw_batch[-1].append(rel)
            batch = []
            for b, v in zip(raw_batch, vocabs):
                b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post',
                                                                  value=v.safe_pad_token_idx,
                                                                  dtype='int64')
                batch.append(b)
            batch += raw_batch[2:]
            assert len(batch) == 4
            yield (batch[0], batch[1]), (batch[2], batch[3])
def generator():
    """Yield bucketed, padded batches of four parallel columns.

    This is a closure: ``samples``, ``batch_size``, ``shuffle`` and ``self``
    are taken from the enclosing scope. The corpus is fully loaded into
    memory, sentence lengths are clustered with ``kmeans`` and every bucket
    is cut into chunks. Per token, cells 0, 1 and 3 are mapped through
    ``self.form_vocab``, ``self.cpos_vocab`` and ``self.rel_vocab``; cell 2
    is passed through unchanged.

    Yields:
        ``((col0, col1), (col2, col3))`` where every column is an ``int64``
        array post-padded by ``pad_sequences``.
    """
    # custom bucketing, load corpus into memory
    source = samples() if callable(samples) else samples
    corpus = list(source)
    lengths = [1 + len(sent) for sent in corpus]
    # Small corpora are kept in a single bucket.
    n_buckets = 1 if len(corpus) < 32 else min(self.config.n_buckets, len(corpus))
    clustered = dict(zip(*kmeans(lengths, n_buckets)))
    sizes, buckets = zip(*clustered.items())

    # the number of chunks in each bucket, which is clipped by
    # range [1, len(bucket)]
    chunks = []
    for size, bucket in zip(sizes, buckets):
        wanted = round(size * len(bucket) / batch_size)
        chunks.append(min(len(bucket), max(wanted, 1)))

    range_fn = randperm if shuffle else arange
    for bucket_id in tolist(range_fn(len(buckets))):
        bucket = buckets[bucket_id]
        n_chunks = chunks[bucket_id]
        split_sizes = [(len(bucket) - k - 1) // n_chunks + 1 for k in range(n_chunks)]
        for batch_indices in tf.split(range_fn(len(bucket)), split_sizes):
            indices = [bucket[pos] for pos in tolist(batch_indices)]
            # One vocabulary per column; a falsy entry means "keep the raw value".
            column_vocabs = (self.form_vocab, self.cpos_vocab, None, self.rel_vocab)
            raw_batch = [[], [], [], []]
            for idx in indices:
                for column in raw_batch:
                    column.append([])
                for cells in corpus[idx]:
                    for column, cell, vocab in zip(raw_batch, cells, column_vocabs):
                        if vocab:
                            column[-1].append(vocab.get_idx_without_add(cell))
                        else:
                            column[-1].append(cell)

            batch = []
            for column, vocab in zip(raw_batch, column_vocabs):
                pad_value = vocab.safe_pad_token_idx if vocab else 0
                padded = tf.keras.preprocessing.sequence.pad_sequences(
                    column, padding='post', value=pad_value, dtype='int64')
                batch.append(padded)
            assert len(batch) == 4
            yield (batch[0], batch[1]), (batch[2], batch[3])
def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
    """Convert a batch of form/cpos ids back into token tuples.

    Args:
        X: Either ``(form_batch, cposes_batch)`` or
            ``(form_batch, cposes_batch, mask)``. When no mask is given it
            is computed as ``form_batch != 0``.

    Returns:
        A list with one entry per sentence; each entry is a list of
        ``(form_token, cpos_token)`` tuples looked up through
        ``self.form_vocab.idx_to_token`` and ``self.cpos_vocab.idx_to_token``.

    Raises:
        ValueError: If ``X`` has neither 2 nor 3 elements.
    """
    n_parts = len(X)
    if n_parts not in (2, 3):
        raise ValueError(f'Expect X to be 2 or 3 elements but got {repr(X)}')
    if n_parts == 2:
        form_batch, cposes_batch = X
        mask = tf.not_equal(form_batch, 0)
    else:
        form_batch, cposes_batch, mask = X

    # Number of non-zero mask entries per row decides how many positions to keep.
    lengths = tf.math.count_nonzero(mask, axis=-1)
    sents = []
    for form_sent, cposes_sent, length in zip(form_batch, cposes_batch, lengths):
        stop = length + 1
        # Position 0 is skipped; presumably it holds a root/BOS placeholder.
        forms = tolist(form_sent)[1:stop]
        cposes = tolist(cposes_sent)[1:stop]
        sent = []
        for f, c in zip(forms, cposes):
            sent.append((self.form_vocab.idx_to_token[f], self.cpos_vocab.idx_to_token[c]))
        sents.append(sent)
    return sents
def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
    """Convert batched arc/relation matrices into per-token head lists.

    Args:
        Y: A triple ``(arc_preds, rel_preds, mask)``.
        gold: Not used by this implementation.
        inputs: Not used by this implementation.
        X: Not used by this implementation.

    Returns:
        A list with one entry per sentence. Each sentence is a list with one
        entry per matrix row (after dropping row 0), and each row entry is a
        list of ``(head_position, relation_token)`` tuples. A row without
        any truthy arc gets the single entry ``(0, self.orphan_relation)``.
    """
    arc_preds, rel_preds, mask = Y
    # The per-sentence count is not read below, but it still takes part in
    # the zip so the iteration length is bounded exactly as before.
    lengths = tf.math.count_nonzero(mask, axis=-1)
    sents = []
    for arc_sent, rel_sent, _length in zip(arc_preds, rel_preds, lengths):
        # Drop row 0 and column 0 before converting to nested lists.
        arc_rows = tolist(arc_sent[1:, 1:])
        rel_rows = tolist(rel_sent[1:, 1:])
        sent = []
        for arc_row, rel_row in zip(arc_rows, rel_rows):
            # +1 restores the position shifted away by slicing off column 0.
            heads = [(col + 1, self.rel_vocab.idx_to_token[r])
                     for col, (a, r) in enumerate(zip(arc_row, rel_row)) if a]
            if not heads:
                # orphan
                heads.append((0, self.orphan_relation))
            sent.append(heads)
        sents.append(sent)
    return sents