class TSVTaggingTransform(TsvTaggingFormat, Transform):
    """Transform for tab-separated tagging corpora (one token/tag pair per
    line) that builds word/tag vocabularies and maps samples between raw
    strings and vocabulary indices.

    Attributes are populated by :meth:`fit`:

    - ``word_vocab``: vocabulary over input tokens.
    - ``tag_vocab``: vocabulary over output tags (no pad/unk entries).
    - ``char_vocab``: optional character vocabulary, built only when char
      features are requested (see :meth:`fit`).
    """

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True,
                 use_char=False, **kwargs) -> None:
        # merge_locals_kwargs folds the explicit keyword arguments (including
        # use_char) into the config passed to the base Transform.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.word_vocab: Optional[Vocab] = None
        self.tag_vocab: Optional[Vocab] = None
        self.char_vocab: Optional[Vocab] = None

    def fit(self, trn_path: str, **kwargs) -> int:
        """Build vocabularies from a gold training file.

        Args:
            trn_path: Path to the TSV training corpus.

        Returns:
            The number of sentences (samples) seen.
        """
        self.word_vocab = Vocab()
        # Tags need neither a padding nor an unknown entry.
        self.tag_vocab = Vocab(pad_token=None, unk_token=None)
        num_samples = 0
        for words, tags in generator_words_tags(trn_path, gold=True,
                                                lower=self.config.get('lower', False)):
            self.word_vocab.update(words)
            self.tag_vocab.update(tags)
            num_samples += 1
        # Build a character vocabulary when requested: either a caller
        # pre-assigned self.char_vocab before fitting (the original trigger),
        # or the use_char flag was passed to __init__ (previously this flag
        # was silently ignored because __init__ leaves char_vocab as None).
        if self.char_vocab or self.config.get('use_char', False):
            self.char_vocab = Vocab()
            for word in self.word_vocab.token_to_idx.keys():
                # Skip the special tokens; they are not real character data.
                if word in (self.word_vocab.pad_token, self.word_vocab.unk_token):
                    continue
                self.char_vocab.update(list(word))
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Return (dtypes, shapes, padding values) for the tf.data pipeline."""
        types = tf.string, tf.string
        shapes = [None], [None]
        values = self.word_vocab.pad_token, self.tag_vocab.first_token
        return types, shapes, values

    def inputs_to_samples(self, inputs, gold=False):
        """Yield (tokens, tags) samples, lower-casing tokens if configured.

        ``inputs`` yields (token_list, tag_list) pairs when ``gold`` is True,
        otherwise bare token lists, for which dummy tags (the tag padding
        value) are generated.
        """
        lower = self.config.get('lower', False)
        if gold:
            if lower:
                for x, y in inputs:
                    # x is a list of tokens; lower-case each one. (Calling
                    # .lower() on the list itself would raise AttributeError.)
                    yield [w.lower() for w in x], y
            else:
                yield from inputs
        else:
            dummy_tag = self.padding_values[-1]
            for x in inputs:
                yield [w.lower() for w in x] if lower else x, [dummy_tag] * len(x)

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        """Map token strings to word vocabulary ids."""
        return self.word_vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        """Map tag strings to tag vocabulary ids."""
        return self.tag_vocab.lookup(y)

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        """Convert batched tensors back to lists of token strings.

        When a char vocabulary is in use the X tensors hold raw strings;
        otherwise they hold word ids to be looked up.
        """
        for xs in X:
            words = []
            for x in xs:
                words.append(str_tensor_to_str(x) if self.char_vocab
                             else self.word_vocab.idx_to_token[int(x)])
            yield words

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False,
                     inputs=None, X=None, **kwargs) -> Iterable:
        """Convert batched tag tensors (logits or gold ids) to tag strings.

        Stops at the first masked-out timestep of each sequence; when no
        Keras mask is attached, every timestep is kept.
        """
        masks = Y._keras_mask if hasattr(Y, '_keras_mask') else tf.ones_like(Y)
        if not gold:
            # Predictions arrive as per-tag scores; take the arg-max tag id.
            Y = tf.argmax(Y, axis=2)
        for ys, mask in zip(Y, masks):
            tags = []
            for y, m in zip(ys, mask):
                if not m:
                    break
                tags.append(self.tag_vocab.idx_to_token[int(y)])
            yield tags

    def input_is_single_sample(self,
                               input: Union[List[str], List[List[str]]]) -> bool:
        """A single sample is a flat list of token strings."""
        return isinstance(input[0], str)

    def input_truth_output_to_str(self, input: List[str], truth: List[str],
                                  output: List[str]):
        """Format one sentence as 'word gold pred' lines plus a blank line."""
        # ''.join avoids quadratic += string building; output is identical,
        # including the lone '\n' produced for an empty sentence.
        return ''.join(f'{word} {gold_tag} {pred_tag}\n'
                       for word, gold_tag, pred_tag
                       in zip(input, truth, output)) + '\n'
class TextTransform(Transform):
    """Transform that turns a plain-text file into fixed-length (input,
    shifted-target) token windows for language modeling.

    ``forward=False`` reads the file backwards and reverses each sample,
    producing a right-to-left model's training data.
    """

    def __init__(self, forward=True, seq_len=10, tokenizer='char',
                 config: SerializableDict = None, map_x=True, map_y=True,
                 **kwargs) -> None:
        super().__init__(config, map_x, map_y, seq_len=seq_len,
                         tokenizer=tokenizer, forward=forward, **kwargs)
        # Built by fit(); None until then (the original `Vocab` annotation
        # was inaccurate).
        self.vocab: Optional[Vocab] = None

    def tokenize_func(self):
        """Return the tokenizer callable selected by config.tokenizer:
        'char' splits into characters, 'whitespace' on any whitespace,
        anything else is used as a literal split delimiter."""
        if self.config.tokenizer == 'char':
            return list
        elif self.config.tokenizer == 'whitespace':
            return lambda x: x.split()
        else:
            return lambda x: x.split(self.config.tokenizer)

    def fit(self, trn_path: str, **kwargs) -> int:
        """Build the vocabulary from the training text.

        Returns the number of (x, y) windows generated.
        """
        self.vocab = Vocab()
        num_samples = 0
        for x, y in self.file_to_inputs(trn_path):
            # y is x shifted by one token, so updating with x alone covers
            # (almost) the same token stream.
            self.vocab.update(x)
            num_samples += 1
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Return (dtypes, shapes, padding values) for the tf.data pipeline."""
        types = tf.string, tf.string
        shapes = [None], [None]
        defaults = self.vocab.pad_token, self.vocab.pad_token
        return types, shapes, defaults

    def file_to_inputs(self, filepath: str, gold=True):
        """Stream (x, y) windows of seq_len tokens from a text file.

        y is x shifted left by one token; the window slides one token at a
        time. When config.forward is False the file is read from the end
        (FileReadBackwards), and inputs_to_samples later reverses each
        window.
        """
        forward = self.config.forward
        seq_len = self.config.seq_len
        buffer = []
        tokenizer = self.tokenize_func()
        with open(filepath, encoding='utf-8') if forward else FileReadBackwards(
                filepath, encoding="utf-8") as src:
            for line in src:
                tokens = tokenizer(line)
                buffer += tokens
                # Need seq_len + 1 tokens to emit both x and its shifted y.
                while len(buffer) > seq_len:
                    yield buffer[:seq_len], buffer[1:1 + seq_len]
                    buffer.pop(0)

    def inputs_to_samples(self, inputs, gold=False):
        """Yield (x, y) pairs; without gold labels y is a copy of x.
        Reverses both sequences for backward models."""
        forward = self.config.forward
        for t in inputs:
            if gold:
                x, y = t
            else:
                x, y = t, t
            if not forward:
                x = list(reversed(x))
                y = list(reversed(y))
            yield x, y

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        """Map token strings to vocabulary ids."""
        return self.vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        # Inputs and targets share one vocabulary.
        return self.x_to_idx(y)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                     gold=False) -> Iterable:
        """Convert batched prediction logits back to token strings,
        truncating each sequence at its first masked timestep."""
        # Guard against tensors without a Keras mask, consistent with
        # TSVTaggingTransform.Y_to_outputs (previously an unmasked tensor
        # raised AttributeError here).
        mask = Y._keras_mask if hasattr(Y, '_keras_mask') else tf.ones_like(Y)
        pred = tf.argmax(Y, axis=-1)
        for ys, ms in zip(pred, mask):
            ret = []
            for y, m in zip(ys, ms):
                if not m:
                    break
                ret.append(self.vocab.idx_to_token[int(y)])
            yield ret

    def input_is_single_sample(self, input: Any) -> bool:
        """A single sample is a flat list of token strings."""
        return isinstance(input[0], str)