def __call__(
        self, dataset: Dataset,
        generated_series: Dict[str, Iterable[Any]]) -> Iterable[List[str]]:
    source_series = generated_series.get(
        self._source_id, dataset.get_series(self._source_id))
    edits_series = generated_series.get(
        self._edits_id, dataset.get_series(self._edits_id))

    for src_seq, edit_seq in zip(source_series, edits_series):
        yield reconstruct(src_seq, edit_seq)
def _do_postprocess(
        self, dataset: Dataset,
        generated_series: Dict[str, Iterable[Any]]) -> Iterable[List[str]]:
    source_series = generated_series.get(self._source_id)
    if source_series is None:
        source_series = dataset.get_series(self._source_id)
    edits_series = generated_series.get(self._edits_id)
    if edits_series is None:
        edits_series = dataset.get_series(self._edits_id)

    for src_seq, edit_seq in zip(source_series, edits_series):
        reconstructed = reconstruct(src_seq, edit_seq)
        yield reconstructed
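# The two postprocessors above rely on a `reconstruct` helper that applies a
# sequence of edit operations to a source sequence. A minimal sketch of such
# a helper, assuming <keep>/<delete> edit tokens and plain tokens for
# insertions (the token names and semantics are an assumption, not confirmed
# by the code above):
KEEP = "<keep>"      # assumed marker: copy the next source word
DELETE = "<delete>"  # assumed marker: skip the next source word


def reconstruct(source, edits):
    result = []
    src_pos = 0
    for edit in edits:
        if edit == KEEP:
            if src_pos < len(source):
                result.append(source[src_pos])
            src_pos += 1
        elif edit == DELETE:
            src_pos += 1
        else:
            # any other edit token is an inserted word
            result.append(edit)
    # copy source words not covered by the edit sequence
    result.extend(source[src_pos:])
    return result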
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: """Populate the feed dictionary with the encoder inputs. Arguments: dataset: The dataset to use train: Boolean flag telling whether it is training time """ # pylint: disable=invalid-name fd = {} # type: FeedDict fd[self.train_mode] = train series = list(dataset.get_series(self.data_id)) lengths = [] inputs = [] max_len = max(x.shape[0] for x in series) if self.max_input_len is not None: max_len = min(self.max_input_len, max_len) for x in series: length = min(max_len, x.shape[0]) x_padded = np.zeros(shape=(max_len, ) + x.shape[1:], dtype=x.dtype) x_padded[:length] = x[:length] lengths.append(length) inputs.append(x_padded) fd[self.inputs] = inputs fd[self._input_lengths] = lengths return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: """Feed the placholders with the data. Arguments: dataset: The dataset. train: A flag whether the train mode is enabled. Returns: The constructed feed dictionary that contains the factor data and the mask. """ fd = {} # type: FeedDict # for checking the lengths of individual factors arr_strings = [] last_paddings = None for factor_plc, name, vocabulary in zip( self.input_factors, self.data_ids, self.vocabularies): factors = dataset.get_series(name) vectors, paddings = vocabulary.sentences_to_tensor( list(factors), self.max_length, pad_to_max_len=False, train_mode=train) fd[factor_plc] = list(zip(*vectors)) arr_strings.append(paddings.tostring()) last_paddings = paddings if len(set(arr_strings)) > 1: raise ValueError("The lenghts of factors do not match") fd[self.mask] = list(zip(*last_paddings)) return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: """Populate the feed dictionary with the encoder inputs. Encoder input placeholders: ``encoder_input``: Stores indices to the vocabulary, shape (batch, time) ``encoder_padding``: Stores the padding (ones and zeros, indicating valid words and positions after the end of sentence, shape (batch, time) ``train_mode``: Boolean scalar specifying the mode (train vs runtime) Arguments: dataset: The dataset to use train: Boolean flag telling whether it is training time """ # pylint: disable=invalid-name fd = {} # type: FeedDict fd[self.train_mode] = train sentences = dataset.get_series(self.data_id) vectors, paddings = self.vocabulary.sentences_to_tensor( list(sentences), self.max_input_len, pad_to_max_len=False, train_mode=train) # as sentences_to_tensor returns lists of shape (time, batch), # we need to transpose fd[self.inputs] = list(zip(*vectors)) fd[self.input_mask] = list(zip(*paddings)) return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = {}  # type: FeedDict

    sentences = cast(Iterable[List[str]],
                     dataset.get_series(self.data_id, allow_none=True))

    fd[self.train_mode] = train

    if sentences is not None:
        vectors, paddings = self.vocabulary.sentences_to_tensor(
            list(sentences), train_mode=train)

        # sentences_to_tensor returns time-major tensors, targets need to
        # be batch-major
        vectors = vectors.T
        paddings = paddings.T

        # Need to convert the data to a sparse representation
        bool_mask = (paddings > 0.5)
        indices = np.stack(np.where(bool_mask), axis=1)
        values = vectors[bool_mask]

        fd[self.train_targets] = tf.SparseTensorValue(
            indices=indices, values=values, dense_shape=vectors.shape)

    return fd
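# How the dense-to-sparse conversion above behaves on toy data (numpy only;
# the resulting triple is exactly what tf.SparseTensorValue is built from):
import numpy as np

vectors = np.array([[7, 8, 0],
                    [9, 0, 0]])
paddings = np.array([[1., 1., 0.],
                     [1., 0., 0.]])

bool_mask = paddings > 0.5
indices = np.stack(np.where(bool_mask), axis=1)  # [[0 0], [0 1], [1 0]]
values = vectors[bool_mask]                      # [7 8 9]
dense_shape = vectors.shape                      # (2, 3)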
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = {}  # type: FeedDict
    fd[self.train_mode] = train

    # for checking the lengths of individual factors
    arr_strings = []
    last_paddings = None

    for name, vocabulary in zip(self.data_ids, self.vocabularies):
        factors = dataset.get_series(name)
        vectors, paddings = vocabulary.sentences_to_tensor(
            list(factors), self.max_input_len, pad_to_max_len=False,
            train_mode=train)

        # pylint: disable=unsubscriptable-object
        fd[self.input_factors[name]] = list(zip(*vectors))
        # pylint: enable=unsubscriptable-object

        arr_strings.append(paddings.tostring())
        last_paddings = paddings

    if len(set(arr_strings)) > 1:
        raise ValueError("The lengths of factors do not match")

    fd[self.input_mask] = list(zip(*last_paddings))

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: """Populate the feed dictionary for the decoder object. Arguments: dataset: The dataset to use for the decoder. train: Boolean flag, telling whether this is a training run. """ sentences = cast(Iterable[List[str]], dataset.get_series(self.data_id, allow_none=True)) if sentences is None and train: raise ValueError("When training, you must feed " "reference sentences") sentences_list = list(sentences) if sentences is not None else None fd = {} # type: FeedDict fd[self.train_mode] = train go_symbol_idx = self.vocabulary.get_word_index(START_TOKEN) fd[self.go_symbols] = np.full([len(dataset)], go_symbol_idx, dtype=np.int32) if sentences is not None: # train_mode=False, since we don't want to <unk>ize target words! inputs, weights = self.vocabulary.sentences_to_tensor( sentences_list, self.max_output_len, train_mode=False, add_start_symbol=False, add_end_symbol=True, pad_to_max_len=False) fd[self.train_inputs] = inputs fd[self.train_mask] = weights return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    # if the data come from a pickled file, they are a list rather than
    # a numpy array, so convert them as a precaution
    images = np.array(list(dataset.get_series(self.data_id)))
    fd[self.image_input] = images / 255.0

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    images = np.array(dataset.get_series(self.data_id))
    assert images.shape[1:] == (self.height, self.width, 3)

    fd[self.input_image] = images

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    # if the data come from a pickled file, they are a list rather than
    # a numpy array, so convert them as a precaution
    images = np.array(dataset.get_series(self.data_id))

    f_dict = {}
    # normalize the pixel values to the [0, 1] range (the original divided
    # by 225.0, which appears to be a typo for 255.0)
    f_dict[self.image_input] = images / 255.0

    f_dict[self.train_mode] = train
    return f_dict
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    res = {}  # type: FeedDict
    res[self.image_features] = dataset.get_series(self.data_id)

    if train:
        res[self.dropout_placeholder] = self.dropout_keep_prob
    else:
        res[self.dropout_placeholder] = 1.0

    return res
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: """Populate the feed dictionary with the encoder inputs. Arguments: dataset: The dataset to use train: Boolean flag telling whether it is training time """ fd = ModelPart.feed_dict(self, dataset, train) sentences = dataset.get_series(self.data_id) fd[self.input_tokens] = pad_batch(list(sentences), self.max_input_len) return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    sentences = cast(Iterable[List[str]],
                     dataset.get_series(self.data_id, allow_none=True))

    sentences_list = list(sentences) if sentences is not None else None

    fd = {}  # type: FeedDict

    if sentences_list is not None:
        fd[self.train_inputs] = list(zip(*sentences_list))[0]

    fd[self.train_mode] = train

    return fd
def _print_examples(dataset: Dataset,
                    outputs: Dict[str, List[Any]],
                    num_examples=15) -> None:
    """Print examples of the model output."""
    log_print(colored("Examples:", attrs=['bold']))

    # for further indexing we need to make sure that all relevant
    # dataset series are lists
    target_series = {series_id: list(dataset.get_series(series_id))
                     for series_id in outputs.keys()
                     if dataset.has_series(series_id)}
    source_series = {series_id: list(dataset.get_series(series_id))
                     for series_id in dataset.series_ids
                     if series_id not in outputs}

    for i in range(min(len(dataset), num_examples)):
        log_print(colored(" [{}]".format(i + 1),
                          color='magenta', attrs=['bold']))

        def print_line(prefix, color, content):
            colored_prefix = colored(prefix, color=color)
            formatted = _data_item_to_str(content)
            log_print(" {}: {}".format(colored_prefix, formatted))

        for series_id, data in sorted(source_series.items(),
                                      key=lambda x: x[0]):
            print_line(series_id, 'yellow', data[i])

        for series_id, data in sorted(outputs.items(), key=lambda x: x[0]):
            model_output = data[i]
            print_line(series_id, 'magenta', model_output)

            if series_id in target_series:
                desired_output = target_series[series_id][i]
                print_line(series_id + " (ref)", "red", desired_output)

        log_print("")
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = {}

    alignment = dataset.get_series(self.data_id, allow_none=True)
    if alignment is None:
        if train:
            log("Warning: training alignment not present", color="red")

        alignment = np.zeros((len(dataset),
                              self.decoder.max_output_len,
                              self.encoder.max_input_len),
                             np.float32)

    fd[self.ref_alignment] = alignment

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    # if the data come from a pickled file, they are a list rather than
    # a numpy array, so convert them as a precaution
    images = np.array(list(dataset.get_series(self.data_id)))
    fd[self.image_input] = images / 255.0

    # the image mask is one everywhere where the image is non-zero, i.e.
    # zero pixels are masked out
    fd[self.image_mask] = np.sign(np.sum(images, axis=3, keepdims=True))

    return fd
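# A toy check of the mask computation above: summing over the channel axis
# and taking the sign yields 1 for every non-black pixel and 0 elsewhere.
import numpy as np

images = np.zeros((1, 2, 2, 3))
images[0, 0, 0] = [10, 20, 30]  # a single non-zero pixel

mask = np.sign(np.sum(images, axis=3, keepdims=True))
# mask[0, :, :, 0] == [[1., 0.], [0., 0.]]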
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = {}

    alignment = dataset.get_series(self.data_id, allow_none=True)
    if alignment is None:
        if train:
            warn("Training alignment not present!")

        alignment = np.zeros((len(dataset),
                              self.decoder.max_output_len,
                              self.encoder.input_sequence.max_length),
                             np.float32)

    fd[self.ref_alignment] = alignment

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = {}  # type: FeedDict

    sentences = cast(Iterable[List[str]],
                     dataset.get_series(self.data_id, allow_none=True))

    fd[self.train_mode] = train

    if sentences is not None:
        vectors, paddings = self.vocabulary.sentences_to_tensor(
            list(sentences), pad_to_max_len=False, train_mode=train)

        fd[self.train_targets] = vectors.T
        fd[self.train_weights] = paddings.T

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    # if the data come from a pickled file, they are a list rather than
    # a numpy array, so convert them as a precaution
    images = np.array(dataset.get_series(self.data_id))

    f_dict = {}
    # normalize the pixel values to the [0, 1] range (the original divided
    # by 225.0, which appears to be a typo for 255.0)
    f_dict[self.input_op] = images / 255.0

    # the mask is one everywhere where the image is non-zero, i.e. zero
    # columns are masked out; note the sign must be taken after the sum
    # over channels, otherwise the mask takes values up to 3
    f_dict[self.padding_masks] = np.sign(
        np.sum(images, axis=3, keepdims=True))

    if train:
        f_dict[self.dropout_placeholder] = self.dropout_keep_prob
    else:
        f_dict[self.dropout_placeholder] = 1.0

    f_dict[self.is_training] = train

    return f_dict
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    sentences = cast(Iterable[List[str]],
                     dataset.get_series(self.data_id, allow_none=True))

    sentences_list = list(sentences) if sentences is not None else None

    fd = {}  # type: FeedDict

    # guard against a missing series; the original called
    # sentences_to_tensor even when sentences_list was None
    if sentences_list is not None:
        label_tensors, _ = self.vocabulary.sentences_to_tensor(
            sentences_list, self.max_output_len)
        fd[self.gt_inputs[0]] = label_tensors[0]

    if train:
        fd[self.dropout_placeholder] = self.dropout_keep_prob
    else:
        fd[self.dropout_placeholder] = 1.0

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    sentences = cast(Iterable[List[str]],
                     dataset.get_series(self.data_id, allow_none=True))

    sentences_list = list(sentences) if sentences is not None else None

    fd = {}  # type: FeedDict

    if sentences is not None:
        label_tensors, _ = self.vocabulary.sentences_to_tensor(
            sentences_list, self.max_output_len)

        # pylint: disable=unsubscriptable-object
        fd[self.gt_inputs[0]] = label_tensors[0]
        # pylint: enable=unsubscriptable-object

    fd[self.train_mode] = train

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: """Populate the feed dictionary with the encoder inputs. Arguments: dataset: The dataset to use train: Boolean flag telling whether it is training time """ fd = ModelPart.feed_dict(self, dataset, train) sentences = dataset.get_series(self.data_id) vectors, _ = self.vocabulary.sentences_to_tensor( list(sentences), self.max_input_len, pad_to_max_len=False, train_mode=train) # as sentences_to_tensor returns lists of shape (time, batch), # we need to transpose fd[self.inputs] = list(zip(*vectors)) return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: """Feed the placholders with the data. Arguments: dataset: The dataset. train: A flag whether the train mode is enabled. Returns: The constructed feed dictionary that contains the factor data and the mask. """ fd = ModelPart.feed_dict(self, dataset, train) # for checking the lengths of individual factors for factor_plc, name in zip(self.input_factors, self.data_ids): sentences = dataset.get_series(name) fd[factor_plc] = pad_batch( list(sentences), self.max_length, self.add_start_symbol, self.add_end_symbol) return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: """Feed the placholders with the data. Arguments: dataset: The dataset. train: A flag whether the train mode is enabled. Returns: The constructed feed dictionary that contains the factor data and the mask. """ fd = ModelPart.feed_dict(self, dataset, train) # for checking the lengths of individual factors for factor_plc, name in zip(self.input_factors, self.data_ids): sentences = dataset.get_series(name) fd[factor_plc] = pad_batch(list(sentences), self.max_length, self.add_start_symbol, self.add_end_symbol) return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    series = list(dataset.get_series(self.data_id))
    lengths = []
    inputs = []

    max_len = max(x.shape[0] for x in series)
    if self.max_input_len is not None:
        max_len = min(self.max_input_len, max_len)

    for x in series:
        length = min(max_len, x.shape[0])
        x_padded = np.zeros(
            shape=(max_len,) + x.shape[1:],
            dtype=x.dtype)
        x_padded[:length] = x[:length]

        lengths.append(length)
        inputs.append(x_padded)

    fd[self.temporal_states] = inputs
    fd[self._input_lengths] = lengths

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = {}  # type: FeedDict

    sentences = cast(Iterable[List[str]],
                     dataset.get_series(self.data_id, allow_none=True))

    if sentences is not None:
        # the None check is already done, so the list conversion is safe
        sentences_list = list(sentences)
        inputs, weights = self.vocabulary.sentences_to_tensor(
            sentences_list, self.max_output_len)

        assert len(weights) == len(self.train_weights)
        assert len(inputs) == len(self.train_targets)

        for placeholder, weight in zip(self.train_weights, weights):
            fd[placeholder] = weight

        for placeholder, tensor in zip(self.train_targets, inputs):
            fd[placeholder] = tensor

    if not train:
        fd[self.dropout_placeholder] = 1.0

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)
    fd[self.vector] = dataset.get_series(self.data_id)
    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    res = {}  # type: FeedDict
    res[self.image_features] = dataset.get_series(self.data_id)
    return res
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    return {self.vector: dataset.get_series(self.data_id)}
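# A minimal self-contained illustration of the placeholder/feed-dict pattern
# that all of these methods serve (TF 1.x API; the placeholder is a stand-in
# for e.g. self.vector above, not part of any class in this listing):
import numpy as np
import tensorflow as tf

vector = tf.placeholder(tf.float32, shape=[None, 4])
doubled = 2 * vector

feed = {vector: np.ones((2, 4), np.float32)}  # what a feed_dict() returns
with tf.Session() as sess:
    print(sess.run(doubled, feed_dict=feed))  # a (2, 4) array of 2.0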
def __call__(self, dataset: Dataset) -> Iterable[List[str]]:
    source_series = dataset.get_series(self._source_id)
    target_series = dataset.get_series(self._target_id)

    for src_seq, tgt_seq in zip(source_series, target_series):
        yield convert_to_edits(src_seq, tgt_seq)
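# A sketch of what `convert_to_edits` may compute, mirroring the
# `reconstruct` sketch earlier (difflib-based; the actual implementation may
# differ, and the edit tokens are the same assumption as before):
from difflib import SequenceMatcher

KEEP = "<keep>"      # same assumed markers as in the reconstruct sketch
DELETE = "<delete>"


def convert_to_edits(source, target):
    edits = []
    for tag, i1, i2, j1, j2 in SequenceMatcher(
            a=source, b=target).get_opcodes():
        if tag == "equal":
            edits.extend([KEEP] * (i2 - i1))
        elif tag == "delete":
            edits.extend([DELETE] * (i2 - i1))
        elif tag == "insert":
            edits.extend(target[j1:j2])
        else:  # "replace": delete the old words, insert the new ones
            edits.extend([DELETE] * (i2 - i1))
            edits.extend(target[j1:j2])
    return edits

# Round-trip with the reconstruct sketch:
# reconstruct(src, convert_to_edits(src, tgt)) == tgt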
def _print_examples(dataset: Dataset,
                    outputs: Dict[str, List[Any]],
                    val_preview_input_series: Optional[List[str]] = None,
                    val_preview_output_series: Optional[List[str]] = None,
                    num_examples=15) -> None:
    """Print examples of the model output.

    Arguments:
        dataset: The dataset from which to take examples
        outputs: A mapping from the output series ID to the list of its
            contents
        val_preview_input_series: An optional list of input series to
            include in the preview. An input series is a data series that
            is present in the dataset. It can be either a target series
            (one that is also present in the outputs, i.e. reference), or
            a source series (one that is not among the outputs). In the
            validation preview, source input series and preprocessed
            target series are yellow and target (reference) series are
            red. If None, all series are written.
        val_preview_output_series: An optional list of output series to
            include in the preview. An output series is a data series
            that is present among the outputs. In the preview, magenta is
            used as the font color for output series
    """
    log_print(colored("Examples:", attrs=["bold"]))

    source_series_names = [s for s in dataset.series_ids
                           if s not in outputs]
    target_series_names = [s for s in dataset.series_ids if s in outputs]
    output_series_names = list(outputs.keys())

    assert outputs

    if val_preview_input_series is not None:
        target_series_names = [s for s in target_series_names
                               if s in val_preview_input_series]
        source_series_names = [s for s in source_series_names
                               if s in val_preview_input_series]

    if val_preview_output_series is not None:
        output_series_names = [s for s in output_series_names
                               if s in val_preview_output_series]

    # for further indexing we need to make sure that all relevant
    # dataset series are lists
    target_series = {series_id: list(dataset.get_series(series_id))
                     for series_id in target_series_names}
    source_series = {series_id: list(dataset.get_series(series_id))
                     for series_id in source_series_names}

    if not isinstance(dataset, LazyDataset):
        num_examples = min(len(dataset), num_examples)

    for i in range(num_examples):
        log_print(colored("  [{}]".format(i + 1),
                          color="magenta", attrs=["bold"]))

        def print_line(prefix, color, content):
            colored_prefix = colored(prefix, color=color)
            formatted = _data_item_to_str(content)
            log_print("  {}: {}".format(colored_prefix, formatted))

        # Input source series = yellow
        for series_id, data in sorted(source_series.items(),
                                      key=lambda x: x[0]):
            print_line(series_id, "yellow", data[i])

        # Output series = magenta
        for series_id in sorted(output_series_names):
            data = list(outputs[series_id])
            model_output = data[i]
            print_line(series_id, "magenta", model_output)

        # Input target series (a.k.a. references) = red
        for series_id in sorted(target_series_names):
            desired_output = target_series[series_id][i]
            print_line(series_id + " (ref)", "red", desired_output)

        log_print("")
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)
    fd[self.spatial_input] = list(dataset.get_series(self.data_id))
    return fd