Example #1
    def __call__(
            self, dataset: Dataset,
            generated_series: Dict[str, Iterable[Any]]) -> Iterable[List[str]]:

        source_series = generated_series.get(
            self._source_id, dataset.get_series(self._source_id))
        edits_series = generated_series.get(self._edits_id,
                                            dataset.get_series(self._edits_id))

        for src_seq, edit_seq in zip(source_series, edits_series):
            yield reconstruct(src_seq, edit_seq)
Example #2
    def _do_postprocess(
            self, dataset: Dataset,
            generated_series: Dict[str, Iterable[Any]]) -> Iterable[List[str]]:

        source_series = generated_series.get(self._source_id)
        if source_series is None:
            source_series = dataset.get_series(self._source_id)
        edits_series = generated_series.get(self._edits_id)
        if edits_series is None:
            edits_series = dataset.get_series(self._edits_id)

        for src_seq, edit_seq in zip(source_series, edits_series):
            reconstructed = reconstruct(src_seq, edit_seq)
            yield reconstructed
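
Examples #1 and #2 are two formulations of the same lookup-with-fallback pattern: prefer a series from the generated outputs, fall back to the dataset, then rebuild each target sentence from a source sentence and a sequence of edit operations. Note that the dict.get(key, fallback) form in Example #1 evaluates the fallback eagerly, so dataset.get_series is called even when the generated series is present; the explicit None check in Example #2 avoids that. As a reference point, here is a minimal, hypothetical sketch of what such a reconstruct could look like, assuming a scheme with <keep> and <delete> operation tokens (the library's actual implementation may differ):

from typing import List

KEEP = "<keep>"      # assumed operation token: copy the next source token
DELETE = "<delete>"  # assumed operation token: skip the next source token

def reconstruct(source: List[str], edits: List[str]) -> List[str]:
    """Rebuild a target sequence from a source sequence and edit ops."""
    target = []
    src_pos = 0
    for op in edits:
        if op == KEEP:
            if src_pos < len(source):
                target.append(source[src_pos])
            src_pos += 1
        elif op == DELETE:
            src_pos += 1
        else:
            # any other token is inserted into the target verbatim
            target.append(op)
    return target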
Example #3
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Populate the feed dictionary with the encoder inputs.

        Arguments:
            dataset: The dataset to use
            train: Boolean flag telling whether it is training time
        """
        # pylint: disable=invalid-name
        fd = {}  # type: FeedDict
        fd[self.train_mode] = train

        series = list(dataset.get_series(self.data_id))
        lengths = []
        inputs = []

        max_len = max(x.shape[0] for x in series)
        if self.max_input_len is not None:
            max_len = min(self.max_input_len, max_len)

        for x in series:
            length = min(max_len, x.shape[0])
            x_padded = np.zeros(shape=(max_len, ) + x.shape[1:], dtype=x.dtype)
            x_padded[:length] = x[:length]

            lengths.append(length)
            inputs.append(x_padded)

        fd[self.inputs] = inputs
        fd[self._input_lengths] = lengths

        return fd
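
The loop above zero-pads (or truncates) every sequence in the batch along the time axis to a common length and records the clipped lengths. Isolated as a stand-alone helper (the names are illustrative, not part of the library):

from typing import List, Optional, Tuple

import numpy as np

def pad_temporal(series: List[np.ndarray],
                 max_input_len: Optional[int] = None
                 ) -> Tuple[List[np.ndarray], List[int]]:
    """Zero-pad arrays along axis 0 to a common length; return lengths."""
    max_len = max(x.shape[0] for x in series)
    if max_input_len is not None:
        max_len = min(max_input_len, max_len)

    inputs, lengths = [], []
    for x in series:
        length = min(max_len, x.shape[0])
        x_padded = np.zeros((max_len,) + x.shape[1:], dtype=x.dtype)
        x_padded[:length] = x[:length]
        lengths.append(length)
        inputs.append(x_padded)
    return inputs, lengths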
Example #4
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Feed the placholders with the data.

        Arguments:
            dataset: The dataset.
            train: A flag whether the train mode is enabled.

        Returns:
            The constructed feed dictionary that contains the factor data and
            the mask.
        """
        fd = {}  # type: FeedDict

        # for checking the lengths of individual factors
        arr_strings = []
        last_paddings = None

        for factor_plc, name, vocabulary in zip(
                self.input_factors, self.data_ids, self.vocabularies):
            factors = dataset.get_series(name)
            vectors, paddings = vocabulary.sentences_to_tensor(
                list(factors), self.max_length, pad_to_max_len=False,
                train_mode=train)

            fd[factor_plc] = list(zip(*vectors))

            arr_strings.append(paddings.tobytes())
            last_paddings = paddings

        if len(set(arr_strings)) > 1:
            raise ValueError("The lenghts of factors do not match")

        fd[self.mask] = list(zip(*last_paddings))

        return fd
Example #5
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Populate the feed dictionary with the encoder inputs.

        Encoder input placeholders:
            ``encoder_input``: Stores indices to the vocabulary,
                shape (batch, time)
            ``encoder_padding``: Stores the padding (ones and zeros,
                indicating valid words and positions after the end
                of sentence), shape (batch, time)
            ``train_mode``: Boolean scalar specifying the mode (train
                vs runtime)

        Arguments:
            dataset: The dataset to use
            train: Boolean flag telling whether it is training time
        """
        # pylint: disable=invalid-name
        fd = {}  # type: FeedDict
        fd[self.train_mode] = train
        sentences = dataset.get_series(self.data_id)

        vectors, paddings = self.vocabulary.sentences_to_tensor(
            list(sentences),
            self.max_input_len,
            pad_to_max_len=False,
            train_mode=train)

        # as sentences_to_tensor returns lists of shape (time, batch),
        # we need to transpose
        fd[self.inputs] = list(zip(*vectors))
        fd[self.input_mask] = list(zip(*paddings))

        return fd
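
The list(zip(*vectors)) idiom is a plain-Python transpose: sentences_to_tensor returns time-major data of shape (time, batch), while the placeholders expect batch-major (batch, time). A quick illustration:

time_major = [[1, 4], [2, 5], [3, 6]]  # shape (time=3, batch=2)
batch_major = list(zip(*time_major))   # shape (batch=2, time=3)
assert batch_major == [(1, 2, 3), (4, 5, 6)]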
Example #6
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = {}  # type: FeedDict

        sentences = cast(Iterable[List[str]],
                         dataset.get_series(self.data_id, allow_none=True))

        fd[self.train_mode] = train

        if sentences is not None:
            vectors, paddings = self.vocabulary.sentences_to_tensor(
                list(sentences), train_mode=train)

            # sentences_to_tensor returns time-major tensors, targets need to
            # be batch-major
            vectors = vectors.T
            paddings = paddings.T

            # Need to convert the data to a sparse representation
            bool_mask = (paddings > 0.5)
            indices = np.stack(np.where(bool_mask), axis=1)
            values = vectors[bool_mask]

            fd[self.train_targets] = tf.SparseTensorValue(
                indices=indices, values=values, dense_shape=vectors.shape)

        return fd
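
The sparse conversion above keeps only positions whose padding weight is non-zero: np.where on the boolean mask yields the (row, column) coordinates, and fancy indexing picks out the corresponding token ids. The same numpy pattern in isolation, without TensorFlow:

import numpy as np

vectors = np.array([[7, 8, 0],
                    [9, 0, 0]])      # batch-major token ids
paddings = np.array([[1., 1., 0.],
                     [1., 0., 0.]])  # 1 = real token, 0 = padding

bool_mask = paddings > 0.5
indices = np.stack(np.where(bool_mask), axis=1)  # (n_nonzero, 2) coords
values = vectors[bool_mask]                      # the kept token ids

assert indices.tolist() == [[0, 0], [0, 1], [1, 0]]
assert values.tolist() == [7, 8, 9]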
Example #7
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = {}  # type: FeedDict
        fd[self.train_mode] = train

        # for checking the lengths of individual factors
        arr_strings = []
        last_paddings = None

        for name, vocabulary in zip(self.data_ids, self.vocabularies):
            factors = dataset.get_series(name)
            vectors, paddings = vocabulary.sentences_to_tensor(
                list(factors), self.max_input_len, pad_to_max_len=False,
                train_mode=train)

            # pylint: disable=unsubscriptable-object
            fd[self.input_factors[name]] = list(zip(*vectors))
            # pylint: enable=unsubscriptable-object

            arr_strings.append(paddings.tobytes())
            last_paddings = paddings

        if len(set(arr_strings)) > 1:
            raise ValueError("The lenghts of factors do not match")

        fd[self.input_mask] = list(zip(*last_paddings))

        return fd
Example #8
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Populate the feed dictionary for the decoder object.

        Arguments:
            dataset: The dataset to use for the decoder.
            train: Boolean flag, telling whether this is a training run.
        """
        sentences = cast(Iterable[List[str]],
                         dataset.get_series(self.data_id, allow_none=True))

        if sentences is None and train:
            raise ValueError("When training, you must feed "
                             "reference sentences")

        sentences_list = list(sentences) if sentences is not None else None

        fd = {}  # type: FeedDict
        fd[self.train_mode] = train

        go_symbol_idx = self.vocabulary.get_word_index(START_TOKEN)
        fd[self.go_symbols] = np.full([len(dataset)], go_symbol_idx,
                                      dtype=np.int32)

        if sentences is not None:
            # train_mode=False, since we don't want to <unk>ize target words!
            inputs, weights = self.vocabulary.sentences_to_tensor(
                sentences_list, self.max_output_len, train_mode=False,
                add_start_symbol=False, add_end_symbol=True,
                pad_to_max_len=False)

            fd[self.train_inputs] = inputs
            fd[self.train_mask] = weights

        return fd
Example #9
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        # if it is from the pickled file, it is a list, not a numpy
        # tensor, so convert it as a precaution
        images = np.array(list(dataset.get_series(self.data_id)))
        fd[self.image_input] = images / 255.0
        return fd
Example #10
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        images = np.array(dataset.get_series(self.data_id))
        assert images.shape[1:] == (self.height, self.width, 3)
        fd[self.input_image] = images

        return fd
Example #11
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        # if it is from the pickled file, it is a list, not a numpy
        # tensor, so convert it as a precaution
        images = np.array(list(dataset.get_series(self.data_id)))
        fd[self.image_input] = images / 255.0
        return fd
Example #12
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        # if it is from the pickled file, it is a list, not a numpy
        # tensor, so convert it as a precaution
        images = np.array(dataset.get_series(self.data_id))

        f_dict = {}
        f_dict[self.image_input] = images / 255.0

        f_dict[self.train_mode] = train
        return f_dict
Example #13
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        res = {}  # type: FeedDict
        res[self.image_features] = dataset.get_series(self.data_id)

        if train:
            res[self.dropout_placeholder] = self.dropout_keep_prob
        else:
            res[self.dropout_placeholder] = 1.0

        return res
Example #14
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Populate the feed dictionary with the encoder inputs.

        Arguments:
            dataset: The dataset to use
            train: Boolean flag telling whether it is training time
        """
        fd = ModelPart.feed_dict(self, dataset, train)
        sentences = dataset.get_series(self.data_id)
        fd[self.input_tokens] = pad_batch(list(sentences), self.max_input_len)
        return fd
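
pad_batch is used here as a black box. A hypothetical sketch of what such a helper might do, assuming a "<pad>" filler token (the real helper's signature and tokens may differ; note that the calls in Examples #25 and #26 also pass start/end-symbol flags):

from typing import List, Optional

PAD_TOKEN = "<pad>"  # assumed filler token

def pad_batch(sentences: List[List[str]],
              max_length: Optional[int] = None) -> List[List[str]]:
    """Trim sentences to max_length, then pad them to a common length."""
    if max_length is not None:
        sentences = [sent[:max_length] for sent in sentences]
    batch_max = max(len(sent) for sent in sentences)
    return [sent + [PAD_TOKEN] * (batch_max - len(sent))
            for sent in sentences]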
Example #15
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        sentences = cast(Iterable[List[str]],
                         dataset.get_series(self.data_id, allow_none=True))

        sentences_list = list(sentences) if sentences is not None else None

        fd = {}  # type: FeedDict
        if sentences_list is not None:
            fd[self.train_inputs] = list(zip(*sentences_list))[0]

        fd[self.train_mode] = train

        return fd
Example #16
def _print_examples(dataset: Dataset,
                    outputs: Dict[str, List[Any]],
                    num_examples: int = 15) -> None:
    """Print examples of the model output."""
    log_print(colored("Examples:", attrs=['bold']))

    # for further indexing we need to make sure all relevant
    # dataset series are lists
    target_series = {
        series_id: list(dataset.get_series(series_id))
        for series_id in outputs.keys() if dataset.has_series(series_id)
    }
    source_series = {
        series_id: list(dataset.get_series(series_id))
        for series_id in dataset.series_ids if series_id not in outputs
    }

    for i in range(min(len(dataset), num_examples)):
        log_print(
            colored("  [{}]".format(i + 1), color='magenta', attrs=['bold']))

        def print_line(prefix, color, content):
            colored_prefix = colored(prefix, color=color)
            formatted = _data_item_to_str(content)
            log_print("  {}: {}".format(colored_prefix, formatted))

        for series_id, data in sorted(source_series.items(),
                                      key=lambda x: x[0]):
            print_line(series_id, 'yellow', data[i])

        for series_id, data in sorted(outputs.items(), key=lambda x: x[0]):
            model_output = data[i]
            print_line(series_id, 'magenta', model_output)

            if series_id in target_series:
                desired_output = target_series[series_id][i]
                print_line(series_id + " (ref)", "red", desired_output)
        log_print("")
Example #17
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = {}

        alignment = dataset.get_series(self.data_id, allow_none=True)
        if alignment is None:
            if train:
                log("Warning: training alignment not present", color="red")

            alignment = np.zeros((len(dataset), self.decoder.max_output_len,
                                  self.encoder.max_input_len), np.float32)

        fd[self.ref_alignment] = alignment

        return fd
Example #18
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        # if it is from the pickled file, it is a list, not a numpy
        # tensor, so convert it as a precaution
        images = np.array(list(dataset.get_series(self.data_id)))

        fd[self.image_input] = images / 255.0

        # the image mask is one everywhere where the image is non-zero, i.e.
        # zero pixels are masked out
        fd[self.image_mask] = np.sign(np.sum(images, axis=3, keepdims=True))

        return fd
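
The mask line above works because summing a non-negative image over its channel axis and taking the sign gives 1 wherever any channel is lit and 0 for all-zero (padding) pixels. In isolation:

import numpy as np

# one 2x2 RGB "image"; the bottom row is zero padding
images = np.zeros((1, 2, 2, 3), dtype=np.float32)
images[0, 0] = 1.0  # only the top row has content

mask = np.sign(np.sum(images, axis=3, keepdims=True))
assert mask.shape == (1, 2, 2, 1)
assert mask[0, 0, 0, 0] == 1.0 and mask[0, 1, 0, 0] == 0.0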
Example #19
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = {}

        alignment = dataset.get_series(self.data_id, allow_none=True)
        if alignment is None:
            if train:
                warn("Training alignment not present!")

            alignment = np.zeros((len(dataset), self.decoder.max_output_len,
                                  self.encoder.input_sequence.max_length),
                                 np.float32)

        fd[self.ref_alignment] = alignment

        return fd
Example #20
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = {}  # type: FeedDict

        sentences = cast(Iterable[List[str]],
                         dataset.get_series(self.data_id, allow_none=True))

        fd[self.train_mode] = train

        if sentences is not None:
            vectors, paddings = self.vocabulary.sentences_to_tensor(
                list(sentences), pad_to_max_len=False, train_mode=train)

            fd[self.train_targets] = vectors.T
            fd[self.train_weights] = paddings.T

        return fd
Example #21
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        # if it is from the pickled file, it is a list, not a numpy
        # tensor, so convert it as a precaution
        images = np.array(dataset.get_series(self.data_id))

        f_dict = {}
        f_dict[self.input_op] = images / 255.0

        # the mask is one wherever the image is non-zero, i.e. zero
        # columns are masked out
        f_dict[self.padding_masks] = \
            np.sign(np.sum(images, axis=3, keepdims=True))

        if train:
            f_dict[self.dropout_placeholder] = self.dropout_keep_prob
        else:
            f_dict[self.dropout_placeholder] = 1.0
        f_dict[self.is_training] = train
        return f_dict
Example #22
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        sentences = cast(Iterable[List[str]],
                         dataset.get_series(self.data_id, allow_none=True))

        sentences_list = list(sentences) if sentences is not None else None

        fd = {}  # type: FeedDict

        if sentences_list is not None:
            label_tensors, _ = self.vocabulary.sentences_to_tensor(
                sentences_list, self.max_output_len)

            fd[self.gt_inputs[0]] = label_tensors[0]

        if train:
            fd[self.dropout_placeholder] = self.dropout_keep_prob
        else:
            fd[self.dropout_placeholder] = 1.0

        return fd
Example #23
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        sentences = cast(Iterable[List[str]],
                         dataset.get_series(self.data_id, allow_none=True))

        sentences_list = list(sentences) if sentences is not None else None

        fd = {}  # type: FeedDict

        if sentences is not None:
            label_tensors, _ = self.vocabulary.sentences_to_tensor(
                sentences_list, self.max_output_len)

            # pylint: disable=unsubscriptable-object
            fd[self.gt_inputs[0]] = label_tensors[0]
            # pylint: enable=unsubscriptable-object

        fd[self.train_mode] = train

        return fd
Example #24
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Populate the feed dictionary with the encoder inputs.

        Arguments:
            dataset: The dataset to use
            train: Boolean flag telling whether it is training time
        """
        fd = ModelPart.feed_dict(self, dataset, train)

        sentences = dataset.get_series(self.data_id)
        vectors, _ = self.vocabulary.sentences_to_tensor(
            list(sentences), self.max_input_len, pad_to_max_len=False,
            train_mode=train)

        # as sentences_to_tensor returns lists of shape (time, batch),
        # we need to transpose
        fd[self.inputs] = list(zip(*vectors))

        return fd
Example #25
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Feed the placholders with the data.

        Arguments:
            dataset: The dataset.
            train: A flag whether the train mode is enabled.

        Returns:
            The constructed feed dictionary that contains the factor data and
            the mask.
        """
        fd = ModelPart.feed_dict(self, dataset, train)

        # feed each factor series into its placeholder
        for factor_plc, name in zip(self.input_factors, self.data_ids):
            sentences = dataset.get_series(name)
            fd[factor_plc] = pad_batch(
                list(sentences), self.max_length, self.add_start_symbol,
                self.add_end_symbol)

        return fd
Example #26
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Feed the placholders with the data.

        Arguments:
            dataset: The dataset.
            train: A flag whether the train mode is enabled.

        Returns:
            The constructed feed dictionary that contains the factor data and
            the mask.
        """
        fd = ModelPart.feed_dict(self, dataset, train)

        # feed each factor series into its placeholder
        for factor_plc, name in zip(self.input_factors, self.data_ids):
            sentences = dataset.get_series(name)
            fd[factor_plc] = pad_batch(list(sentences), self.max_length,
                                       self.add_start_symbol,
                                       self.add_end_symbol)

        return fd
Example #27
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        series = list(dataset.get_series(self.data_id))
        lengths = []
        inputs = []

        max_len = max(x.shape[0] for x in series)
        if self.max_input_len is not None:
            max_len = min(self.max_input_len, max_len)

        for x in series:
            length = min(max_len, x.shape[0])
            x_padded = np.zeros(shape=(max_len, ) + x.shape[1:], dtype=x.dtype)
            x_padded[:length] = x[:length]

            lengths.append(length)
            inputs.append(x_padded)

        fd[self.temporal_states] = inputs
        fd[self._input_lengths] = lengths

        return fd
Example #28
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = {}  # type: FeedDict

        sentences = cast(Iterable[List[str]],
                         dataset.get_series(self.data_id, allow_none=True))

        if sentences is not None:
            sentences_list = list(sentences)
            inputs, weights = self.vocabulary.sentences_to_tensor(
                sentences_list, self.max_output_len)

            assert len(weights) == len(self.train_weights)
            assert len(inputs) == len(self.train_targets)

            for placeholder, weight in zip(self.train_weights, weights):
                fd[placeholder] = weight

            for placeholder, tensor in zip(self.train_targets, inputs):
                fd[placeholder] = tensor

        if not train:
            fd[self.dropout_placeholder] = 1.0

        return fd
Example #29
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        series = list(dataset.get_series(self.data_id))
        lengths = []
        inputs = []

        max_len = max(x.shape[0] for x in series)
        if self.max_input_len is not None:
            max_len = min(self.max_input_len, max_len)

        for x in series:
            length = min(max_len, x.shape[0])
            x_padded = np.zeros(shape=(max_len,) + x.shape[1:],
                                dtype=x.dtype)
            x_padded[:length] = x[:length]

            lengths.append(length)
            inputs.append(x_padded)

        fd[self.temporal_states] = inputs
        fd[self._input_lengths] = lengths

        return fd
Example #30
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)
        fd[self.vector] = dataset.get_series(self.data_id)
        return fd
Example #31
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        res = {}  # type: FeedDict
        res[self.image_features] = dataset.get_series(self.data_id)

        return res
Example #32
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        return {self.vector: dataset.get_series(self.data_id)}
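
Every feed_dict method in these examples ultimately produces a plain mapping from TensorFlow 1.x placeholders to values, which is then handed to Session.run. A minimal self-contained sketch of that contract (the placeholder name is illustrative):

import tensorflow as tf  # TensorFlow 1.x API, as in the examples above

vector = tf.placeholder(tf.float32, shape=[None, 3], name="vector")
doubled = 2.0 * vector

fd = {vector: [[1.0, 2.0, 3.0]]}  # the kind of dict feed_dict() returns

with tf.Session() as sess:
    print(sess.run(doubled, feed_dict=fd))  # [[2. 4. 6.]]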
Example #33
    def __call__(self, dataset: Dataset) -> Iterable[List[str]]:
        source_series = dataset.get_series(self._source_id)
        target_series = dataset.get_series(self._target_id)

        for src_seq, tgt_seq in zip(source_series, target_series):
            yield convert_to_edits(src_seq, tgt_seq)
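
This is the inverse of the reconstruct postprocessor from Examples #1 and #2: it derives an edit sequence from aligned source/target pairs. A naive, purely illustrative way to compute such a sequence with difflib, matching the hypothetical <keep>/<delete> scheme sketched earlier (the library's actual algorithm may differ):

from difflib import SequenceMatcher
from typing import List

KEEP, DELETE = "<keep>", "<delete>"  # assumed operation tokens

def convert_to_edits(source: List[str], target: List[str]) -> List[str]:
    """Derive an edit-operation sequence turning source into target."""
    edits = []
    matcher = SequenceMatcher(a=source, b=target)
    for op, i1, i2, j1, j2 in matcher.get_opcodes():
        if op == "equal":
            edits.extend([KEEP] * (i2 - i1))
        else:  # "replace", "delete" or "insert"
            edits.extend([DELETE] * (i2 - i1))
            edits.extend(target[j1:j2])
    return edits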
Example #34
def _print_examples(dataset: Dataset,
                    outputs: Dict[str, List[Any]],
                    val_preview_input_series: Optional[List[str]] = None,
                    val_preview_output_series: Optional[List[str]] = None,
                    num_examples: int = 15) -> None:
    """Print examples of the model output.

    Arguments:
        dataset: The dataset from which to take examples
        outputs: A mapping from the output series ID to the list of its
            contents
        val_preview_input_series: An optional list of input series to include
            in the preview. An input series is a data series that is present in
            the dataset. It can be either a target series (one that is also
            present in the outputs, i.e. reference), or a source series (one
            that is not among the outputs). In the validation preview, source
            input series and preprocessed target series are yellow and target
            (reference) series are red. If None, all series are written.
        val_preview_output_series: An optional list of output series to include
            in the preview. An output series is a data series that is present
            among the outputs. In the preview, magenta is used as the font
            color for output series
    """
    log_print(colored("Examples:", attrs=["bold"]))

    source_series_names = [s for s in dataset.series_ids if s not in outputs]
    target_series_names = [s for s in dataset.series_ids if s in outputs]
    output_series_names = list(outputs.keys())

    assert outputs

    if val_preview_input_series is not None:
        target_series_names = [
            s for s in target_series_names if s in val_preview_input_series
        ]
        source_series_names = [
            s for s in source_series_names if s in val_preview_input_series
        ]

    if val_preview_output_series is not None:
        output_series_names = [
            s for s in output_series_names if s in val_preview_output_series
        ]

    # for further indexing we need to make sure all relevant
    # dataset series are lists
    target_series = {
        series_id: list(dataset.get_series(series_id))
        for series_id in target_series_names
    }
    source_series = {
        series_id: list(dataset.get_series(series_id))
        for series_id in source_series_names
    }

    if not isinstance(dataset, LazyDataset):
        num_examples = min(len(dataset), num_examples)

    for i in range(num_examples):
        log_print(
            colored("  [{}]".format(i + 1), color="magenta", attrs=["bold"]))

        def print_line(prefix, color, content):
            colored_prefix = colored(prefix, color=color)
            formatted = _data_item_to_str(content)
            log_print("  {}: {}".format(colored_prefix, formatted))

        # Input source series = yellow
        for series_id, data in sorted(source_series.items(),
                                      key=lambda x: x[0]):
            print_line(series_id, "yellow", data[i])

        # Output series = magenta
        for series_id in sorted(output_series_names):
            data = list(outputs[series_id])
            model_output = data[i]
            print_line(series_id, "magenta", model_output)

        # Input target series (a.k.a. references) = red
        for series_id in sorted(target_series_names):
            desired_output = target_series[series_id][i]
            print_line(series_id + " (ref)", "red", desired_output)

        log_print("")
Example #35
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)
        fd[self.spatial_input] = list(dataset.get_series(self.data_id))
        return fd