Example #1
    def test_buckets_similar_size(self):
        # testing dataset is 3 x 6 sequences of lengths 0 - 5
        iterators = {
            "sentences":
            lambda: [["word" for _ in range(l)] for l in range(6)] * 3
        }

        dataset = Dataset("dataset", iterators=iterators, shuffled=True)

        # we use batch size 6 and bucket span 2
        scheme = BatchingScheme(6, 2, False, None)

        # process the dataset once and record what the batches look like
        batches = []
        for batch in dataset.batches(scheme):
            batches.append(list(batch.get_series("sentences")))

        # this setup should divide the data into 3 batches
        self.assertEqual(len(batches), 3)

        for batch in batches:
            # each batch should contain 6 values
            self.assertEqual(len(batch), 6)

            lengths = set(len(b) for b in batch)

            # the values in the batch should have two lengths
            self.assertEqual(len(lengths), 2)

            # the lengths should differ by one
            self.assertEqual(max(lengths) - min(lengths), 1)
Example #2
    def test_batching_lazy_shuffle(self):
        iterators = {"a": lambda: range(5), "b": lambda: range(5, 10)}

        dataset = Dataset("dataset",
                          iterators=iterators,
                          shuffled=True,
                          buffer_size=(3, 5))

        batches = []
        for _ in range(2):
            epoch = []
            for batch in dataset.batches(DEFAULT_BATCHING_SCHEME):
                epoch.append({s: list(batch.get_series(s)) for s in iterators})

            batches.append(epoch)

        epoch_data = []
        epoch_data.append(
            [c for batch in batches[0] for b in batch.values() for c in b])
        epoch_data.append(
            [c for batch in batches[1] for b in batch.values() for c in b])

        self.assertEqual(set(epoch_data[0]), set(range(10)))
        self.assertEqual(set(epoch_data[0]), set(epoch_data[1]))
        self.assertNotEqual(epoch_data[0], epoch_data[1])
Example #3
    def test_bucketing_no_leftovers(self):

        # testing dataset is 49 sequences of lengths 1 - 49
        iterators = {
            "sentences": lambda:
            (["word" for _ in range(l)] for l in range(1, 50))
        }

        dataset = Dataset("dataset", iterators=iterators, shuffled=False)

        # we use batch size 7 and bucket span 10
        scheme = BatchingScheme(7, 10, False, None, False)

        # process the dataset once and record what the batches look like
        batches = []
        for batch in dataset.batches(scheme):
            batches.append(list(batch.get_series("sentences")))

        ref_batches = [[["word" for _ in range(l)] for l in range(1, 8)],
                       [["word" for _ in range(l)] for l in range(10, 17)],
                       [["word" for _ in range(l)] for l in range(20, 27)],
                       [["word" for _ in range(l)] for l in range(30, 37)],
                       [["word" for _ in range(l)] for l in range(40, 47)]]

        self.assertSequenceEqual(ref_batches, batches)
Example #4
    def test_bucketing(self):

        # testing dataset is 49 sequences of lengths 1 - 49
        iterators = {
            "sentences": lambda:
            (["word" for _ in range(l)] for l in range(1, 50))
        }

        # we use bucket boundaries [9, 19, 29, 39, 49] and batch size 7
        # for every bucket
        scheme = BatchingScheme(bucket_boundaries=[9, 19, 29, 39, 49],
                                bucket_batch_sizes=[7, 7, 7, 7, 7, 7])

        dataset = Dataset("dataset",
                          iterators=iterators,
                          batching=scheme,
                          shuffled=False)

        # process the dataset once and record what the batches look like
        batches = []
        for batch in dataset.batches():
            batches.append(list(batch.get_series("sentences")))

        ref_batches = [[["word" for _ in range(l)] for l in range(1, 8)],
                       [["word" for _ in range(l)] for l in range(10, 17)],
                       [["word" for _ in range(l)] for l in range(20, 27)],
                       [["word" for _ in range(l)] for l in range(30, 37)],
                       [["word" for _ in range(l)] for l in range(40, 47)],
                       [["word" for _ in range(l)] for l in range(8, 10)],
                       [["word" for _ in range(l)] for l in range(17, 20)],
                       [["word" for _ in range(l)] for l in range(27, 30)],
                       [["word" for _ in range(l)] for l in range(37, 40)],
                       [["word" for _ in range(l)] for l in range(47, 50)]]

        self.assertSequenceEqual(ref_batches, batches)
Example #5
    def test_batching_lazy_noshuffle(self):
        iterators = {"a": lambda: range(5), "b": lambda: range(10, 15)}

        dataset = Dataset("dataset",
                          iterators=iterators,
                          shuffled=False,
                          buffer_size=(3, 5))

        batches = []
        for _ in range(2):
            epoch = []
            for batch in dataset.batches(DEFAULT_BATCHING_SCHEME):
                epoch.append({s: list(batch.get_series(s)) for s in iterators})

            batches.append(epoch)

        self.assertEqual(batches, [[{
            "a": [0, 1, 2],
            "b": [10, 11, 12]
        }, {
            "a": [3, 4],
            "b": [13, 14]
        }], [{
            "a": [0, 1, 2],
            "b": [10, 11, 12]
        }, {
            "a": [3, 4],
            "b": [13, 14]
        }]])
Example #6
    def __call__(
            self, dataset: Dataset,
            generated_series: Dict[str, Iterable[Any]]) -> Iterable[List[str]]:

        source_series = generated_series.get(
            self._source_id, dataset.get_series(self._source_id))
        edits_series = generated_series.get(self._edits_id,
                                            dataset.get_series(self._edits_id))

        for src_seq, edit_seq in zip(source_series, edits_series):
            yield reconstruct(src_seq, edit_seq)
Example #7
    def _do_postprocess(
            self, dataset: Dataset,
            generated_series: Dict[str, Iterable[Any]]) -> Iterable[List[str]]:

        source_series = generated_series.get(self._source_id)
        if source_series is None:
            source_series = dataset.get_series(self._source_id)
        edits_series = generated_series.get(self._edits_id)
        if edits_series is None:
            edits_series = dataset.get_series(self._edits_id)

        for src_seq, edit_seq in zip(source_series, edits_series):
            reconstructed = reconstruct(src_seq, edit_seq)
            yield reconstructed
Example #8
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        sentences = cast(Iterable[List[str]],
                         dataset.maybe_get_series(self.data_id))

        if sentences is None and train:
            raise ValueError("When training, you must feed "
                             "reference sentences")

        if sentences is not None:
            vectors, paddings = self.vocabulary.sentences_to_tensor(
                list(sentences), train_mode=train, max_len=self.max_length)

            # sentences_to_tensor returns time-major tensors, targets need to
            # be batch-major
            vectors = vectors.T
            paddings = paddings.T

            # Need to convert the data to a sparse representation
            bool_mask = (paddings > 0.5)
            indices = np.stack(np.where(bool_mask), axis=1)
            values = vectors[bool_mask]

            fd[self.train_targets] = tf.SparseTensorValue(
                indices=indices, values=values, dense_shape=vectors.shape)

        return fd
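
A standalone sketch (not part of the original example; the 2 x 4 batch below is
made up) showing what the mask-and-gather step above produces. The resulting
indices and values, together with the dense shape, are exactly the pieces that
tf.SparseTensorValue expects.

import numpy as np

# hypothetical batch-major token indices and the matching padding mask
vectors = np.array([[4, 7, 2, 0],
                    [5, 2, 0, 0]])        # shape (batch, time)
paddings = np.array([[1., 1., 1., 0.],
                     [1., 1., 0., 0.]])   # 1.0 marks a real token

bool_mask = paddings > 0.5                       # valid (non-padding) positions
indices = np.stack(np.where(bool_mask), axis=1)  # (row, column) of valid cells
values = vectors[bool_mask]                      # token indices at those cells

# indices == [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]], values == [4, 7, 2, 5, 2];
# these are fed as the sparse representation of the padded batch above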
Example #9
def run_on_dataset(tf_manager: TensorFlowManager,
                   runners: List[BaseRunner],
                   dataset: Dataset,
                   postprocess: Callable,
                   write_out: bool=False,
                   batch_size: Optional[int]=None) \
                                                -> Tuple[List[ExecutionResult],
                                                         Dict[str, List[Any]]]:
    """Apply the model on a dataset and optionally write outputs to files.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: A list of runners that produce the output series.
        dataset: The dataset on which the model will be executed.
        postprocess: A callable applied to the dataset and the raw output
            series, or ``None`` to keep the raw outputs.
        write_out: Flag whether the outputs should be written to the files
            defined in the dataset object.
        batch_size: Optional batch size used when executing the runners.

    Returns:
        A tuple of the list of execution results and a dictionary mapping
        each output series name to its (possibly postprocessed) data.

    """

    contains_targets = all(dataset.has_series(runner.output_series)
                           for runner in runners)

    all_results = tf_manager.execute(dataset, runners,
                                     train=contains_targets,
                                     batch_size=batch_size)

    result_data_raw = {runner.output_series: result.outputs
                       for runner, result in zip(runners, all_results)}

    if postprocess is not None:
        result_data = postprocess(dataset, result_data_raw)
    else:
        result_data = result_data_raw

    if write_out:
        for series_id, data in result_data.items():
            if series_id in dataset.series_outputs:
                path = dataset.series_outputs[series_id]
                if isinstance(data, np.ndarray):
                    np.save(path, data)
                    log('Result saved as numpy array to "{}"'.format(path))
                else:
                    with open(path, 'w') as f_out:
                        f_out.writelines(
                            [" ".join(sent) + "\n" for sent in data])
                    log("Result saved as plain text \"{}\"".format(path))
            else:
                log("There is no output file for dataset: {}"
                    .format(dataset.name), color='red')

    return all_results, result_data
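
For orientation, a call to run_on_dataset might look like the sketch below. The
tf_manager, runners, dataset and postprocess objects are assumed to come from an
already configured experiment; they are placeholders here, not defined in this
example.

# minimal usage sketch, assuming tf_manager, runners, dataset and postprocess
# were built elsewhere (e.g. when loading an experiment configuration)
execution_results, output_data = run_on_dataset(
    tf_manager, runners, dataset, postprocess,
    write_out=True,   # also write each series to the files in dataset.series_outputs
    batch_size=32)

# output_data maps each runner's output series id to its (postprocessed) data
for series_id, outputs in output_data.items():
    print(series_id, len(list(outputs)))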
Example #10
def dataset_from_files(**kwargs):
    """
    Creates a dataset from the provided arguments. Paths to the data are
    provided in a form of dictionary.

    Args:

        kwargs: Arguments are treated as a dictionary. Paths to the data
            series are specified here. Series identifiers should not contain
            underscores. You can specify a language for the serie by adding
            a preprocess method you want to apply on the textual data by
            naming the function as <identifier>_preprocess=function
            OR the preprocessor can be specified globally

    """

    random_seed = kwargs.get('random_seed', None)
    preprocess = kwargs.get('preprocessor', lambda x: x)

    series_paths = _get_series_paths(kwargs)

    if len(series_paths) > 0:
        log("Initializing dataset with: {}".format(", ".join(series_paths)))
        series = {s: Dataset.create_series(series_paths[s], preprocess)
                  for s in series_paths}
        name = kwargs.get('name', _get_name_from_paths(series_paths))

    series_outputs = {SERIES_OUTPUT.match(key)[1]: value
                      for key, value in kwargs.items()
                      if SERIES_OUTPUT.match(key)}

    dataset = Dataset(name, series, series_outputs, random_seed)
    log("Dataset length: {}".format(len(dataset)))
    return dataset
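
A call following the keyword convention described in the docstring could look
roughly like the sketch below. The s_<identifier> / s_<identifier>_out key
format is an assumption about what _get_series_paths and SERIES_OUTPUT match in
this module, and the file paths and preprocessor are invented for illustration.

# hypothetical call; adjust the key format to whatever _get_series_paths and
# SERIES_OUTPUT actually accept in your version
dataset = dataset_from_files(
    name="train",
    s_source="data/train.src",        # path to the "source" series
    s_target="data/train.tgt",        # path to the "target" series
    s_target_out="out/train.hyp",     # output file for the generated series
    preprocessor=lambda line: line.strip().split(),  # assumed per-line preprocessor
    random_seed=123)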
Example #11
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Populate the feed dictionary for the decoder object.

        Arguments:
            dataset: The dataset to use for the decoder.
            train: Boolean flag, telling whether this is a training run.
        """
        fd = ModelPart.feed_dict(self, dataset, train)

        sentences = dataset.maybe_get_series(self.data_id)

        if sentences is None and train:
            raise ValueError("When training, you must feed "
                             "reference sentences")

        go_symbol_idx = self.vocabulary.get_word_index(START_TOKEN)
        fd[self.go_symbols] = np.full([len(dataset)],
                                      go_symbol_idx,
                                      dtype=np.int32)

        if sentences is not None:
            sentences_list = list(sentences)
            # train_mode=False, since we don't want to <unk>ize target words!
            inputs, _ = self.vocabulary.sentences_to_tensor(
                sentences_list,
                self.max_output_len,
                train_mode=False,
                add_start_symbol=False,
                add_end_symbol=True,
                pad_to_max_len=False)

            fd[self.train_inputs] = inputs

        return fd
Example #12
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = {}  # type: FeedDict

        sentences = cast(Iterable[List[str]],
                         dataset.get_series(self.data_id, allow_none=True))

        fd[self.train_mode] = train

        if sentences is not None:
            vectors, paddings = self.vocabulary.sentences_to_tensor(
                list(sentences), train_mode=train)

            # sentences_to_tensor returns time-major tensors, targets need to
            # be batch-major
            vectors = vectors.T
            paddings = paddings.T

            # Need to convert the data to a sparse representation
            bool_mask = (paddings > 0.5)
            indices = np.stack(np.where(bool_mask), axis=1)
            values = vectors[bool_mask]

            fd[self.train_targets] = tf.SparseTensorValue(
                indices=indices, values=values, dense_shape=vectors.shape)

        return fd
Example #13
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = {}  # type: FeedDict
        fd[self.train_mode] = train

        # for checking the lengths of individual factors
        arr_strings = []
        last_paddings = None

        for name, vocabulary in zip(self.data_ids, self.vocabularies):
            factors = dataset.get_series(name)
            vectors, paddings = vocabulary.sentences_to_tensor(
                list(factors), self.max_input_len, pad_to_max_len=False,
                train_mode=train)

            # pylint: disable=unsubscriptable-object
            fd[self.input_factors[name]] = list(zip(*vectors))
            # pylint: enable=unsubscriptable-object

            arr_strings.append(paddings.tostring())
            last_paddings = paddings

        if len(set(arr_strings)) > 1:
            raise ValueError("The lenghts of factors do not match")

        fd[self.input_mask] = list(zip(*last_paddings))

        return fd
Example #14
    def execute(self,
                dataset: Dataset,
                execution_scripts,
                train=False,
                compute_losses=True,
                summaries=True,
                batch_size=None,
                log_progress: int = 0) -> List[ExecutionResult]:
        if batch_size is None:
            batch_size = len(dataset)
        batched_dataset = dataset.batch_dataset(batch_size)
        last_log_time = time.process_time()

        batch_results = [
            [] for _ in execution_scripts]  # type: List[List[ExecutionResult]]
        for batch_id, batch in enumerate(batched_dataset):
            if 0 < log_progress < time.process_time() - last_log_time:
                log("Processed {} examples.".format(batch_id * batch_size))
                last_log_time = time.process_time()
            executables = [s.get_executable(compute_losses=compute_losses,
                                            summaries=summaries,
                                            num_sessions=len(self.sessions))
                           for s in execution_scripts]

            while not all(ex.result is not None for ex in executables):
                self._run_executables(batch, executables, train)

            for script_list, executable in zip(batch_results, executables):
                script_list.append(executable.result)

        collected_results = []  # type: List[ExecutionResult]
        for result_list in batch_results:
            collected_results.append(reduce_execution_results(result_list))

        return collected_results
Example #15
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Populate the feed dictionary with the encoder inputs.

        Encoder input placeholders:
            ``encoder_input``: Stores indices to the vocabulary,
                shape (batch, time)
            ``encoder_padding``: Stores the padding (ones and zeros,
                indicating valid words and positions after the end
                of sentence), shape (batch, time)
            ``train_mode``: Boolean scalar specifying the mode (train
                vs runtime)

        Arguments:
            dataset: The dataset to use
            train: Boolean flag telling whether it is training time
        """
        # pylint: disable=invalid-name
        fd = {}  # type: FeedDict
        fd[self.train_mode] = train
        sentences = dataset.get_series(self.data_id)

        vectors, paddings = self.vocabulary.sentences_to_tensor(
            list(sentences),
            self.max_input_len,
            pad_to_max_len=False,
            train_mode=train)

        # as sentences_to_tensor returns lists of shape (time, batch),
        # we need to transpose
        fd[self.inputs] = list(zip(*vectors))
        fd[self.input_mask] = list(zip(*paddings))

        return fd
Example #16
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Populate the feed dictionary for the decoder object.

        Arguments:
            dataset: The dataset to use for the decoder.
            train: Boolean flag, telling whether this is a training run.
        """
        sentences = cast(Iterable[List[str]],
                         dataset.get_series(self.data_id, allow_none=True))

        if sentences is None and train:
            raise ValueError("When training, you must feed "
                             "reference sentences")

        sentences_list = list(sentences) if sentences is not None else None

        fd = {}  # type: FeedDict
        fd[self.train_mode] = train

        go_symbol_idx = self.vocabulary.get_word_index(START_TOKEN)
        fd[self.go_symbols] = np.full([len(dataset)], go_symbol_idx,
                                      dtype=np.int32)

        if sentences is not None:
            # train_mode=False, since we don't want to <unk>ize target words!
            inputs, weights = self.vocabulary.sentences_to_tensor(
                sentences_list, self.max_output_len, train_mode=False,
                add_start_symbol=False, add_end_symbol=True,
                pad_to_max_len=False)

            fd[self.train_inputs] = inputs
            fd[self.train_mask] = weights

        return fd
Example #17
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        sentences = cast(Iterable[List[str]],
                         dataset.maybe_get_series(self.data_id))

        if sentences is None and train:
            raise ValueError("When training, you must feed "
                             "reference sentences")

        if sentences is not None:
            vectors, paddings = self.vocabulary.sentences_to_tensor(
                list(sentences), train_mode=train, max_len=self.max_length)

            # sentences_to_tensor returns time-major tensors, targets need to
            # be batch-major
            vectors = vectors.T
            paddings = paddings.T

            bool_mask = (paddings > 0.5)
            flat_labels = vectors[bool_mask]
            label_lengths = bool_mask.sum(axis=1)

            fd[self.label_lengths] = label_lengths
            fd[self.flat_labels] = flat_labels

        return fd
Example #18
def check_dataset_and_coders(dataset: Dataset,
                             runners: Iterable[BaseRunner]) -> None:
    # pylint: disable=protected-access

    data_list = []
    for runner in runners:
        for c in runner.all_coders:
            if hasattr(c, "data_id"):
                data_list.append((c.data_id, c))
            elif hasattr(c, "data_ids"):
                data_list.extend([(d, c) for d in c.data_ids])
            else:
                log(("Coder: {} does not have " "a data attribute.").format(c))

    debug("Found series: {}".format(str(data_list)), "checking")
    missing = []

    for (serie, coder) in data_list:
        if not dataset.has_series(serie):
            log("dataset {} does not have serie {}".format(
                dataset.name, serie))
            missing.append((coder, serie))

    if missing:
        formatted = [
            "{} ({}, {}.{})".format(serie, cod.name, cod.__class__.__module__,
                                    cod.__class__.__name__)
            for cod, serie in missing
        ]

        raise CheckingException("Dataset '{}' is missing series: {}".format(
            dataset.name, ", ".join(formatted)))
Example #19
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Feed the placholders with the data.

        Arguments:
            dataset: The dataset.
            train: A flag whether the train mode is enabled.

        Returns:
            The constructed feed dictionary that contains the factor data and
            the mask.
        """
        fd = {}  # type: FeedDict

        # for checking the lengths of individual factors
        arr_strings = []
        last_paddings = None

        for factor_plc, name, vocabulary in zip(
                self.input_factors, self.data_ids, self.vocabularies):
            factors = dataset.get_series(name)
            vectors, paddings = vocabulary.sentences_to_tensor(
                list(factors), self.max_length, pad_to_max_len=False,
                train_mode=train)

            fd[factor_plc] = list(zip(*vectors))

            arr_strings.append(paddings.tostring())
            last_paddings = paddings

        if len(set(arr_strings)) > 1:
            raise ValueError("The lenghts of factors do not match")

        fd[self.mask] = list(zip(*last_paddings))

        return fd
Example #20
def run(data):  # pragma: no cover
    exp = APP.config["experiment"]
    dataset = Dataset("request", data, {})

    _, response_data = exp.run_model(dataset, write_out=False)

    return response_data
Example #21
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Populate the feed dictionary with the encoder inputs.

        Arguments:
            dataset: The dataset to use
            train: Boolean flag telling whether it is training time
        """
        # pylint: disable=invalid-name
        fd = {}  # type: FeedDict
        fd[self.train_mode] = train

        series = list(dataset.get_series(self.data_id))
        lengths = []
        inputs = []

        max_len = max(x.shape[0] for x in series)
        if self.max_input_len is not None:
            max_len = min(self.max_input_len, max_len)

        for x in series:
            length = min(max_len, x.shape[0])
            x_padded = np.zeros(shape=(max_len, ) + x.shape[1:], dtype=x.dtype)
            x_padded[:length] = x[:length]

            lengths.append(length)
            inputs.append(x_padded)

        fd[self.inputs] = inputs
        fd[self._input_lengths] = lengths

        return fd
Example #22
def post_request():
    start_time = datetime.datetime.now()
    request_data = request.get_json()

    if request_data is None:
        response_data = {"error": "No data were provided."}
        code = 400
    else:
        args = APP.config['args']

        try:
            dataset = Dataset("request", request_data, {})
            # TODO check the dataset
            # check_dataset_and_coders(dataset, args.encoders)

            _, response_data = run_on_dataset(
                args.tf_manager, args.runners,
                dataset, args.postprocess, write_out=False)
            code = 200
        # pylint: disable=broad-except
        except Exception as exc:
            response_data = {'error': str(exc)}
            code = 400

    response_data['duration'] = (
        datetime.datetime.now() - start_time).total_seconds()
    json_response = json.dumps(response_data)
    response = flask.Response(json_response,
                              content_type='application/json; charset=utf-8')
    response.headers.add('content-length', len(json_response.encode('utf-8')))
    response.status_code = code
    return response
Example #23
    def execute(self,
                dataset: Dataset,
                execution_scripts,
                train=False,
                compute_losses=True,
                summaries=True,
                batch_size=None) -> List[ExecutionResult]:
        if batch_size is None:
            batch_size = len(dataset)
        batched_dataset = dataset.batch_dataset(batch_size)

        batch_results = [
            [] for _ in execution_scripts]  # type: List[List[ExecutionResult]]
        for batch in batched_dataset:
            executables = [
                s.get_executable(compute_losses=compute_losses,
                                 summaries=summaries)
                for s in execution_scripts
            ]
            while not all(ex.result is not None for ex in executables):
                all_feedables = set()  # type: Set[Any]
                all_tensors_to_execute = {}  # type: Dict[Executable, tf.Tensor]
                additional_feed_dicts = []
                tensor_list_lengths = []  # type: List[int]

                for executable in executables:
                    if executable.result is None:
                        (feedables, tensors_to_execute,
                         add_feed_dict) = executable.next_to_execute()
                        all_feedables = all_feedables.union(feedables)
                        all_tensors_to_execute[executable] = tensors_to_execute
                        additional_feed_dicts.append(add_feed_dict)
                        tensor_list_lengths.append(len(tensors_to_execute))
                    else:
                        tensor_list_lengths.append(0)

                feed_dict = _feed_dicts(batch, all_feedables, train=train)
                for fdict in additional_feed_dicts:
                    feed_dict.update(fdict)

                session_results = [
                    sess.run(all_tensors_to_execute, feed_dict=feed_dict)
                    for sess in self.sessions
                ]

                for executable in executables:
                    if executable.result is None:
                        executable.collect_results(
                            [res[executable] for res in session_results])

            for script_list, executable in zip(batch_results, executables):
                script_list.append(executable.result)

        collected_results = []  # type: List[ExecutionResult]
        for result_list in batch_results:
            collected_results.append(reduce_execution_results(result_list))

        return collected_results
Example #24
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        # if the data come from a pickled file, they are a list rather than
        # a numpy array, so convert them as a precaution
        images = np.array(list(dataset.get_series(self.data_id)))
        fd[self.image_input] = images / 255.0
        return fd
Example #25
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        # if the data come from a pickled file, they are a list rather than
        # a numpy array, so convert them as a precaution
        images = np.array(list(dataset.get_series(self.data_id)))
        fd[self.image_input] = images / 255.0
        return fd
Example #26
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        images = np.array(dataset.get_series(self.data_id))
        assert images.shape[1:] == (self.height, self.width, 3)
        fd[self.input_image] = images

        return fd
Example #27
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        sentences = dataset.maybe_get_series(self.data_id)
        if sentences is not None:
            fd[self.target_tokens] = pad_batch(list(sentences))

        return fd
Example #28
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        sentences = dataset.maybe_get_series(self.data_id)
        sentences_list = list(sentences) if sentences is not None else None
        if sentences_list is not None:
            fd[self.train_inputs] = list(zip(*sentences_list))[0]

        return fd
Example #29
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        sentences = dataset.maybe_get_series(self.data_id)
        sentences_list = list(sentences) if sentences is not None else None
        if sentences_list is not None:
            fd[self.train_inputs] = list(zip(*sentences_list))[0]

        return fd
Example #30
def run(data):  # pragma: no cover
    exp = APP.config["experiment"]
    dataset = Dataset("request",
                      data,
                      BatchingScheme(batch_size=1), {},
                      preprocessors=APP.config["preprocess"])

    _, response_data, _ = exp.run_model(dataset, write_out=False)

    return response_data
Example #31
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        # if the data come from a pickled file, they are a list rather than
        # a numpy array, so convert them as a precaution
        images = np.array(dataset.get_series(self.data_id))

        f_dict = {}
        f_dict[self.image_input] = images / 255.0

        f_dict[self.train_mode] = train
        return f_dict
Example #32
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        sentences = dataset.maybe_get_series(self.data_id)
        if sentences is not None:
            fd[self.target_tokens] = pad_batch(
                list(sentences), self.max_output_len, self.add_start_symbol,
                self.add_end_symbol)

        return fd
Example #33
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)
        sentences = dataset.maybe_get_series(self.data_id)

        if sentences is not None:
            label_tensors, _ = self.vocabulary.sentences_to_tensor(
                list(sentences), self.max_output_len)
            fd[self.gt_inputs[0]] = label_tensors[0]

        return fd
Example #34
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        sentences = dataset.maybe_get_series(self.data_id)
        if sentences is not None:
            vectors, _ = self.vocabulary.sentences_to_tensor(
                list(sentences), pad_to_max_len=False, train_mode=train)

            fd[self.train_targets] = vectors.T
        return fd
Example #35
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)
        sentences = dataset.maybe_get_series(self.data_id)

        if sentences is not None:
            labels = [l[0] for l in pad_batch(list(sentences),
                                              self.max_output_len)]
            fd[self.targets] = labels

        return fd
Example #36
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Populate the feed dictionary with the encoder inputs.

        Arguments:
            dataset: The dataset to use
            train: Boolean flag telling whether it is training time
        """
        fd = ModelPart.feed_dict(self, dataset, train)
        sentences = dataset.get_series(self.data_id)
        fd[self.input_tokens] = pad_batch(list(sentences), self.max_input_len)
        return fd
Example #37
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        sentences = dataset.maybe_get_series(self.data_id)

        if sentences is None and train:
            raise ValueError("You must feed reference sentences when training")

        if sentences is not None:
            fd[self.target_tokens] = pad_batch(list(sentences),
                                               self.max_length)

        return fd
Example #38
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Populate the feed dictionary for the decoder object.

        Arguments:
            dataset: The dataset to use for the decoder.
            train: Boolean flag, telling whether this is a training run.
        """
        fd = ModelPart.feed_dict(self, dataset, train)

        sentences = dataset.maybe_get_series(self.data_id)

        if sentences is None and train:
            raise ValueError("When training, you must feed "
                             "reference sentences")

        if sentences is not None:
            fd[self.train_tokens] = pad_batch(
                list(sentences), self.max_output_len, add_start_symbol=False,
                add_end_symbol=True)

        return fd
Example #39
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        """Feed the placholders with the data.

        Arguments:
            dataset: The dataset.
            train: A flag whether the train mode is enabled.

        Returns:
            The constructed feed dictionary that contains the factor data and
            the mask.
        """
        fd = ModelPart.feed_dict(self, dataset, train)

        # pad and feed the data for each input factor
        for factor_plc, name in zip(self.input_factors, self.data_ids):
            sentences = dataset.get_series(name)
            fd[factor_plc] = pad_batch(
                list(sentences), self.max_length, self.add_start_symbol,
                self.add_end_symbol)

        return fd
Example #40
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        series = list(dataset.get_series(self.data_id))
        lengths = []
        inputs = []

        max_len = max(x.shape[0] for x in series)
        if self.max_input_len is not None:
            max_len = min(self.max_input_len, max_len)

        for x in series:
            length = min(max_len, x.shape[0])
            x_padded = np.zeros(shape=(max_len,) + x.shape[1:],
                                dtype=x.dtype)
            x_padded[:length] = x[:length]

            lengths.append(length)
            inputs.append(x_padded)

        fd[self.temporal_states] = inputs
        fd[self._input_lengths] = lengths

        return fd
Example #41
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)
        fd[self.spatial_input] = list(dataset.get_series(self.data_id))
        return fd
Example #42
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)
        fd[self.vector] = dataset.get_series(self.data_id)
        return fd