Example #1
    def __getitem__(self, index: int) -> Mapping[str, Any]:
        """Fetch a data instance at a specified index, and apply transformations to it.

        Args:
            index: Which datapoint to retrieve.

        Returns:
            The data dictionary from the specified index, with transformations applied.
        """
        items = deepcopy(
            self.dataset[index]
        )  # Deepcopy to prevent ops from overwriting values in datasets
        if isinstance(self.dataset, BatchDataset):
            # BatchDataset may randomly sample the same elements multiple times, so need to avoid reprocessing
            unique_samples = set()
            for item in items:
                if id(item) not in unique_samples:
                    forward_numpyop(self.ops, item, self.mode)
                    unique_samples.add(id(item))
            if self.dataset.pad_value is not None:
                pad_batch(items, self.dataset.pad_value)
            items = {
                key: np.array([item[key] for item in items])
                for key in items[0]
            }
        else:
            forward_numpyop(self.ops, items, self.mode)
        return items
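A minimal, self-contained sketch (plain Python, not FastEstimator code) of why the id()-based bookkeeping above avoids double-processing: BatchDataset may place several references to the same sample dict into one batch, and deepcopy preserves that aliasing within a single call, so identity checks still find the duplicates afterwards.

samples = [{"x": 1}, {"x": 2}]
batch = [samples[0], samples[1], samples[0]]  # same element sampled twice

seen = set()
for item in batch:
    if id(item) not in seen:  # identity, not equality: aliases share one id()
        item["x"] *= 10  # stand-in for forward_numpyop
        seen.add(id(item))

print(batch)  # [{'x': 10}, {'x': 20}, {'x': 10}] -- the alias was processed once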
Example #2
    def __getitem__(self, index: int) -> Mapping[str, Any]:
        """Fetch a data instance at a specified index, and apply transformations to it.

        Args:
            index: Which datapoint to retrieve.

        Returns:
            The data dictionary from the specified index, with transformations applied.
        """
        items = deepcopy(
            self.dataset[index]
        )  # Deepcopy to prevent ops from overwriting values in datasets
        if isinstance(self.dataset, BatchDataset):
            # BatchDataset may randomly sample the same elements multiple times, so need to avoid reprocessing
            unique_samples = set()
            for item in items:
                if id(item) not in unique_samples:
                    forward_numpyop(self.ops, item, {'mode': self.mode})
                    unique_samples.add(id(item))
            if self.dataset.pad_value is not None:
                pad_batch(items, self.dataset.pad_value)
            items = {
                key: np.array([item[key] for item in items])
                for key in items[0]
            }
        else:
            forward_numpyop(self.ops, items, {'mode': self.mode})
        return items
Example #3
    def transform(
            self,
            data: Dict[str, Any],
            mode: str,
            epoch: int = 1,
            ds_id: str = '',
            target_type: str = 'np') -> Union[Dict[str, Any], FilteredData]:
        """Apply all pipeline operations on a given data instance for the specified `mode` and `epoch`.

        Args:
            data: Input data in dictionary format.
            mode: The execution mode in which to run. This can be "train", "eval", "test" or "infer".
            epoch: The epoch index to run. Note that epoch indices are 1-indexed.
            ds_id: The current dataset id.
            target_type: What kind of tensor(s) to create. One of "tf", "torch", or "np".

        Returns:
            The transformed data.
        """
        data = deepcopy(data)
        instance_ops, batch_spec, batch_ops = self._get_op_split(mode=mode,
                                                                 epoch=epoch,
                                                                 ds_id=ds_id)
        state = {'mode': mode}
        op_data = forward_numpyop(instance_ops, data, state)
        if isinstance(op_data, FilteredData):
            return op_data
        data = batch_spec.collate_fn([data])
        op_data = forward_numpyop(batch_ops, data, state, batched='torch')
        if isinstance(op_data, FilteredData):
            return op_data
        return to_tensor(data, target_type=target_type)
Example #4
    def transform(self, data: Dict[str, Any], mode: str, epoch: int = 1) -> Dict[str, Any]:
        """Apply all pipeline operations on a given data instance for the specified `mode` and `epoch`.

        Args:
            data: Input data in dictionary format.
            mode: The execution mode in which to run. This can be "train", "eval", "test" or "infer".
            epoch: The epoch index to run. Note that epoch indices are 1-indexed.

        Returns:
            The transformed data.
        """
        data = deepcopy(data)
        ops = get_current_items(self.ops, mode, epoch)
        forward_numpyop(ops, data, {'mode': mode})
        for key, value in data.items():
            data[key] = np.expand_dims(value, 0)
        return data
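A hedged usage sketch of transform (assuming a standard FastEstimator install; Minmax is one of the library's built-in univariate NumpyOps):

import numpy as np
import fastestimator as fe
from fastestimator.op.numpyop.univariate import Minmax

# No dataset is required just to exercise transform on a single datapoint.
pipeline = fe.Pipeline(ops=[Minmax(inputs="x", outputs="x")])
result = pipeline.transform(data={"x": np.array([0.0, 5.0, 10.0])}, mode="infer")
print(result["x"])  # values rescaled to [0, 1], with a leading batch dimension added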
Example #5
    def __getitem__(
        self, index: int
    ) -> Union[Mapping[str, Any], List[Mapping[str, Any]], FilteredData]:
        """Fetch a data instance at a specified index, and apply transformations to it.

        Args:
            index: Which datapoint to retrieve.

        Returns:
            The data dictionary from the specified index, with transformations applied OR an indication that this index
            should be thrown out.
        """
        item = self.dataset[index]
        if isinstance(item, list):
            # BatchDataset may randomly sample the same elements multiple times, so need to avoid reprocessing
            unique_samples = {}  # id: idx
            results = []
            for idx, data in enumerate(item):
                data_id = id(data)
                if data_id not in unique_samples:
                    data = _DelayedDeepDict(data)
                    filter_data = forward_numpyop(self.ops, data,
                                                  {'mode': self.mode})
                    if filter_data:
                        results.append(filter_data)
                    else:
                        data.finalize(retain=self.output_keys,
                                      deep_remainder=self.deep_remainder)
                        results.append(data)
                    unique_samples[data_id] = idx
                else:
                    results.append(results[unique_samples[data_id]])
        else:
            results = _DelayedDeepDict(item)
            filter_data = forward_numpyop(self.ops, results,
                                          {'mode': self.mode})
            if filter_data:
                return filter_data
            results.finalize(retain=self.output_keys,
                             deep_remainder=self.deep_remainder)
        return results
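FilteredData acts as a sentinel that propagates "throw this index out" up to the caller. A toy sketch of that contract (plain Python; the Filtered class below is a hypothetical stand-in for FilteredData):

class Filtered:  # hypothetical stand-in for FilteredData
    pass

def run_ops(data):  # stand-in for forward_numpyop: a sentinel means "drop me"
    return Filtered() if data["x"] < 0 else None

for sample in ({"x": -1}, {"x": 3}):
    result = run_ops(sample)
    if result:  # truthiness check, mirroring `if filter_data:` above
        print("dropped", sample)
    else:
        print("kept", sample)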
Example #6
    def __getitem__(self, index: int) -> Mapping[str, Any]:
        """Fetch a data instance at a specified index, and apply transformations to it.

        Args:
            index: Which datapoint to retrieve.

        Returns:
            The data dictionary from the specified index, with transformations applied.
        """
        item = self.dataset[index]
        if isinstance(item, list):
            # BatchDataset may randomly sample the same elements multiple times, so need to avoid reprocessing
            unique_samples = {}  # id: idx
            results = []
            for idx, data in enumerate(item):
                data_id = id(data)
                if data_id not in unique_samples:
                    data = _DelayedDeepDict(data)
                    forward_numpyop(self.ops, data, {'mode': self.mode})
                    data.finalize(retain=self.output_keys,
                                  deep_remainder=self.deep_remainder)
                    results.append(data)
                    unique_samples[data_id] = idx
                else:
                    results.append(results[unique_samples[data_id]])
            if hasattr(self.dataset,
                       "pad_value") and self.dataset.pad_value is not None:
                pad_batch(results, self.dataset.pad_value)
            results = {
                key: np.array([result[key] for result in results])
                for key in results[0]
            }
        else:
            results = _DelayedDeepDict(item)
            forward_numpyop(self.ops, results, {'mode': self.mode})
            results.finalize(retain=self.output_keys,
                             deep_remainder=self.deep_remainder)
        return results
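The closing dict comprehension in the batch branch turns a list of per-sample dicts into one dict of stacked arrays. A standalone sketch:

import numpy as np

results = [{"x": np.zeros(3), "y": 0}, {"x": np.ones(3), "y": 1}]
batch = {key: np.array([result[key] for result in results]) for key in results[0]}
print(batch["x"].shape, batch["y"])  # (2, 3) [0 1]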
Example #7
    def forward_batch(
        self, data: Union[np.ndarray, List[np.ndarray]], state: Dict[str, Any]
    ) -> Union[FilteredData, np.ndarray, List[np.ndarray]]:
        data = {key: elem for key, elem in zip(self.inputs, data)}
        if isinstance(self.repeat, int):
            for _ in range(self.repeat):
                filtered = forward_numpyop(self.ops, data, state, batched='np')
                if filtered:
                    return filtered
        else:
            filtered = forward_numpyop(self.ops, data, state, batched='np')
            if filtered:
                return filtered
            i = 0
            while self.repeat(*[data[var_name] for var_name in self.repeat_inputs]):
                if self.max_iter and i >= self.max_iter:
                    break
                filtered = forward_numpyop(self.ops, data, state, batched='np')
                if filtered:
                    return filtered
                i += 1
        return [data[key] for key in self.outputs]
Example #8
    def forward(self, data: List[np.ndarray],
                state: Dict[str, Any]) -> List[np.ndarray]:
        data = {key: elem for key, elem in zip(self.inputs, data)}
        if isinstance(self.repeat, int):
            for _ in range(self.repeat):
                forward_numpyop(self.ops, data, state)
        else:
            forward_numpyop(self.ops, data, state)
            while self.repeat(*[data[var_name] for var_name in self.repeat_inputs]):
                forward_numpyop(self.ops, data, state)
        return [data[key] for key in self.outputs]
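A plain-Python sketch of the control flow above: the wrapped ops always run once, then keep re-running while the user-supplied predicate (self.repeat) stays true:

data = {"x": 1}

def run_ops(d):  # stand-in for forward_numpyop(self.ops, data, state)
    d["x"] *= 2

run_ops(data)            # always executed at least once
while data["x"] < 100:   # stand-in for self.repeat(*repeat_inputs)
    run_ops(data)
print(data["x"])  # 128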
Example #9
def _batch_postprocess(data: Dict[str, Any], ops: List[NumpyOp], output_keys: Set[str], mode: str) -> \
        Union[Dict[str, Any], FilteredData]:
    op_data = forward_numpyop(ops=ops,
                              data=data,
                              state={'mode': mode},
                              batched='torch')
    if isinstance(op_data, FilteredData):
        return op_data
    if output_keys:
        for key in data.keys() - output_keys:
            if key not in _DelayedDeepDict.warned:
                _DelayedDeepDict.warned.add(key)
                print(
                    "FastEstimator-Warn: the key '{}' is being pruned since it is unused outside of the Pipeline."
                    " To prevent this, you can declare the key as an input of a Trace or TensorOp."
                    .format(key))
            data.pop(key)
    return data
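The pruning loop relies on dict.keys() supporting set operations: data.keys() - output_keys builds a new set before iteration begins, so popping inside the loop is safe. A standalone sketch:

data = {"x": 1, "y": 2, "z": 3}
output_keys = {"x", "y"}
for key in data.keys() - output_keys:  # set difference, evaluated up front
    data.pop(key)
print(data)  # {'x': 1, 'y': 2}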
Example #10
    def forward(self, data: List[np.ndarray],
                state: Dict[str, Any]) -> List[np.ndarray]:
        data = {key: elem for key, elem in zip(self.inputs, data)}
        forward_numpyop(self.ops, data, state)
        return [data[key] for key in self.outputs]
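The wrapper converts positional op data into a keyed dict for the inner ops and back again on the way out. A minimal sketch of that round trip (the names below are illustrative):

inputs, outputs = ["a", "b"], ["c"]

def run_ops(d):  # stand-in for forward_numpyop: writes its result into the dict
    d["c"] = d["a"] + d["b"]

args = [1, 2]                   # positional data, as received by forward()
data = dict(zip(inputs, args))  # keyed view for the wrapped ops
run_ops(data)
print([data[key] for key in outputs])  # [3]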
Example #11
    def benchmark(self,
                  mode: str = "train",
                  epoch: int = 1,
                  ds_id: Optional[str] = None,
                  num_steps: int = 1000,
                  log_interval: int = 100,
                  detailed: bool = True) -> None:
        """Benchmark the pipeline processing speed.

        Args:
            mode: The execution mode to benchmark. This can be 'train', 'eval' or 'test'.
            epoch: The epoch index to benchmark. Note that epoch indices are 1-indexed.
            ds_id: The ds_id to benchmark. If None, all ds_ids will be benchmarked.
            num_steps: The number of steps over which to perform the benchmark.
            log_interval: The logging interval.
            detailed: Whether to display the detailed time used by each operator.
        """
        if ds_id is None:
            ds_ids = self.get_ds_ids(epoch=epoch, mode=mode)
        else:
            ds_ids = [ds_id]

        for ds_id in ds_ids:
            with self(mode=mode,
                      epoch=epoch,
                      ds_id=ds_id,
                      steps_per_epoch=num_steps) as loader:
                if isinstance(loader, tf.data.Dataset):
                    loader = loader.take(num_steps)
                start = time.perf_counter()
                for idx, _ in enumerate(loader, start=1):
                    if idx % log_interval == 0:
                        duration = time.perf_counter() - start
                        iters_per_sec = log_interval / duration
                        ds_str = f"Dataset: {ds_id}, " if ds_id else ""
                        print(
                            "FastEstimator-Benchmark ({}): {}Step: {}, Epoch: {}, Steps/sec: {}"
                            .format(mode.capitalize(), ds_str, idx, epoch,
                                    iters_per_sec))
                        start = time.perf_counter()
                # Pipeline Operations Benchmarking when using FEDataset
                if isinstance(loader, FEDataLoader) and isinstance(
                        loader.dataset, OpDataset) and detailed:
                    # (n_visited, duration)
                    duration_list = np.zeros(shape=(len(self.ctx_ops) + 1 +
                                                    len(self.ctx_batch_ops),
                                                    2))
                    data_len = len(loader.dataset)
                    ds_str = f", Dataset: {ds_id}" if ds_id else ""
                    print(
                        "\nBreakdown of time taken by Pipeline Operations (Mode: {}, Epoch: {}{})\n"
                        .format(mode.capitalize(), epoch, ds_str))
                    extra_memory_management_time = 0
                    for _ in range(log_interval):
                        filtered = False
                        batch = []
                        index = np.random.randint(data_len)
                        items = deepcopy(loader.dataset.dataset[index])
                        if isinstance(items, list):
                            while not batch:
                                filtered = False
                                # BatchDataset may randomly sample the same elements multiple times, avoid reprocessing
                                unique_samples = set()
                                for item in items:
                                    if id(item) not in unique_samples:
                                        for i, op in enumerate(self.ctx_ops):
                                            start = time.perf_counter()
                                            op_data = forward_numpyop(
                                                [op], item,
                                                {'mode': loader.dataset.mode})
                                            duration = time.perf_counter() - start
                                            duration_list[i][0] += 1
                                            duration_list[i][1] += duration
                                            if isinstance(op_data, FilteredData):
                                                filtered = True
                                                break
                                        unique_samples.add(id(item))
                                if not filtered:
                                    batch = items
                        else:
                            while len(batch) < (self.ctx_batch_size or 1):
                                filtered = False
                                for i, op in enumerate(self.ctx_ops):
                                    start = time.perf_counter()
                                    op_data = forward_numpyop([op], items,
                                                              {'mode': mode})
                                    duration = time.perf_counter() - start
                                    duration_list[i][0] += 1
                                    duration_list[i][1] += duration
                                    if isinstance(op_data, FilteredData):
                                        filtered = True
                                        break
                                if not filtered:
                                    batch.append(items)
                                index = np.random.randint(data_len)
                                items = deepcopy(loader.dataset.dataset[index])
                        if not filtered:
                            # Perform the batching
                            start = time.perf_counter()
                            batch = self.ctx_batch_info.collate_fn(batch)
                            duration = time.perf_counter() - start
                            duration_list[len(self.ctx_ops)][0] += 1
                            duration_list[len(self.ctx_ops)][1] += duration
                            # Perform batch ops
                            start = time.perf_counter()
                            # Transform to numpy to not bias against the first op in the batch_op chain
                            batch = to_tensor(batch, target_type='np')
                            extra_memory_management_time += time.perf_counter() - start

                            for i, op in enumerate(self.ctx_batch_ops,
                                                   start=len(self.ctx_ops) + 1):
                                start = time.perf_counter()
                                op_data = forward_numpyop([op],
                                                          data=batch,
                                                          state={'mode': mode},
                                                          batched='np')
                                duration = time.perf_counter() - start
                                duration_list[i][0] += 1
                                duration_list[i][1] += duration
                                if isinstance(op_data, FilteredData):
                                    break
                            # Count extra time needed to cast data back to torch
                            start = time.perf_counter()
                            to_tensor(batch,
                                      target_type='torch',
                                      shared_memory=True)
                            extra_memory_management_time += time.perf_counter() - start

                    if self.ctx_batch_ops:
                        # Extra memory management penalty is only incurred when using batch ops
                        duration_list[len(self.ctx_ops)][1] += extra_memory_management_time

                    total_time = np.sum(duration_list[:, 1])
                    normalized_times_ms = 1000 * duration_list[:, 1] / np.maximum(
                        duration_list[:, 0], 1)
                    op_names = ["Op"]

                    for op in (self.ctx_ops + [self.ctx_batch_info] +
                               self.ctx_batch_ops):
                        if isinstance(op, (Sometimes, Repeat)) and op.op:
                            op_names.append(op.__class__.__name__ + " (" +
                                            op.op.__class__.__name__ + ")")
                        elif isinstance(op, (OneOf, Fuse)) and op.ops:
                            op_names.append(op.__class__.__name__ + " (" +
                                            ", ".join([
                                                sub_op.__class__.__name__
                                                for sub_op in op.ops
                                            ]) + ")")
                        elif isinstance(op, Batch):
                            op_names.append("<Collating Batch>")
                        else:
                            op_names.append(op.__class__.__name__)

                    max_op_len = max(len(op_name) for op_name in op_names)
                    max_in_len = max([
                        len(", ".join(op.inputs)) for op in self.ctx_ops +
                        [self.ctx_batch_info] + self.ctx_batch_ops
                    ] + [len("Inputs")])
                    max_out_len = max([
                        len(", ".join(op.outputs)) for op in self.ctx_ops +
                        [self.ctx_batch_info] + self.ctx_batch_ops
                    ] + [len("Outputs")])
                    ms_visit_len = max(
                        len("{:.3f}".format(max(normalized_times_ms))),
                        len("ms / Visit"))
                    visit_len = max(len(f"{int(np.max(duration_list[:, 0]))}"),
                                    len("Visits"))

                    print("{}: {}: {}: {}: {}: {}".format(
                        "Op".ljust(max_op_len + 1),
                        "Inputs".ljust(max_in_len + 1),
                        "Outputs".ljust(max_out_len + 1),
                        "ms / Visit".ljust(ms_visit_len + 1),
                        "Visits".ljust(visit_len + 1),
                        "Time (Total)".rjust(12)))
                    print("-" * (max_op_len + max_in_len + max_out_len +
                                 visit_len + 37))
                    for i, op in enumerate(self.ctx_ops +
                                           [self.ctx_batch_info] +
                                           self.ctx_batch_ops):
                        print("{}: {}: {}: {}: {}: {:11.2f}%".format(
                            op_names[i + 1].ljust(max_op_len + 1),
                            ", ".join(op.inputs).ljust(max_in_len + 1),
                            ", ".join(op.outputs).ljust(max_out_len + 1),
                            "{:.3f}".format(
                                normalized_times_ms[i]).ljust(ms_visit_len +
                                                              1),
                            str(int(duration_list[i][0])).ljust(visit_len + 1),
                            100 * duration_list[i][1] / total_time))
                    if self.ctx_batch_ops:
                        penalty = round(
                            100 * (duration_list[len(self.ctx_ops)][1] -
                                   extra_memory_management_time) /
                            duration_list[len(self.ctx_ops)][1], 1)
                        print(
                            f"\nNote that collation time would be cut by ~{penalty}% if there were no batched ops."
                        )
                print("\n")  # to make printing more obvious
Example #12
    def forward_batch(
            self, data: Union[np.ndarray, List[np.ndarray]],
            state: Dict[str, Any]) -> Union[np.ndarray, List[np.ndarray]]:
        data = {key: elem for key, elem in zip(self.inputs, data)}
        filtered = forward_numpyop(self.ops, data, state, batched="np")
        return filtered if filtered else [data[key] for key in self.outputs]
Example #13
    def benchmark(self,
                  mode: str = "train",
                  epoch: int = 1,
                  num_steps: int = 1000,
                  log_interval: int = 100,
                  detailed: bool = True) -> None:
        """Benchmark the pipeline processing speed.

        Args:
            mode: The execution mode to benchmark. This can be 'train', 'eval' or 'test'.
            epoch: The epoch index to benchmark. Note that epoch indices are 1-indexed.
            num_steps: The maximum number of steps over which to perform the benchmark.
            log_interval: The logging interval.
            detailed: Whether to display the detailed time used by each operator.
        """
        loader = self.get_loader(mode=mode, epoch=epoch)
        if isinstance(loader, tf.data.Dataset):
            loader = loader.take(num_steps)
        start = time.perf_counter()
        for idx, _ in enumerate(loader, start=1):
            if idx % log_interval == 0:
                duration = time.perf_counter() - start
                iters_per_sec = log_interval / duration
                print(
                    "FastEstimator: Step: {}, Epoch: {}, Steps/sec: {}".format(
                        idx, epoch, iters_per_sec))
                start = time.perf_counter()
            if idx == num_steps:
                break
        # Pipeline Operations Benchmarking when using FEDataset
        if isinstance(loader, DataLoader) and isinstance(
                loader.dataset, OpDataset) and detailed:
            op_list = loader.dataset.ops
            duration_list = np.zeros(shape=(len(op_list)))

            data_len = len(loader.dataset.dataset)
            if self.batch_size:
                batch_size = self.batch_size
                if isinstance(batch_size, Scheduler):
                    batch_size = batch_size.get_current_value(epoch)
                if isinstance(batch_size, dict):
                    batch_size = batch_size[mode]
                log_interval = log_interval * batch_size

            print(
                "\nBreakdown of time taken by Pipeline Operations ({} epoch {})"
                .format(mode, epoch))
            for _ in range(log_interval):
                index = np.random.randint(data_len)
                items = deepcopy(loader.dataset.dataset[index])
                if isinstance(loader.dataset.dataset, BatchDataset):
                    # BatchDataset may randomly sample the same elements multiple times, so need to avoid reprocessing
                    unique_samples = set()
                    for item in items:
                        if id(item) not in unique_samples:
                            for i, op in enumerate(op_list):
                                start = time.perf_counter()
                                forward_numpyop([op], item,
                                                {'mode': loader.dataset.mode})
                                duration = time.perf_counter() - start
                                duration_list[i] += duration
                            unique_samples.add(id(item))
                else:
                    for i, op in enumerate(op_list):
                        start = time.perf_counter()
                        forward_numpyop([op], items,
                                        {'mode': loader.dataset.mode})
                        duration = time.perf_counter() - start
                        duration_list[i] += duration

            total_time = np.sum(duration_list)
            op_names = ["Op"]

            for op in op_list:
                if isinstance(op, Sometimes) and op.op:
                    op_names.append(op.__class__.__name__ + " (" +
                                    op.op.__class__.__name__ + ")")
                elif isinstance(op, OneOf) and op.ops:
                    op_names.append(op.__class__.__name__ + " (" + ", ".join(
                        [sub_op.__class__.__name__
                         for sub_op in op.ops]) + ")")
                else:
                    op_names.append(op.__class__.__name__)

            max_op_len = max(len(op_name) for op_name in op_names)
            max_in_len = max([len(", ".join(op.inputs))
                              for op in op_list] + [len("Inputs")])
            max_out_len = max([len(", ".join(op.outputs))
                               for op in op_list] + [len("Outputs")])
            print("{}: {}: {}: {}".format("Op".ljust(max_op_len + 1),
                                          "Inputs".ljust(max_in_len + 1),
                                          "Outputs".ljust(max_out_len + 1),
                                          "Time".rjust(5)))
            print("-" * (max_op_len + max_in_len + max_out_len + 15))
            for i, op in enumerate(op_list):
                print("{}: {}: {}: {:5.2f}%".format(
                    op_names[i + 1].ljust(max_op_len + 1),
                    ", ".join(op.inputs).ljust(max_in_len + 1),
                    ", ".join(op.outputs).ljust(max_out_len + 1),
                    100 * duration_list[i] / total_time))
Example #14
    def benchmark(self,
                  mode: str = "train",
                  epoch: int = 1,
                  num_steps: int = 1000,
                  log_interval: int = 100) -> None:
        """Benchmark the pipeline processing speed.

        Args:
            mode: The execution mode to benchmark. This can be 'train', 'eval' or 'test'.
            epoch: The epoch index to benchmark. Note that epoch indices are 1-indexed.
            num_steps: The maximum number of steps over which to perform the benchmark.
            log_interval: The logging interval.
        """
        loader = self.get_loader(mode=mode, epoch=epoch)
        if isinstance(loader, tf.data.Dataset):
            loader = loader.take(num_steps)
        start = time.perf_counter()
        for idx, _ in enumerate(loader, start=1):
            if idx % log_interval == 0:
                duration = time.perf_counter() - start
                iters_per_sec = log_interval / duration
                print(
                    "FastEstimator: Step: {}, Epoch: {}, Steps/sec: {}".format(
                        idx, epoch, iters_per_sec))
                start = time.perf_counter()
            if idx == num_steps:
                break

        # Pipeline Operations Benchmarking
        op_list = loader.dataset.ops
        duration_list = np.zeros(shape=(len(op_list)))

        data_len = len(loader.dataset.dataset)
        if self.batch_size:
            log_interval = log_interval * self.batch_size

        print("\nBreakdown of time taken by Pipeline Operations:")
        for _ in range(log_interval):
            index = np.random.randint(data_len)
            items = deepcopy(loader.dataset.dataset[index])
            if isinstance(loader.dataset.dataset, BatchDataset):
                # BatchDataset may randomly sample the same elements multiple times, so need to avoid reprocessing
                unique_samples = set()
                for item in items:
                    if id(item) not in unique_samples:
                        for i, op in enumerate(op_list):
                            start = time.perf_counter()
                            forward_numpyop([op], item, loader.dataset.mode)
                            duration = time.perf_counter() - start
                            duration_list[i] += duration
                        unique_samples.add(id(item))
            else:
                for i, op in enumerate(op_list):
                    start = time.perf_counter()
                    forward_numpyop([op], items, loader.dataset.mode)
                    duration = time.perf_counter() - start
                    duration_list[i] += duration

        total_time = np.sum(duration_list)
        for i, op in enumerate(op_list):
            print(" - {}: Time Consumption: {:.2f}%".format(
                op.__class__.__name__, 100 * duration_list[i] / total_time))
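The per-op timing boils down to an accumulate-and-normalize pattern. A self-contained sketch:

import time
import numpy as np

ops = [lambda: sum(range(50_000)), lambda: sorted(range(5_000))]
durations = np.zeros(len(ops))
for _ in range(100):  # analogous to the log_interval sampling loop
    for i, op in enumerate(ops):
        start = time.perf_counter()
        op()
        durations[i] += time.perf_counter() - start

total_time = durations.sum()
for i, duration in enumerate(durations):
    print(" - op{}: Time Consumption: {:.2f}%".format(i, 100 * duration / total_time))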