def check(this: DataPlaceHolders, what: str) -> List:
    """Validate a collection of data placeholders and return it as a list.

    Parameters
    ----------
    this
        Value to validate. It is passed through ``listify`` before any check.

    what
        Label inserted at the start of the error messages.

    Returns
    -------
    list
        The result of ``listify(this)``.

    Raises
    ------
    ValueError
        If ``is_data_placeholder_list`` rejects the listified value, or if
        the listified value contains duplicates.
    """
    placeholders = listify(this)

    if not is_data_placeholder_list(placeholders):
        raise ValueError("{} must be of type DataPlaceholder.".format(what))

    # Duplicates collapse when converted to a set, so the sizes differ.
    n_distinct = len(set(placeholders))
    if n_distinct != len(placeholders):
        raise ValueError("{} must be unique.".format(what))

    return placeholders
def _compute_node(self, node, Xs, cache):
    """Run ``node.compute_func`` on ``Xs`` and record the result in ``cache``.

    Parameters
    ----------
    node
        Object exposing the ``compute_func`` callable to execute.

    Xs
        Input data for the node; passed through ``unlistify`` before the call.

    cache
        Results cache, forwarded to ``self._update_cache`` together with the
        listified output and the node.
    """
    # TODO: Raise warning if computed output is already in cache.
    # This happens when recomputing a step that had a subset of its outputs
    # already passed in the inputs.
    # TODO: Some regressors have extra options in their predict method, and
    # they return a tuple of arrays.
    # https://scikit-learn.org/stable/glossary.html#term-predict
    raw_output = node.compute_func(unlistify(Xs))
    self._update_cache(cache, listify(raw_output), node)
def predict(
    self,
    X: Union[ArrayLikes, DataDict],
    output_names: Optional[Union[str, List[str]]] = None,
) -> ArrayLikes:
    """Predict by applying the model on the given input data.

    Parameters
    ----------
    X
        Input data. It follows the same format as in the ``fit`` method.

    output_names
        Names of required outputs (optional). Any final or intermediate
        output can be requested by passing the name of its associated data
        placeholder, which is useful for debugging. If not specified, the
        outputs specified at instantiation are returned.

    Returns
    -------
    array-like or list of array-like
        The computed outputs. A single output is returned bare rather than
        wrapped in a one-element list.

    Raises
    ------
    ValueError
        If ``output_names`` contains duplicates.
    """
    # Holds the inputs plus every intermediate result computed below.
    results_cache = {}  # type: Dict[DataPlaceholder, ArrayLike]

    X_norm = self._normalize_data(X, self._internal_inputs)

    # Resolve which data placeholders have to be produced.
    if output_names is not None:
        names = listify(output_names)
        if len(set(names)) != len(names):
            raise ValueError("output_names must be unique.")
        outputs = [self.get_data_placeholder(name) for name in names]
    else:
        outputs = self._internal_outputs

    # Unused inputs are allowed so that different outputs can be debugged
    # without having to change the inputs accordingly.
    nodes = self._get_required_nodes(
        X_norm, [], outputs, allow_unused_inputs=True, follow_targets=False
    )

    # Seed the cache with the inputs, then compute each required node in
    # the order returned above.
    results_cache.update(X_norm)
    for node in nodes:
        node_inputs = [results_cache[placeholder] for placeholder in node.inputs]
        self._compute_node(node, node_inputs, results_cache)

    output_data = [results_cache[placeholder] for placeholder in outputs]
    return output_data[0] if len(output_data) == 1 else output_data
def _fit_compute_node(self, node, Xs, ys, cache, **fit_params):
    """Run ``node.fit_compute_func`` and record the result in ``cache``.

    Parameters
    ----------
    node
        Object exposing the ``fit_compute_func`` callable to execute.

    Xs
        Input data; passed through ``unlistify`` before the call.

    ys
        Target data. When truthy it is passed (unlistified) as the second
        positional argument; otherwise only the inputs are passed.

    cache
        Results cache, forwarded to ``self._update_cache`` together with the
        listified output and the node.

    **fit_params
        Extra keyword arguments forwarded to ``node.fit_compute_func``.
    """
    # TODO: same as _compute_node TODO?
    positional = (Xs, ys) if ys else (Xs,)
    raw_output = node.fit_compute_func(
        *[unlistify(data) for data in positional], **fit_params
    )
    self._update_cache(cache, listify(raw_output), node)
def test_split(x, indices_or_sections, teardown):
    """Check that a ``Split`` step in a model matches ``np.split`` on axis 0.

    ``teardown`` is accepted as an argument but is not used in the body.
    """
    model_input = Input()
    model_outputs = Split(indices_or_sections, axis=0)(model_input)
    model = Model(model_input, model_outputs)

    expected_chunks = np.split(x, indices_or_sections, axis=0)
    # predict may return a bare array for a single output, hence listify.
    predicted_chunks = listify(model.predict(x))

    for actual, expected in safezip2(predicted_chunks, expected_chunks):
        assert_array_equal(actual, expected)
def _compute_step(step, Xs, cache):
    """Run ``step.compute`` on ``Xs`` and store its outputs in ``cache``.

    NOTE(review): this takes no ``self``; presumably it is decorated as a
    static method outside this view -- confirm.

    Parameters
    ----------
    step
        Object exposing ``compute``, ``outputs`` and ``name``.

    Xs
        Input data, unpacked as the positional arguments of ``step.compute``.

    cache
        Mapping updated in place with the pairs produced by
        ``safezip2(step.outputs, <listified output>)``.

    Raises
    ------
    RuntimeError
        If pairing the step outputs with the computed data raises
        ``ValueError``; the original error is chained as the cause.
    """
    # TODO: Raise warning if computed output is already in cache.
    # This happens when recomputing a step that had a subset of its outputs
    # already passed in the inputs.
    # TODO: Some regressors have extra options in their predict method, and
    # they return a tuple of arrays.
    # https://scikit-learn.org/stable/glossary.html#term-predict
    computed = listify(step.compute(*Xs))

    try:
        cache.update(safezip2(step.outputs, computed))
    except ValueError as e:
        n_computed = len(computed)
        message = (
            "The number of output data elements ({}) does not match "
            "the number of {} outputs ({}).".format(
                n_computed, step.name, len(step.outputs)))
        raise RuntimeError(message) from e
def _normalize_list(
    data: ArrayLikes, data_placeholders: List[DataPlaceholder]
) -> Dict[DataPlaceholder, ArrayLike]:
    """Pair each data placeholder with its corresponding data element.

    Parameters
    ----------
    data
        A single array or a list of arrays; passed through ``listify`` first.

    data_placeholders
        Placeholders to use as keys, paired with ``data`` via ``safezip2``.

    Returns
    -------
    dict
        Mapping from each data placeholder to its data element.

    Raises
    ------
    ValueError
        If pairing the placeholders with the data raises ``ValueError``; it
        is re-raised with a message reporting both lengths, and the original
        error is chained as the cause.
    """
    arrays = listify(data)

    try:
        return dict(safezip2(data_placeholders, arrays))
    except ValueError as e:
        # TODO: Improve this message
        n_got = len(arrays)
        n_expected = len(data_placeholders)
        message = (
            "When passing inputs/outputs as a list or a single array, "
            "the number of arrays must match the number of inputs/outputs "
            "specified at instantiation. "
            "Got {}, expected: {}.".format(n_got, n_expected)
        )
        raise ValueError(message) from e
def test_listify(x, expected):
    """Check that ``listify(x)`` compares equal to ``expected``."""
    result = listify(x)
    assert result == expected
def __call__(
    self,
    inputs: Union[DataPlaceholder, List[DataPlaceholder]],
    targets: Optional[Union[DataPlaceholder, List[DataPlaceholder]]] = None,
    *,
    compute_func: Union[str, Callable[..., Any]] = "auto",
    fit_compute_func: Optional[Union[str, Callable[..., Any]]] = "auto",
    trainable: bool = True
) -> Union[DataPlaceholder, List[DataPlaceholder]]:
    """Call the step on input(s) and return the output(s) for further steps.

    The same step can be called on different inputs and targets to reuse it
    (similar to the concept of shared layers and nodes in Keras), with a
    different ``compute_func``/``trainable`` configuration on each call.
    This is achieved via "ports": each call creates a new port and
    associates the given configuration to it. The configuration at each
    port can be accessed with the ``get_*_at(port)`` methods.

    Parameters
    ----------
    inputs
        Input(s) to the step (outputs of previous steps).

    targets
        Target(s) to the step.

    compute_func
        Specifies which function must be used when computing the step
        during the model graph execution.

        If ``"auto"`` (default), it will use the ``predict`` or the
        ``transform`` method (in that order). If a name string is passed,
        it will use the method that matches the given name. If a callable
        is passed, it will use that callable when computing the step.

        The number of inputs and outputs of the function must match those
        of the step (this is not checked, but will raise an error during
        graph execution if there is a mismatch).

        scikit-learn classes typically implement a ``predict`` method
        (Estimators) or a ``transform`` method (Transformers), but with
        this argument you can, for example, specify ``predict_proba`` as
        the compute function.

    fit_compute_func
        Specifies which function must be used when fitting AND computing
        the step during the model graph execution.

        If ``"auto"`` (default), it will use the ``fit_predict`` or the
        ``fit_transform`` method (in that order) if they are implemented,
        otherwise it will be disabled. If a name string is passed, it will
        use the method that matches the given name. If a callable is
        passed, it will use that callable when fitting the step. If
        ``None`` is passed it will be ignored during graph execution.

        The number of inputs, outputs and targets of the function must
        match those of the step (this is not checked, but will raise an
        error during graph execution if there is a mismatch).

        By default, when a model is fit, the graph engine will for each
        step 1) execute ``fit`` to fit the step, and then 2) execute
        ``compute_func`` to compute the outputs required by successor
        steps. If a step specifies a ``fit_compute_func``, the graph
        execution will use that instead to fit and compute the outputs in
        a single call. This can be useful for

        1. leveraging implementations of ``fit_transform`` that are more
           efficient than calling ``fit`` and ``transform`` separately,
        2. using transductive estimators,
        3. implementing training protocols such as that of stacked
           classifiers, where the classifier in the first stage might
           compute out-of-fold predictions.

    trainable
        Whether the step is trainable (True) or not (False). This flag is
        only meaningful for steps with a fit method. Setting
        ``trainable=False`` allows skipping the step when fitting a Model.
        This is useful if you want to freeze some pre-trained steps.

    Returns
    -------
    DataPlaceholder or list of DataPlaceholder
        Output(s) of the step. A single placeholder is returned when the
        step has exactly one output; otherwise a shallow copy of the list
        of outputs is returned.

    Raises
    ------
    ValueError
        If ``inputs`` or (when given) ``targets`` are not data placeholders.

    RuntimeError
        If ``targets`` are given but the step has no ``fit`` attribute.

    Warns
    -----
    UserWarning
        If ``targets`` are given while ``trainable`` is False.
    """
    inputs = listify(inputs)
    if not is_data_placeholder_list(inputs):
        raise ValueError("inputs must be of type DataPlaceholder.")

    if targets is None:
        targets = []
    else:
        if not hasattr(self, "fit"):
            raise RuntimeError(
                "Cannot pass targets to steps that do not have a fit method."
            )

        # TODO: Consider inspecting the fit signature to determine whether
        # the step needs a target (i.e. fit(self, X, y)) or not (i.e.
        # fit(self, X, y=None)). The presence of a default of None for the
        # target might not be reliable though, as there could be estimators
        # (perhaps semi-supervised) that can take both target data and None.
        # Also, sklearn has meta-estimators (e.g. Pipeline) and
        # meta-transformers (e.g. SelectFromModel) that accept both target
        # data and None.
        #
        # Adding this inspection, however, could simplify the API by
        # rejecting early unnecessary targets (e.g. passing targets to PCA)
        # or warning missing targets (e.g. not passing targets to
        # LogisticRegression with trainable=True). This also avoids
        # unintuitive logic to allow superfluous targets during step call,
        # model instantiation and model fit.
        #
        # | requires target | trainable | passed target | result |
        # ----------------------------------------------------------
        # |       yes       |   True    |      yes      |   ok   |
        # |       yes       |   True    |      no       |  warn  |
        # |       yes       |   False   |      yes      |  warn  |
        # |       yes       |   False   |      no       |   ok   |
        # |       no        |     -     |      yes      | error  |
        # |       no        |     -     |      no       |   ok   |
        if not trainable:
            warnings.warn(
                UserWarning("You are passing targets to a non-trainable step.")
            )

        targets = listify(targets)
        if not is_data_placeholder_list(targets):
            raise ValueError(
                "If specified, targets must be of type DataPlaceholder."
            )

    outputs = self._build_outputs()

    # Record this call (connectivity plus per-call configuration) as a node.
    self._nodes.append(
        Node(
            self,
            inputs,
            outputs,
            targets,
            getattr(self, "fit", None),
            self._check_compute_func(compute_func),
            self._check_fit_compute_func(fit_compute_func),
            trainable,
        )
    )

    # For several outputs, return a shallow copy so the list stored in the
    # node is not modified when using the idiom of passing a variable
    # holding an output to another step and re-writing the variable with
    # the new output:
    #   zs = SomeMultiOutputStep()(...)
    #   zs[i] = SomeStep()(zs[i])
    return outputs[0] if self._n_outputs == 1 else list(outputs)
def predict(
    self,
    X: Union[ArrayLikes, DataDict],
    output_names: Optional[Union[str, List[str]]] = None,
) -> ArrayLikes:
    """Compute the requested outputs of the model for the given input data.

    **Models are query-able**. That is, outputs other than those specified
    at model instantiation can be requested. This allows querying
    intermediate outputs and eases debugging.

    Parameters
    ----------
    X
        Input data. It follows the same format as in the fit function.

    output_names
        Names of required outputs (optional). Any final or intermediate
        output can be requested by passing the name of its associated data
        placeholder. If not specified, the outputs specified at
        instantiation are returned.

    Returns
    -------
    The computed outputs. A single output is returned bare rather than
    wrapped in a one-element list.

    Raises
    ------
    ValueError
        If ``output_names`` contains duplicates.
    """
    # Holds the inputs plus every intermediate result computed below.
    results_cache = {}  # type: Dict[DataPlaceholder, ArrayLike]

    X_norm = self._normalize_data(X, self._internal_inputs)

    # Resolve which data placeholders have to be produced.
    if output_names is not None:
        names = listify(output_names)
        if len(set(names)) != len(names):
            raise ValueError("output_names must be unique.")
        outputs = [self.get_data_placeholder(name) for name in names]
    else:
        outputs = self._internal_outputs

    # Unused inputs are allowed so that different outputs can be debugged
    # without having to change the inputs accordingly.
    steps = self._get_required_steps(
        X_norm, [], outputs, allow_unused_inputs=True, follow_targets=False
    )

    # Seed the cache with the inputs, then compute each required step in
    # the order returned above.
    results_cache.update(X_norm)
    for step in steps:
        step_inputs = [results_cache[placeholder] for placeholder in step.inputs]
        self._compute_step(step, step_inputs, results_cache)

    output_data = [results_cache[placeholder] for placeholder in outputs]
    return output_data[0] if len(output_data) == 1 else output_data
def __call__(
    self,
    inputs: Union[DataPlaceholder, List[DataPlaceholder]],
    targets: Optional[Union[DataPlaceholder, List[DataPlaceholder]]] = None,
) -> Union[DataPlaceholder, List[DataPlaceholder]]:
    """Call the step on input(s) and return the output(s) for further steps.

    Parameters
    ----------
    inputs
        Input(s) to the step (outputs of previous steps).

    targets
        Target(s) to the step.

    Returns
    -------
    DataPlaceholder or list of DataPlaceholder
        Output(s) of the step. A single placeholder is returned when the
        step has exactly one output; otherwise a shallow copy of the list
        of outputs is returned.

    Raises
    ------
    ValueError
        If ``inputs`` or (when given) ``targets`` are not data placeholders.

    RuntimeError
        If ``targets`` are given but the step has no ``fit`` attribute.

    Warns
    -----
    UserWarning
        If ``targets`` are given while ``self.trainable`` is False.

    Notes
    -----
    Currently, calling the same step on different inputs and targets to
    reuse the step (similar to the concept of shared layers and nodes in
    Keras) is not supported. Calling a step twice on different inputs will
    override the connectivity from the first call. Support for shareable
    steps might be added in future releases.
    """
    inputs = listify(inputs)
    if not is_data_placeholder_list(inputs):
        raise ValueError("inputs must be of type DataPlaceholder.")

    if targets is None:
        targets = []
    else:
        if not hasattr(self, "fit"):
            raise RuntimeError(
                "Cannot pass targets to steps that do not have a fit method."
            )

        # TODO: Consider inspecting the fit signature to determine whether
        # the step needs a target (i.e. fit(self, X, y)) or not (i.e.
        # fit(self, X, y=None)). The presence of a default of None for the
        # target might not be reliable though, as there could be estimators
        # (perhaps semi-supervised) that can take both target data and None.
        # Also, sklearn has meta-estimators (e.g. Pipeline) and
        # meta-transformers (e.g. SelectFromModel) that accept both target
        # data and None.
        #
        # Adding this inspection, however, could simplify the API by
        # rejecting early unnecessary targets (e.g. passing targets to PCA)
        # or warning missing targets (e.g. not passing targets to
        # LogisticRegression with trainable=True). This also avoids
        # unintuitive logic to allow superfluous targets during step call,
        # model instantiation and model fit.
        #
        # | requires target | trainable | passed target | result |
        # ----------------------------------------------------------
        # |       yes       |   True    |      yes      |   ok   |
        # |       yes       |   True    |      no       |  warn  |
        # |       yes       |   False   |      yes      |  warn  |
        # |       yes       |   False   |      no       |   ok   |
        # |       no        |     -     |      yes      | error  |
        # |       no        |     -     |      no       |   ok   |
        if not self.trainable:
            warnings.warn(
                UserWarning("You are passing targets to a non-trainable step.")
            )

        targets = listify(targets)
        if not is_data_placeholder_list(targets):
            raise ValueError(
                "If specified, targets must be of type DataPlaceholder."
            )

    # Store the connectivity of this call on the step itself; a later call
    # overwrites these attributes.
    self._inputs = inputs
    self._targets = targets
    self._outputs = self._build_outputs()

    # For several outputs, return a shallow copy to avoid modifying
    # self._outputs when using the idiom of passing a variable holding an
    # output to another step and re-writing the variable with the new
    # output:
    #   zs = SomeMultiOutputStep()(...)
    #   zs[i] = SomeStep()(zs[i])
    return self._outputs[0] if self._n_outputs == 1 else list(self.outputs)