def wrapped(cells):
    """Convert the incoming cell(s) to tensors, call ``func`` and return its result.

    When ``nr_args`` is greater than one, ``cells`` is indexed positionally
    (via ``.iloc`` if available, plain indexing otherwise) to obtain one
    argument per position; otherwise ``cells`` itself is the single argument.
    Arguments that are arrays, pandas objects, numbers, lists, tuples or sets
    are wrapped in ``t.Tensor``; anything else is passed through untouched.

    :param cells: a single cell or an indexable collection of cells
    :return: the raw result of ``func`` or, if ``return_numpy`` is set, its
        numpy representation (a python scalar if the result holds at most
        one element along its summed shape)
    """
    convertible = (np.ndarray, pd.Series, pd.DataFrame, Number, list, tuple, set)

    def as_argument(cell):
        # anything we do not know how to convert is handed to func as is
        if not isinstance(cell, convertible):
            return cell

        if hasattr(cell, 'item') and sum(cell.shape) <= 1:
            # single element container: unwrap it, nested objects get unpacked
            if cell.dtype == 'object':
                payload = unpack_nested_arrays(cell.item())
            else:
                payload = [cell.item()]
        elif hasattr(cell, 'values'):
            # pandas like object: use the underlying values
            if cell.dtype == 'object':
                payload = unpack_nested_arrays(cell.values)
            else:
                payload = cell.values
        elif isinstance(cell, Iterable):
            payload = cell
        else:
            # plain scalar
            payload = [cell]

        return t.Tensor(payload)

    if nr_args > 1:
        source = cells.iloc if hasattr(cells, 'iloc') else cells
        arguments = [as_argument(source[i]) for i in range(nr_args)]
    else:
        arguments = [as_argument(cells) for _ in range(nr_args)]

    result = func(*arguments)

    if not return_numpy:
        return result

    return result.numpy() if sum(result.shape) > 1 else result.item()
def calculate_loss(self, fold, x, y_true, weight) -> float:
    """Return the (optionally weighted) mean squared error of the model on ``x``.

    :param fold: not used by this implementation
    :param x: features, unpacked with the helper's default arguments and
        passed through ``_AbstractSkModel.reshape_rnn_as_ar`` before predicting
    :param y_true: labels, unpacked without splitting multi index rows and
        reshaped to the shape of the prediction
    :param weight: optional sample weights (object exposing ``.values``) or None
    :return: ``metrics.mean_squared_error`` of labels versus prediction
    """
    model = self.sk_model

    features = _AbstractSkModel.reshape_rnn_as_ar(unpack_nested_arrays(x))
    prediction = model.predict(features)

    # bring the labels into the very same shape as the prediction
    labels = unpack_nested_arrays(y_true, split_multi_index_rows=False)
    labels = labels.reshape(prediction.shape)

    if weight is None:
        sample_weight = None
    else:
        # flatten the weights into a 1D vector
        sample_weight = weight.values.reshape(-1, )

    return metrics.mean_squared_error(labels, prediction, sample_weight=sample_weight)
def get_values(self, split_multi_index_rows=True, squeeze=False, dtype=None):
    """Return the unpacked values of ``self.df``, shaped after a column MultiIndex if present.

    :param split_multi_index_rows: forwarded to ``unpack_nested_arrays``
    :param squeeze: if True, drop a trailing dimension of size one when the
        array has more than two dimensions and ``shape[2] == 1``
    :param dtype: forwarded to ``unpack_nested_arrays``
    :return: a single array, or a list of arrays when the unpacked result is a list
    """
    raw = unpack_nested_arrays(self.df, split_multi_index_rows, dtype)

    def shape_like_columns(arr):
        multi_columns = has_indexed_columns(self.df) and isinstance(
            self.df.columns, pd.MultiIndex)

        if multi_columns:
            levels = multi_index_shape(self.df.columns)

            try:
                # attempt to fold the column levels into dedicated dimensions
                arr = arr.reshape(
                    (arr.shape[0], ) + levels + arr.shape[len(levels):])
            except ValueError as ve:
                # a shape mismatch is tolerated (the array stays as it is),
                # every other ValueError is a real problem
                if "cannot reshape array" not in str(ve):
                    raise ve

        if squeeze and arr.ndim > 2 and arr.shape[2] == 1:
            arr = arr.reshape(arr.shape[:-1])

        return arr

    # a list of arrays gets reshaped element by element
    if isinstance(raw, List):
        return [shape_like_columns(part) for part in raw]

    return shape_like_columns(raw)
def test_nested_values_invalid_shape(self):
    """given a non-symmetrical nested array, unpacking raises a ValueError"""
    # given: column "a" holds lists of length 2, column "b" lists of length 3
    nr_rows = 5
    frame = pd.DataFrame({
        "a": [[1, 2] for _ in range(nr_rows)],
        "b": [[1, 2, 3] for _ in range(nr_rows)],
    })

    # when extracted then the shape can not be derived
    with self.assertRaises(ValueError):
        unpack_nested_arrays(frame)
def test_nested_values(self):
    """given a symmetrical nested array, unpacking yields shape (5, 2, 2)"""
    # given: two columns, each cell holding a list of length 2
    nr_rows = 5
    frame = pd.DataFrame({
        "a": [[1, 2] for _ in range(nr_rows)],
        "b": [[1, 2] for _ in range(nr_rows)],
    })

    # when extracted
    unpacked = unpack_nested_arrays(frame)

    # then the shape is rows x columns x nested length
    self.assertEqual((5, 2, 2), unpacked.shape)
def ecdf(v):
    """Map every element of ``v`` to its empirical cumulative distribution value.

    All elements of ``v`` are pooled into one sample, sorted, and each element
    is replaced by ``(number of pooled samples <= element) + 1`` divided by
    the total number of pooled samples.

    :param v: numpy array of any shape, or a pandas DataFrame/Series which is
        converted via ``unpack_nested_arrays`` first
    :return: numpy array of the same shape as ``v``
    """
    if isinstance(v, (_pd.DataFrame, _pd.Series)):
        v = unpack_nested_arrays(v)

    shape = v.shape
    x = _np.sort(v.flatten())

    # Normalise by the number of pooled samples. ``len(v)`` would only be the
    # size of the first dimension and is therefore wrong for any input with
    # more than one column.
    nr_samples = x.size

    # NOTE(review): the ``+ 1`` offset makes the largest value (n + 1) / n, i.e.
    # slightly above 1 - kept as is because callers may rely on it, TODO confirm
    return ((_np.searchsorted(x, v, side='right') + 1) / nr_samples).reshape(shape)
def fit_batch(self, x: pd.DataFrame, y: pd.DataFrame, weight: pd.DataFrame,
              fold: int, **kwargs):
    """Fit the wrapped sklearn model on one batch of data.

    If any of ``epochs``, ``batch_size`` or ``fold_epochs`` of the fit meta
    data is greater than one, the model is trained incrementally via
    ``partial_fit``; otherwise a regular ``fit`` is executed.

    :param x: features data frame
    :param y: labels data frame
    :param weight: sample weights; accepted as part of the signature but not
        passed on to the sklearn model by this implementation
    :param fold: not used by this implementation
    :param kwargs: an optional ``classes`` entry is forwarded to ``partial_fit``
    :raises ValueError: if incremental fitting is required but the model has
        no ``partial_fit``, or if ``partial_fit`` fails while no ``classes``
        argument was provided (the original error is chained as the cause)
    """
    # convert data frames to numpy arrays
    _x = _AbstractSkModel.reshape_rnn_as_ar(
        unpack_nested_arrays(x, split_multi_index_rows=False))
    _y = unpack_nested_arrays(y, split_multi_index_rows=False)

    # collapse a singleton second dimension: first keep any trailing
    # dimensions, then flatten to 1D if only a single column is left
    if _y.ndim > 1 and _y.shape[1] == 1:
        _y = _y.reshape((len(_x), -1))
    if _y.ndim == 2 and _y.shape[1] == 1:
        _y = _y.reshape(len(_x))

    # remember the label shape of the very first batch
    if self._label_shape is None:
        self._label_shape = _y.shape

    par = self._fit_meta_data
    partial_fit = any(
        size > 1
        for size in (par.epochs, par.batch_size, par.fold_epochs)
        if size is not None)

    if not partial_fit:
        self.sk_model = self.sk_model.fit(_x, _y)
        return

    # multiple epochs/batches require an incrementally trainable model
    if not hasattr(self.sk_model, "partial_fit"):
        raise ValueError(
            f"This of model does not support `partial_fit` {type(self.sk_model)} - "
            f"and therefore does not support epochs or batches.")

    kw_classes = {"classes": kwargs["classes"]} if "classes" in kwargs else {}

    try:
        self.sk_model = self.sk_model.partial_fit(_x, _y, **kw_classes)
    except Exception as e:
        if "classes" in kwargs:
            # classes were provided, so this is a genuine error
            raise

        # keep the original exception as argument and as explicit cause
        raise ValueError(
            "You might need to pass 'classes' argument for partial fitting",
            e) from e
def calculate_loss(self, fold, x, y_true, weight):
    """Return the loss of the model on ``x``: log loss for classifiers, MSE otherwise.

    :param fold: forwarded to ``self._predict``
    :param x: features passed to ``self._predict``
    :param y_true: labels, unpacked without splitting multi index rows and
        reshaped to the shape of the prediction
    :param weight: optional sample weights (object exposing ``.values``) or None
    :return: ``metrics.log_loss`` if the model is a ``ClassifierMixin``,
        ``metrics.mean_squared_error`` otherwise
    """
    model = self.sk_model
    prediction = self._predict(model, x, fold=fold)

    # bring the labels into the very same shape as the prediction
    labels = unpack_nested_arrays(y_true, split_multi_index_rows=False)
    labels = labels.reshape(prediction.shape)

    if weight is None:
        sample_weight = None
    else:
        # flatten the weights into a 1D vector
        sample_weight = weight.values.reshape(-1, )

    # classifiers are scored by log loss, everything else by squared error
    if isinstance(self.sk_model, ClassifierMixin):
        loss_function = metrics.log_loss
    else:
        loss_function = metrics.mean_squared_error

    return loss_function(labels, prediction, sample_weight=sample_weight)
def test_nested_values_row_multiindex(self):
    """given a row-MultiIndex DataFrame, unpacking yields one array per top level group"""
    # given: 7 rows in group "A" followed by 3 rows in group "B"
    group_a = [("A", i) for i in range(7)]
    group_b = [("B", i) for i in range(7, 10)]
    row_index = pd.MultiIndex.from_tuples(group_a + group_b)
    frame = pd.DataFrame(np.ones((10, 3)), index=row_index)

    # when extracting values
    unpacked = unpack_nested_arrays(frame)

    # then we have a list of numpy arrays
    self.assertEqual(2, len(unpacked))
    self.assertEqual((7, 3), unpacked[0].shape)
    self.assertEqual((3, 3), unpacked[1].shape)
def test_nested_values_column_multiindex(self):
    """given nested arrays below a column MultiIndex, unpacking yields shape (5, 4, 2)"""
    # given: 5 rows and 4 multi indexed columns, every cell holds an array of length 2
    cells = pd.DataFrame(
        [[np.array([1, 2]) for _ in range(5)] for _ in range(4)]).T.values
    column_index = pd.MultiIndex.from_tuples([
        ("A", 0), ("A", 1), ("B", 0), ("B", 1)
    ])
    df = pd.DataFrame(cells, columns=column_index)
    print(df)

    # when extracted
    unpacked = unpack_nested_arrays(df)

    # then the shape is rows x columns x nested length
    self.assertEqual((5, 4, 2), unpacked.shape)
def values(self) -> np.ndarray:
    """
    Unlike pandas.values, this returns the unpacked (possibly nested) cell
    values and, when the columns are a MultiIndex, reshapes them so that the
    levels of the column index become dedicated dimensions.

    :return: numpy array as returned by ``unpack_nested_arrays``, reshaped to
        ``(rows,) + multi_index_shape(columns) + remaining dimensions`` if
        the columns are a MultiIndex
    """
    unpacked = unpack_nested_arrays(self.df)

    has_multi_index_columns = hasattr(self.df, 'columns') and isinstance(
        self.df.columns, pd.MultiIndex)

    # nothing to reshape without a multi index
    if not has_multi_index_columns:
        return unpacked

    levels = multi_index_shape(self.df.columns)
    target_shape = (unpacked.shape[0],) + levels + unpacked.shape[len(levels):]

    return unpacked.reshape(target_shape)
def _decode(self, latent_features: pd.DataFrame, samples,
            **kwargs) -> Typing.PatchedDataFrame:
    """Decode latent features using the decoder part of the fitted model.

    A fresh ``MLPRegressor`` is created with ``self.decoder_layers`` as hidden
    layers (plus ``self.kwargs``), and the coefficients and intercepts of the
    fitted model from index ``len(self.encoder_layers)`` onwards are copied
    into it.

    :param latent_features: data frame of the latent representation
    :param samples: not used by this implementation
    :param kwargs: not used by this implementation
    :return: decoded values as data frame, indexed like ``latent_features``
        with ``self._feature_columns`` as columns
    :raises ValueError: if the model has no ``coefs_`` attribute
    """
    fitted = self.sk_model

    if not hasattr(fitted, 'coefs_'):
        raise ValueError("Model needs to be 'fit' first!")

    decoder_args = {"hidden_layer_sizes": self.decoder_layers, **self.kwargs}
    decoder = call_callable_dynamic_args(MLPRegressor, **decoder_args)

    # hand over the trained parameters from the first decoder layer onwards
    decoder.coefs_ = fitted.coefs_[len(self.encoder_layers):].copy()
    decoder.intercepts_ = fitted.intercepts_[len(self.encoder_layers):].copy()
    decoder.n_layers_ = len(decoder.coefs_) + 1
    decoder.n_outputs_ = self.layers[-1]
    decoder.out_activation_ = fitted.out_activation_

    latent = unpack_nested_arrays(latent_features, split_multi_index_rows=False)
    decoded = decoder.predict(_AbstractSkModel.reshape_rnn_as_ar(latent))

    return to_pandas(decoded, latent_features.index, self._feature_columns)
def _predict(self, skm, features: pd.DataFrame, samples=1,
             **kwargs) -> np.ndarray:
    """Predict with ``skm``, preferring ``predict_proba`` if the model offers it.

    :param skm: the sklearn model used for the prediction
    :param features: features data frame, unpacked without splitting multi
        index rows and passed through ``_AbstractSkModel.reshape_rnn_as_ar``
    :param samples: if greater than one, the prediction is repeated that many
        times and the repetitions are stacked with axes 0 and 1 swapped
    :param kwargs: not used by this implementation
    :return: for models with a callable ``predict_proba``: ``1 - proba[:, 0]``
        when the label shape is 1D or has a second dimension of one, else the
        probabilities reshaped to ``(-1, *label_shape[1:])``; for all other
        models the result of ``predict``
    """
    x = _AbstractSkModel.reshape_rnn_as_ar(
        unpack_nested_arrays(features, split_multi_index_rows=False))
    is_probabilistic = callable(getattr(skm, 'predict_proba', None))

    def predict_once():
        if not is_probabilistic:
            return skm.predict(x)

        probabilities = skm.predict_proba(x)
        label_shape = self._label_shape

        if len(label_shape) == 1 or label_shape[1] == 1:
            # binary case: report 1 minus the probability of the first class
            return 1 - probabilities[:, 0]

        return probabilities.reshape(-1, *label_shape[1:])

    if samples > 1:
        repeated = np.array([predict_once() for _ in range(samples)])
        return repeated.swapaxes(0, 1)

    return predict_once()
def scaler(row):
    """Normalize the values of ``row`` according to the enclosing ``normalizer``.

    Supported normalizers:

    * ``'minmax01'``: min/max scaling over all values of the row into [0, 1]
    * ``'minmax-11'``: min/max scaling over all values of the row into [-1, 1]
    * ``'standard'``: ``(value - mean) / std`` over all values of the row
    * ``'uniform'``: ``ecdf`` over all values of the row
    * a callable: called with the original ``row`` (not the unpacked values)

    :param row: the row to normalize, unpacked via ``unpack_nested_arrays``
    :return: the normalized values in the shape of the unpacked row, or
        whatever a callable normalizer returns
    :raises ValueError: if ``normalizer`` is none of the above
    """
    values = unpack_nested_arrays(row, split_multi_index_rows=False)
    # the scalers expect samples in rows, so treat all values as one column
    values_2d = values.reshape(-1, 1)

    if normalizer == 'minmax01':
        return MinMaxScaler().fit(values_2d).transform(
            values_2d).reshape(values.shape)
    elif normalizer == 'minmax-11':
        return MinMaxScaler(feature_range=(
            -1, 1)).fit(values_2d).transform(values_2d).reshape(
                values.shape)
    elif normalizer == 'standard':
        # (value - mean) / std
        # the parentheses matter: without them only the mean gets divided
        return (values - values.mean()) / np.std(values)
    elif normalizer == 'uniform':
        return ecdf(values_2d).reshape(values.shape)
    elif callable(normalizer):
        return normalizer(row)
    else:
        raise ValueError(
            'unknown normalizer need to one of: [minmax01, minmax-11, uniform, standard, callable(r)]'
        )
def _auto_encode(self, features: pd.DataFrame, samples,
                 **kwargs) -> Typing.PatchedDataFrame:
    """Run the features through the model and return the prediction as data frame.

    :param features: features data frame, unpacked without splitting multi
        index rows and passed through ``_AbstractSkModel.reshape_rnn_as_ar``
    :param samples: not used by this implementation
    :param kwargs: not used by this implementation
    :return: the prediction of ``self.sk_model``, indexed like ``features``
        with ``self._labels_columns`` as columns
    """
    unpacked = unpack_nested_arrays(features, split_multi_index_rows=False)
    model_input = _AbstractSkModel.reshape_rnn_as_ar(unpacked)
    reconstruction = self.sk_model.predict(model_input)

    return to_pandas(reconstruction, features.index, self._labels_columns)