Example #1
def extract_feature_labels_weights(
    df: pd.DataFrame, features_and_labels, **kwargs
) -> Tuple[Tuple[pd.DataFrame, int], pd.DataFrame, pd.DataFrame, pd.Series,
           pd.Series]:
    features = get_pandas_object(df, features_and_labels.features,
                                 **kwargs).dropna()
    labels = get_pandas_object(df, features_and_labels.labels,
                               **kwargs).dropna()
    targets = call_if_not_none(
        get_pandas_object(df, features_and_labels.targets, **kwargs), 'dropna')
    sample_weights = call_if_not_none(
        get_pandas_object(df, features_and_labels.sample_weights, **kwargs),
        'dropna')
    gross_loss = call_if_not_none(
        get_pandas_object(df, features_and_labels.gross_loss, **kwargs),
        'dropna')
    common_index = intersection_of_index(features, labels, targets,
                                         sample_weights, gross_loss)

    if features_and_labels.label_type is not None:
        labels = labels.astype(features_and_labels.label_type)

    return ((features.loc[common_index], len(df) - len(features) + 1),
            labels.loc[common_index], loc_if_not_none(targets, common_index),
            loc_if_not_none(sample_weights, common_index),
            loc_if_not_none(gross_loss, common_index))
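Note: all of these examples lean on a few small None-tolerant helpers from pandas_ml_common. The sketch below gives plausible minimal implementations, reconstructed purely from how the examples on this page use them; the library's actual code may differ.

def loc_if_not_none(frame, index):
    # select rows by index, passing None through untouched
    return None if frame is None else frame.loc[index]

def call_if_not_none(obj, method, *args, **kwargs):
    # call a named method such as 'dropna', passing None through untouched
    return None if obj is None else getattr(obj, method)(*args, **kwargs)

def intersection_of_index(*frames):
    # intersect the indexes of all non-None frames
    indexes = [f.index for f in frames if f is not None]
    common = indexes[0]
    for index in indexes[1:]:
        common = common.intersection(index)
    return common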
Example #2
def extract_feature_labels_weights(df: Typing.PatchedDataFrame,
                                   features_and_labels,
                                   **kwargs) -> FeaturesWithLabels:
    features, targets, latent = extract_features(df, features_and_labels,
                                                 **kwargs)
    labels = extract_labels(df, features_and_labels, **kwargs)
    sample_weights = call_if_not_none(
        get_pandas_object(df, features_and_labels.sample_weights, **kwargs),
        'dropna')
    gross_loss = call_if_not_none(
        get_pandas_object(df, features_and_labels.gross_loss, **kwargs),
        'dropna')

    # sanity check for infinite values in any of the data frames
    for frame in [features, labels, targets, sample_weights, gross_loss]:
        if frame is not None:
            # we could have nested arrays so we need to use the un-nested values
            values = flatten_nested_list(frame._.values, np.max)
            max_value = max([v.max() for v in values])

            if np.isscalar(max_value) and np.isinf(max_value):
                _log.warning(
                    f"features contain infinite numbers\n"
                    f"{frame[frame.apply(lambda r: np.isinf(r.values).any(), axis=1)]}"
                )
                frame.replace([np.inf, -np.inf], np.nan, inplace=True)
                frame.dropna(inplace=True)

    # now get the common index and return the filtered data frames
    common_index = intersection_of_index(features, labels, targets,
                                         sample_weights, gross_loss)

    return FeaturesWithLabels(
        FeaturesWithRequiredSamples(
            tuple([f.loc[common_index] for f in features]) if isinstance(
                features, tuple) else features.loc[common_index],
            len(df) - len(features) + 1, len(features.columns)),
        labels.loc[common_index], loc_if_not_none(latent, common_index),
        loc_if_not_none(targets, common_index),
        loc_if_not_none(sample_weights, common_index),
        loc_if_not_none(gross_loss, common_index))
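The features_and_labels argument is only ever read attribute by attribute. A minimal hypothetical stand-in that satisfies the extractors above could look like this (the attribute set is inferred from the code; the real FeaturesAndLabels class carries considerably more logic):

from dataclasses import dataclass, field
from typing import Any, Optional

@dataclass
class FeaturesAndLabelsStub:
    # hypothetical stand-in carrying only the attributes the extractors read
    features: Any                      # e.g. a list of column names or callables
    labels: Any
    targets: Any = None
    sample_weights: Any = None
    gross_loss: Any = None
    latent: Any = None
    label_type: Optional[type] = None
    kwargs: dict = field(default_factory=dict)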
Example #3
def extract_feature_labels_weights(
    df: Typing.PatchedDataFrame, features_and_labels, **kwargs
) -> Tuple[Tuple[pd.DataFrame, int], pd.DataFrame, pd.DataFrame, pd.Series,
           pd.Series]:
    _, features, targets = extract_features(df, features_and_labels, **kwargs)
    labels = get_pandas_object(df, features_and_labels.labels,
                               **kwargs).dropna()
    sample_weights = call_if_not_none(
        get_pandas_object(df, features_and_labels.sample_weights, **kwargs),
        'dropna')
    gross_loss = call_if_not_none(
        get_pandas_object(df, features_and_labels.gross_loss, **kwargs),
        'dropna')

    if features_and_labels.label_type is not None:
        labels = labels.astype(features_and_labels.label_type)

    # sanity check for infinite values in any of the data frames
    for frame in [features, labels, targets, sample_weights, gross_loss]:
        if frame is not None:
            max_value = frame._.max()

            if np.isscalar(max_value) and np.isinf(max_value):
                _log.warning(
                    f"features contain infinite numbers\n"
                    f"{frame[frame.apply(lambda r: np.isinf(r.values).any(), axis=1)]}"
                )
                frame.replace([np.inf, -np.inf], np.nan, inplace=True)
                frame.dropna(inplace=True)

    # now get the common index and return the filtered data frames
    common_index = intersection_of_index(features, labels, targets,
                                         sample_weights, gross_loss)

    return ((tuple([f.loc[common_index] for f in features])
             if isinstance(features, tuple) else features.loc[common_index],
             len(df) - len(features) + 1), labels.loc[common_index],
            loc_if_not_none(targets, common_index),
            loc_if_not_none(sample_weights, common_index),
            loc_if_not_none(gross_loss, common_index))
Example #4
def extract_features(df: pd.DataFrame, features_and_labels,
                     **kwargs) -> Tuple[List, pd.DataFrame, pd.DataFrame]:
    features = get_pandas_object(df, features_and_labels.features,
                                 **kwargs).dropna()
    targets = call_if_not_none(
        get_pandas_object(df, features_and_labels.targets, **kwargs), 'dropna')
    common_index = intersection_of_index(features, targets)

    if len(features) <= 0:
        raise ValueError("not enough data!")

    return (features_and_labels.label_columns, features.loc[common_index],
            loc_if_not_none(targets, common_index))
Example #5
def extract_feature_labels_weights(
    df: Typing.PatchedDataFrame, features_and_labels, **kwargs
) -> Tuple[Tuple[pd.DataFrame, int], pd.DataFrame, pd.DataFrame, pd.Series,
           pd.Series]:
    features = get_pandas_object(df, features_and_labels.features,
                                 **kwargs).dropna()
    labels = get_pandas_object(df, features_and_labels.labels,
                               **kwargs).dropna()
    targets = call_if_not_none(
        get_pandas_object(df, features_and_labels.targets, **kwargs), 'dropna')
    sample_weights = call_if_not_none(
        get_pandas_object(df, features_and_labels.sample_weights, **kwargs),
        'dropna')
    gross_loss = call_if_not_none(
        get_pandas_object(df, features_and_labels.gross_loss, **kwargs),
        'dropna')

    if features_and_labels.label_type is not None:
        labels = labels.astype(features_and_labels.label_type)

    # sanity check for infinite values in any of the data frames
    for frame in [features, labels, targets, sample_weights, gross_loss]:
        if frame is not None:
            max_value = frame._.values.max()

            if np.isscalar(max_value) and np.isinf(max_value):
                _log.warning(
                    f"features contain infinite numbers\n"
                    f"{frame[frame.apply(lambda r: np.isinf(r.values).any(), axis=1)]}"
                )
                frame.replace([np.inf, -np.inf], np.nan, inplace=True)
                frame.dropna(inplace=True)

    common_index = intersection_of_index(features, labels, targets,
                                         sample_weights, gross_loss)

    return ((features.loc[common_index], len(df) - len(features) + 1),
            labels.loc[common_index], loc_if_not_none(targets, common_index),
            loc_if_not_none(sample_weights, common_index),
            loc_if_not_none(gross_loss, common_index))
Example #6
def extract_features(df: pd.DataFrame, features_and_labels,
                     **kwargs) -> FeaturesWithTargets:
    if isinstance(features_and_labels.features, tuple):
        # allow multiple feature sets, e.g. for multi-input layered networks
        features = MultiFrameDecorator([
            get_pandas_object(df, f, **kwargs).dropna()
            for f in features_and_labels.features
        ], True)
    else:
        features = get_pandas_object(df, features_and_labels.features,
                                     **kwargs).dropna()

    targets = call_if_not_none(
        get_pandas_object(df, features_and_labels.targets, **kwargs), 'dropna')
    latent = call_if_not_none(
        get_pandas_object(df, features_and_labels.latent, **kwargs), 'dropna')
    common_index = intersection_of_index(features, targets)

    if len(features) <= 0:
        raise ValueError("not enough data!")

    return FeaturesWithTargets(features.loc[common_index],
                               loc_if_not_none(targets, common_index),
                               loc_if_not_none(latent, common_index))
Example #7
    def train_test_sampler(self) -> Sampler:
        train_idx, test_idx = self.splitter.train_test_split(
            self.frames[0].index)
        train = [loc_if_not_none(frame, train_idx) for frame in self.frames]
        test = [loc_if_not_none(frame, test_idx) for frame in self.frames]
        return Sampler(train, test, self.splitter.cross_validation)
Example #8
    def __init__(self,
                 frames: XYWeight,
                 splitter: Callable[[Any], Tuple[pd.Index, pd.Index]] = None,
                 filter: Callable[[Any], bool] = None,
                 cross_validation: Union['BaseCrossValidator',
                                         Callable[[Any],
                                                  Generator[Tuple[np.ndarray,
                                                                  np.ndarray],
                                                            None,
                                                            None]]] = None,
                 epochs: int = 1,
                 batch_size: int = None,
                 fold_epochs: int = 1,
                 on_start: Callable = None,
                 on_epoch: Callable = None,
                 on_batch: Callable = None,
                 on_fold: Callable = None,
                 on_fold_epoch: Callable = None,
                 after_epoch: Callable = None,
                 after_batch: Callable = None,
                 after_fold: Callable = None,
                 after_fold_epoch: Callable = None,
                 after_end: Callable = None,
                 **kwargs):
        self.common_index = intersection_of_index(*frames).sort_values()
        self.frames = XYWeight(
            *[loc_if_not_none(f, self.common_index) for f in frames])
        self.epochs = epochs
        self.batch_size = batch_size
        self.fold_epochs = fold_epochs
        self.splitter = splitter
        self.filter = filter

        # callbacks
        self.on_start = on_start
        self.on_epoch = on_epoch
        self.on_batch = on_batch
        self.on_fold = on_fold
        self.on_fold_epoch = on_fold_epoch
        self.after_epoch = after_epoch
        self.after_batch = after_batch
        self.after_fold = after_fold
        self.after_fold_epoch = after_fold_epoch
        self.after_end = after_end

        # split training and test data
        if self.splitter is not None:
            if isinstance(self.common_index, pd.MultiIndex):
                _log.warning(
                    "The data provided uses a `MultiIndex`; you may want to set the "
                    "`partition_row_multi_index` parameter in your splitter")

            self.train_idx, self.test_idx = call_callable_dynamic_args(
                self.splitter, self.common_index, **self.frames.to_dict())
        else:
            self.train_idx, self.test_idx = self.common_index, pd.Index([])

        if cross_validation is not None:
            if isinstance(self.common_index, pd.MultiIndex) and not isinstance(
                    cross_validation, PartitionedOnRowMultiIndexCV):
                # cross validators need to fold within each group of a MultiIndex row index; a wrapper can be provided
                _log.warning(
                    "The data provided uses a `MultiIndex` but the cross validation is not wrapped in "
                    "`PartitionedOnRowMultiIndexCV`")

            if epochs is None or epochs > 1:
                _log.warning(
                    f"using epochs > 1 together with cross validation may lead to different folds for each epoch! "
                    f"{cross_validation}")

            self.nr_folds = cross_validation.get_n_splits() if hasattr(
                cross_validation, "get_n_splits") else -1
            self.cross_validation = cross_validation.split if hasattr(
                cross_validation, "split") else cross_validation
        else:
            self.nr_folds = None
            self.cross_validation = None
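A hedged usage sketch for this constructor: assuming XYWeight simply bundles the x/y/weight frames, a Sampler can be wired up with an sklearn-style splitter. The lambda contract (index in, (train_index, test_index) out) is inferred from the code above, not taken from the library's documentation.

import pandas as pd
from sklearn.model_selection import train_test_split

features = pd.DataFrame({"f1": range(100)})
labels = pd.DataFrame({"label": [i % 2 for i in range(100)]})

sampler = Sampler(
    XYWeight(features, labels, None),  # the weight frame may simply be None
    splitter=lambda index: train_test_split(index, test_size=0.3),
    epochs=2,
    batch_size=32,
)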
Example #9
    def to_dict(self, loc=None):
        d = {"x": self.x, "y": self.y, "weight": self.weight}
        if loc is not None:
            d = {k: loc_if_not_none(v, loc) for k, v in d.items()}

        return d
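For illustration, to_dict returns the three frames keyed by name, optionally restricted to a row selection; assuming XYWeight behaves like a named tuple of x, y and weight:

import pandas as pd

x = pd.DataFrame({"f": [1.0, 2.0, 3.0]})
y = pd.DataFrame({"l": [0, 1, 0]})
xyw = XYWeight(x, y, None)
xyw.to_dict()            # {"x": x, "y": y, "weight": None}
xyw.to_dict(loc=[0, 2])  # same keys, every non-None frame reduced to rows 0 and 2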
Example #10
    def sample_for_training(self) -> Generator[FoldXYWeight, None, None]:
        cross_validation = self.cross_validation if self.cross_validation is not None else lambda x: [
            (None, None)
        ]

        # filter samples
        if self.filter is not None:
            train_idx = [
                idx for idx in self.train_idx if call_callable_dynamic_args(
                    self.filter, idx, **self.frames.to_dict(idx))
            ]
        else:
            train_idx = self.train_idx

        # update frame views
        train_frames = XYWeight(
            *[loc_if_not_none(f, train_idx) for f in self.frames])
        test_frames = XYWeight(
            *[loc_if_not_none(f, self.test_idx) for f in self.frames])

        # call for start ...
        call_callable_dynamic_args(
            self.on_start,
            epochs=self.epochs,
            batch_size=self.batch_size,
            fold_epochs=self.fold_epochs,
            features=exec_if_not_none(lambda x: x.columns.tolist(),
                                      self.frames.x),
            labels=exec_if_not_none(lambda y: y.columns.tolist(),
                                    self.frames.y),
            cross_validation=self.nr_folds is not None)

        # generate samples; when epochs is None, iter(int, 1) yields 0 forever (i.e. endless epochs)
        for epoch in (range(self.epochs) if self.epochs is not None else iter(
                int, 1)):
            call_callable_dynamic_args(self.on_epoch, epoch=epoch)
            fold_iter = enumerate(
                call_callable_dynamic_args(cross_validation, train_idx,
                                           **train_frames.to_dict()))
            for fold, (cv_train_i, cv_test_i) in fold_iter:
                call_callable_dynamic_args(self.on_fold,
                                           epoch=epoch,
                                           fold=fold)

                # if we don't have any cross validation, the training and test sets stay unchanged
                cv_train_idx = train_idx if cv_train_i is None else train_idx[
                    cv_train_i]

                # build our test data sets
                if cv_test_i is not None:
                    if cv_test_i.ndim > 1:
                        cv_test_frames = [
                            XYWeight(*[
                                loc_if_not_none(f, train_idx[cv_test_i[:, i]])
                                for f in self.frames
                            ]) for i in range(cv_test_i.shape[1])
                        ]
                    else:
                        cv_test_frames = [
                            XYWeight(*[
                                loc_if_not_none(f, train_idx[cv_test_i])
                                for f in self.frames
                            ])
                        ]
                else:
                    if len(self.test_idx) <= 0:
                        cv_test_frames = []
                    else:
                        cv_test_frames = [
                            XYWeight(*[
                                loc_if_not_none(f, self.test_idx)
                                for f in self.frames
                            ])
                        ]

                for fold_epoch in range(self.fold_epochs):
                    call_callable_dynamic_args(self.on_fold,
                                               epoch=epoch,
                                               fold=fold,
                                               fold_epoch=fold_epoch)

                    # build our training data sets aka batches
                    cv_train_frames = XYWeight(*[
                        loc_if_not_none(f, cv_train_idx) for f in self.frames
                    ])

                    # theoretically we could already yield cv_train_frames, cv_test_frames
                    # but let's create batches first and then yield everything together
                    nr_instances = len(cv_train_idx)
                    # start the last batch one row early, presumably to avoid a trailing batch of size one
                    nice_i = max(nr_instances - 2, 0)
                    bs = min(nr_instances, self.batch_size
                             ) if self.batch_size is not None else nr_instances

                    batch_iter = range(0, nr_instances, bs)
                    for i in batch_iter:
                        call_callable_dynamic_args(self.on_batch,
                                                   epoch=epoch,
                                                   fold=fold,
                                                   fold_epoch=fold_epoch,
                                                   batch=i)
                        yield FoldXYWeight(
                            epoch, fold, fold_epoch,
                            *(f.iloc[i if i < nice_i else i - 1:i +
                                     bs] if f is not None else None
                              for f in cv_train_frames))
                        call_callable_dynamic_args(self.after_batch,
                                                   epoch=epoch,
                                                   fold=fold,
                                                   fold_epoch=fold_epoch,
                                                   batch=i)

                    # end of fold epoch
                    try:
                        call_callable_dynamic_args(self.after_fold_epoch,
                                                   epoch=epoch,
                                                   fold=fold,
                                                   fold_epoch=fold_epoch,
                                                   train_data=cv_train_frames,
                                                   test_data=cv_test_frames)
                    except StopIteration as sie:
                        call_callable_dynamic_args(self.after_fold,
                                                   epoch=epoch,
                                                   fold=fold,
                                                   train_data=cv_train_frames,
                                                   test_data=cv_test_frames)

                        if str(sie).isnumeric() and int(str(sie)) == fold:
                            # we just want to stop this fold
                            break
                        else:
                            # we need to stop any further generation of samples and call all remaining callbacks
                            call_callable_dynamic_args(self.after_epoch,
                                                       epoch=epoch,
                                                       train_data=train_frames,
                                                       test_data=test_frames)
                            call_callable_dynamic_args(self.after_end)
                            return
                # end of fold
                call_callable_dynamic_args(self.after_fold,
                                           epoch=epoch,
                                           fold=fold,
                                           train_data=cv_train_frames,
                                           test_data=cv_test_frames)
            # end of epoch
            call_callable_dynamic_args(self.after_epoch,
                                       epoch=epoch,
                                       train_data=train_frames,
                                       test_data=test_frames)
        # end of generator
        call_callable_dynamic_args(self.after_end)
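The StopIteration handling above doubles as an early-stopping protocol: an after_fold_epoch callback may raise StopIteration(fold) to end just the current fold, while any other StopIteration aborts sampling entirely. A hypothetical callback (the criterion is made up; argument matching by name is assumed from call_callable_dynamic_args):

def stop_fold_early(epoch, fold, fold_epoch):
    # made-up criterion: abandon each fold after three fold epochs
    if fold_epoch >= 2:
        # str(StopIteration(fold)) equals str(fold), so only this fold stops
        raise StopIteration(fold)

sampler = Sampler(XYWeight(features, labels, None),
                  after_fold_epoch=stop_fold_early)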
Example #11
    def test_loc_if_not_none(self):
        df1 = pd.DataFrame({"A": [1, 2, 3, 4]}, index=[1, 2, 3, 4])
        df2 = None

        self.assertEqual(1, loc_if_not_none(df1, 1).values[0])
        self.assertIsNone(loc_if_not_none(df2, 1))
Example #12
def fit(df: pd.DataFrame,
        model_provider: Callable[[int], Model],
        test_size: float = 0.4,
        youngest_size: float = None,
        cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray],
                                              Tuple[np.ndarray,
                                                    np.ndarray]]] = None,
        test_validate_split_seed=42,
        hyper_parameter_space: Dict = None,
        **kwargs) -> Fit:
    """

    :param df: the DataFrame you apply this function to
    :param model_provider: a callable which provides a new :class:`.Model` instance, e.g. one per hyper parameter set
                           if hyper parameter tuning is enforced. Usually all the Model subclasses implement __call__
                           and are thus providers of themselves
    :param test_size: the fraction [0, 1] of random samples which are used for a test set
    :param youngest_size: the fraction [0, 1] of the test samples which are not random but are the youngest
    :param cross_validation: tuple of number of epochs for each fold provider and a cross validation provider
    :param test_validate_split_seed: seed if the train/test split needs to be reproducible. A magic seed 'youngest' is
                                     available, which just uses the youngest data as test data
    :param hyper_parameter_space: space of hyper parameters passed as kwargs to your model provider
    :return: returns a :class:`pandas_ml_utils.model.fitting.fit.Fit` object
    """

    trials = None
    model = model_provider()
    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs,
                          kwargs)
    (features, min_required_samples), labels, targets, weights = \
        extract(model.features_and_labels, df, extract_feature_labels_weights, **kwargs)

    start_performance_count = perf_counter()
    _log.info("create model")

    # get indices and make training and test data sets
    train_idx, test_idx = train_test_split(features.index, test_size,
                                           youngest_size,
                                           test_validate_split_seed)
    train = (features.loc[train_idx], labels.loc[train_idx],
             loc_if_not_none(weights, train_idx))
    test = (features.loc[test_idx], labels.loc[test_idx],
            loc_if_not_none(weights, test_idx))

    # optionally perform a hyper parameter optimization first
    if hyper_parameter_space is not None:
        # next, isolate hyperopt parameters and constants used only for hyper parameter tuning (like early stopping)
        constants = {}
        hyperopt_params = {}
        for k, v in list(hyper_parameter_space.items()):
            if k.startswith("__"):
                hyperopt_params[k[2:]] = hyper_parameter_space.pop(k)
            elif isinstance(v, (int, float, bool)):
                constants[k] = hyper_parameter_space.pop(k)

        # optimize hyper parameters
        model, trials = __hyper_opt(hyper_parameter_space, hyperopt_params,
                                    constants, model_provider,
                                    cross_validation, train, test)

    # finally train the model with the (possibly tuned) hyper parameters
    __train_loop(model, cross_validation, train, test)
    _log.info(
        f"fitting model done in {perf_counter() - start_performance_count: .2f} sec!"
    )

    # assemble result objects
    prediction_train = to_pandas(model.predict(train[0].ml.values), train_idx,
                                 labels.columns)
    prediction_test = to_pandas(model.predict(test[0].ml.values), test_idx,
                                labels.columns)

    targets = (loc_if_not_none(targets,
                               train_idx), loc_if_not_none(targets, test_idx))
    df_train = assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets[0],
        PREDICTION_COLUMN_NAME: prediction_train,
        LABEL_COLUMN_NAME: train[1],
        FEATURE_COLUMN_NAME: train[0]
    })
    df_test = assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets[1],
        PREDICTION_COLUMN_NAME: prediction_test,
        LABEL_COLUMN_NAME: test[1],
        FEATURE_COLUMN_NAME: test[0]
    })

    # update model properties and return the fit
    model._validation_indices = test_idx
    model.features_and_labels._min_required_samples = min_required_samples
    model.features_and_labels._label_columns = labels.columns
    return Fit(model, model.summary_provider(df_train),
               model.summary_provider(df_test), trials)
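A hedged end-to-end sketch of calling fit; my_model_provider is hypothetical, in pandas-ml-utils a Model subclass typically implements __call__ and can therefore act as its own provider:

fit_result = fit(
    df,                               # DataFrame holding features and labels
    model_provider=my_model_provider,
    test_size=0.3,
    test_validate_split_seed=42,
)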