Пример #1
0
def test_ts_forecasting_lagged_data_operation():
    """ Fit a 'lagged' -> 'ridge' chain on time series data and check that the
    flattened prediction has the same length as the flattened reference
    values returned by get_time_series
    """
    train_data, test_data, expected = get_time_series()

    # lagged (primary) -> ridge (root)
    chain = Chain(SecondaryNode('ridge', nodes_from=[PrimaryNode('lagged')]))
    chain.fit_from_scratch(train_data)

    forecast = np.ravel(chain.predict(test_data).predict)
    reference = np.ravel(expected)

    assert len(forecast) == len(reference)
Пример #2
0
def test_ts_forecasting_smoothing_data_operation():
    """ For every smoothing operation build a
    smoothing -> 'lagged' -> 'ridge' chain, fit it and check that the
    flattened prediction is as long as the flattened reference values
    """
    train_data, test_data, expected = get_time_series()

    for operation_name in ['smoothing', 'gaussian_filter']:
        # A new chain is assembled for each smoothing operation
        smoothing_node = PrimaryNode(operation_name)
        lagged_node = SecondaryNode('lagged', nodes_from=[smoothing_node])
        chain = Chain(SecondaryNode('ridge', nodes_from=[lagged_node]))

        chain.fit_from_scratch(train_data)
        forecast = np.ravel(chain.predict(test_data).predict)

        assert len(forecast) == len(np.ravel(expected))
Пример #3
0
def test_classification_data_operations():
    """ Put each listed data operation in front of a 'logit' model and check
    that the chain returns one prediction per element of the reference
    labels
    """
    train_data, test_data, expected = get_small_classification_dataset()

    operations_to_check = [
        'kernel_pca', 'pca', 'scaling', 'normalization', 'poly_features',
        'rfe_lin_class', 'rfe_non_lin_class'
    ]

    for operation_name in operations_to_check:
        # data operation (primary) -> logit (root)
        preprocessing_node = PrimaryNode(operation_name)
        chain = Chain(SecondaryNode('logit', nodes_from=[preprocessing_node]))

        chain.fit_from_scratch(train_data)
        labels = chain.predict(test_data).predict

        assert len(labels) == len(expected)
Пример #4
0
def test_data_merge_in_chain():
    """ Check that a chain whose root merges the outputs of two branches can
    be fitted and used for prediction when one of the branches changes the
    table during the fit process
    """

    # Chain structure:
    #
    #   ridge                <- root, merges both parents
    #   |    \
    #   |   ransac_lin_reg   <- removes several lines in table
    #   |    /
    #  scaling               <- shared primary node
    scaling = PrimaryNode('scaling')
    ransac = SecondaryNode('ransac_lin_reg', nodes_from=[scaling])
    chain = Chain(SecondaryNode('ridge', nodes_from=[ransac, scaling]))

    # Only the train part of the generated regression dataset is used here
    x_train, y_train, _x_test, _y_test = get_regression_dataset(
        features_options={'informative': 2, 'bias': 2.0},
        samples_amount=100,
        features_amount=5)

    train_input = InputData(idx=np.arange(0, len(x_train)),
                            features=x_train,
                            target=y_train,
                            task=Task(TaskTypesEnum.regression),
                            data_type=DataTypesEnum.table)

    # Fit on the train data and predict for the same data
    chain.fit_from_scratch(train_input)
    prediction = chain.predict(train_input)

    print(prediction)
    assert prediction is not None
Пример #5
0
def _chain_labelled_sample(chain: Chain,
                           samples_amount: int,
                           features_amount: int,
                           classes_amount: int,
                           features_options: Dict,
                           task: Task) -> InputData:
    """ Generate a sample with classification_dataset and replace its target with labels predicted by the chain. """
    features, target = classification_dataset(
        samples_amount=samples_amount,
        features_amount=features_amount,
        classes_amount=classes_amount,
        features_options=features_options)
    target = np.expand_dims(target, axis=1)

    generated = InputData(idx=np.arange(0, samples_amount),
                          features=features,
                          target=target,
                          task=task,
                          data_type=DataTypesEnum.table)

    # The generated target is only a placeholder: the labels of the resulting
    # dataset come from the chain prediction
    synth_target = chain.predict(input_data=generated).predict
    synth_labels = _to_labels(synth_target)

    return InputData(idx=np.arange(0, samples_amount),
                     features=features,
                     target=synth_labels,
                     task=task,
                     data_type=DataTypesEnum.table)


def synthetic_benchmark_dataset(samples_amount: int,
                                features_amount: int,
                                classes_amount: int = 2,
                                features_options: Dict = DEFAULT_OPTIONS,
                                fitted_chain: Chain = None) -> InputData:
    """
    Generates a binary classification benchmark dataset whose labels are
    produced by a chain (reference to the fitting schema: TODO).

    The procedure has two stages:
    1. A sample is generated, labelled by the chain and the chain is fitted
    from scratch on this sample.
    2. A second sample is generated and labelled by the refitted chain;
    this sample is returned.

    :param samples_amount: Total amount of samples in the resulted dataset.
    :param features_amount: Total amount of features per sample.
    :param classes_amount: The amount of classes in the dataset (only 2 is supported).
    :param features_options: features options in key-value suitable for classification_dataset.
    :param fitted_chain: Chain with separately fitted models.
    If None then a default chain is obtained from _default_chain.
    Note that fit_from_scratch is called on the chain, so a chain passed
    by the caller is refitted by this function.
    :return: Benchmark dataset that is ready to be used by Chain.
    :raises NotImplementedError: if classes_amount is not equal to 2.
    """
    # Validate the request before any chain is built, so that an unsupported
    # configuration fails immediately
    if classes_amount != 2:
        raise NotImplementedError(
            'Only binary classification tasks are supported')

    if fitted_chain is None:
        fitted_chain = _default_chain(samples_amount=samples_amount,
                                      features_amount=features_amount,
                                      classes_amount=classes_amount)

    task = Task(TaskTypesEnum.classification)

    data_synth_train = _chain_labelled_sample(chain=fitted_chain,
                                              samples_amount=samples_amount,
                                              features_amount=features_amount,
                                              classes_amount=classes_amount,
                                              features_options=features_options,
                                              task=task)

    # TODO: fix preproc issues

    fitted_chain.fit_from_scratch(input_data=data_synth_train)

    data_synth_final = _chain_labelled_sample(chain=fitted_chain,
                                              samples_amount=samples_amount,
                                              features_amount=features_amount,
                                              classes_amount=classes_amount,
                                              features_options=features_options,
                                              task=task)

    return data_synth_final