def test_ts_forecasting_lagged_data_operation():
    train_input, predict_input, y_test = get_time_series()

    node_lagged = PrimaryNode('lagged')
    node_ridge = SecondaryNode('ridge', nodes_from=[node_lagged])
    chain = Chain(node_ridge)

    chain.fit_from_scratch(train_input)
    predicted_output = chain.predict(predict_input)
    predicted = np.ravel(predicted_output.predict)

    assert len(predicted) == len(np.ravel(y_test))
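# The forecasting tests in this section rely on a get_time_series helper defined elsewhere in
# the test utilities. The function below is only a hypothetical sketch of such a helper: the
# synthetic sine series, the 10-step horizon and the manual split are assumptions, and the real
# helper may prepare the data differently. It assumes the module-level imports used by the
# surrounding tests (numpy as np, InputData, Task, TaskTypesEnum, DataTypesEnum) plus
# TsForecastingParams from fedot.core.repository.tasks.
def _sketch_get_time_series(forecast_length: int = 10):
    # Synthetic series: a noisy sine wave
    time_series = np.sin(np.linspace(0, 20, 200)) + np.random.normal(0, 0.1, 200)
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length))

    # History without the last forecast_length values is used for fitting
    history = time_series[:-forecast_length]
    train_input = InputData(idx=np.arange(len(history)),
                            features=history, target=history,
                            task=task, data_type=DataTypesEnum.ts)

    # The same history is passed for prediction; the held-out tail is the reference forecast
    predict_input = InputData(idx=np.arange(len(history)),
                              features=history, target=None,
                              task=task, data_type=DataTypesEnum.ts)
    y_test = time_series[-forecast_length:]
    return train_input, predict_input, y_test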
def test_ts_forecasting_smoothing_data_operation():
    train_input, predict_input, y_test = get_time_series()

    for smoothing_operation in ['smoothing', 'gaussian_filter']:
        node_smoothing = PrimaryNode(smoothing_operation)
        node_lagged = SecondaryNode('lagged', nodes_from=[node_smoothing])
        node_ridge = SecondaryNode('ridge', nodes_from=[node_lagged])
        chain = Chain(node_ridge)

        chain.fit_from_scratch(train_input)
        predicted_output = chain.predict(predict_input)
        predicted = np.ravel(predicted_output.predict)

        assert len(predicted) == len(np.ravel(y_test))
def test_classification_data_operations():
    train_input, predict_input, y_test = get_small_classification_dataset()

    for data_operation in ['kernel_pca', 'pca', 'scaling', 'normalization',
                           'poly_features', 'rfe_lin_class', 'rfe_non_lin_class']:
        node_data_operation = PrimaryNode(data_operation)
        node_final = SecondaryNode('logit', nodes_from=[node_data_operation])
        chain = Chain(node_final)

        # Fit and predict for chain
        chain.fit_from_scratch(train_input)
        predicted_output = chain.predict(predict_input)
        predicted = predicted_output.predict

        assert len(predicted) == len(y_test)
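# get_small_classification_dataset is another test-utility helper not shown in this section.
# A hypothetical sketch is given below, assuming it wraps sklearn's make_classification with a
# simple 70/30 split; the sample counts and the random_state are illustrative assumptions, and
# the real helper may differ.
def _sketch_get_small_classification_dataset():
    from sklearn.datasets import make_classification

    features, target = make_classification(n_samples=100, n_features=5,
                                            n_informative=3, random_state=42)
    task = Task(TaskTypesEnum.classification)

    # First 70 samples for fitting, the remaining 30 as a hold-out set
    train_input = InputData(idx=np.arange(70), features=features[:70],
                            target=target[:70], task=task,
                            data_type=DataTypesEnum.table)
    predict_input = InputData(idx=np.arange(30), features=features[70:],
                              target=None, task=task,
                              data_type=DataTypesEnum.table)
    return train_input, predict_input, target[70:]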
def test_data_merge_in_chain():
    """ Test checks whether the chain can correctly handle dynamic changes
    in the tables during the fit process """

    #      ridge
    #     /    \              (merge operation)
    #    |   ransac_lin_reg   (removes several rows from the table)
    #     \    /
    #     scaling
    node_scaling = PrimaryNode('scaling')
    node_lin_ransac = SecondaryNode('ransac_lin_reg', nodes_from=[node_scaling])
    node_final = SecondaryNode('ridge', nodes_from=[node_lin_ransac, node_scaling])
    chain = Chain(node_final)

    features_options = {'informative': 2, 'bias': 2.0}
    x_train, y_train, x_test, y_test = get_regression_dataset(features_options=features_options,
                                                              samples_amount=100,
                                                              features_amount=5)

    # Define regression task
    task = Task(TaskTypesEnum.regression)

    # Prepare data to train the model
    train_input = InputData(idx=np.arange(0, len(x_train)),
                            features=x_train,
                            target=y_train,
                            task=task,
                            data_type=DataTypesEnum.table)

    # Fit and predict
    chain.fit_from_scratch(train_input)
    prediction = chain.predict(train_input)
    print(prediction)

    assert prediction is not None
def synthetic_benchmark_dataset(samples_amount: int, features_amount: int,
                                classes_amount: int = 2,
                                features_options: Dict = DEFAULT_OPTIONS,
                                fitted_chain: Chain = None) -> InputData:
    """ Generates a binary classification benchmark dataset that is obtained using
    the proposed fitting schema (TODO: add reference).

    :param samples_amount: total amount of samples in the resulting dataset.
    :param features_amount: total amount of features per sample.
    :param classes_amount: amount of classes in the dataset.
    :param features_options: feature options in key-value format suitable
    for classification_dataset.
    :param fitted_chain: chain with separately fitted models. If None, a 3-level
    balanced tree is fitted and used as the default.

    :return: benchmark dataset that is ready to be used by a Chain.
    """
    if fitted_chain is None:
        fitted_chain = _default_chain(samples_amount=samples_amount,
                                      features_amount=features_amount,
                                      classes_amount=classes_amount)

    if classes_amount != 2:
        raise NotImplementedError('Only binary classification tasks are supported')

    features, target = classification_dataset(samples_amount=samples_amount,
                                              features_amount=features_amount,
                                              classes_amount=classes_amount,
                                              features_options=features_options)
    target = np.expand_dims(target, axis=1)

    task = Task(TaskTypesEnum.classification)
    samples_idxs = np.arange(0, samples_amount)

    train = InputData(idx=samples_idxs,
                      features=features, target=target,
                      task=task, data_type=DataTypesEnum.table)

    # Replace the real target with labels predicted by the fitted chain
    synth_target = fitted_chain.predict(input_data=train).predict
    synth_labels = _to_labels(synth_target)
    data_synth_train = InputData(idx=samples_idxs,
                                 features=features, target=synth_labels,
                                 task=task, data_type=DataTypesEnum.table)

    # TODO: fix preprocessing issues
    fitted_chain.fit_from_scratch(input_data=data_synth_train)

    # Generate a fresh sample and label it with the refitted chain
    features, target = classification_dataset(samples_amount=samples_amount,
                                              features_amount=features_amount,
                                              classes_amount=classes_amount,
                                              features_options=features_options)
    target = np.expand_dims(target, axis=1)

    test = InputData(idx=samples_idxs, features=features, target=target,
                     task=task, data_type=DataTypesEnum.table)
    synth_target = fitted_chain.predict(input_data=test).predict
    synth_labels = _to_labels(synth_target)
    data_synth_final = InputData(idx=samples_idxs, features=features,
                                 target=synth_labels,
                                 task=task, data_type=DataTypesEnum.table)

    return data_synth_final
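# A minimal usage sketch of the benchmark generator above, assuming it is run from the same
# module (so the FEDOT imports are already available); the dataset sizes and the scaling -> logit
# chain are illustrative choices, not part of the benchmark schema itself.
if __name__ == '__main__':
    benchmark_data = synthetic_benchmark_dataset(samples_amount=1000,
                                                 features_amount=10)

    # Fit a simple chain on the synthetic labels and predict on the same data
    node_scaling = PrimaryNode('scaling')
    node_logit = SecondaryNode('logit', nodes_from=[node_scaling])
    chain = Chain(node_logit)

    chain.fit_from_scratch(benchmark_data)
    predicted_labels = chain.predict(benchmark_data).predict
    print(predicted_labels[:5])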