def test_optional(self):
        """Train a random forest regressor with every option spelled out.

        Fits petal_width from three other columns of the load_iris() table,
        then compares the feature importances and the first five predictions
        with stored reference values (10 decimal places).
        """
        iris = load_iris()
        feature_names = ['sepal_length', 'sepal_width', 'petal_length']
        train_out = random_forest_regression_train(
            table=iris,
            feature_cols=feature_names,
            label_col='petal_width',
            n_estimators=20,
            criterion="mse",
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0,
            max_features="None",
            max_leaf_nodes=None,
            min_impurity_decrease=0,
            random_state=12345)
        model = train_out['model']

        # Column 1 of each importance-table row is the value under test.
        importance_rows = model['feature_importance_table'].values
        observed_importance = [importance_rows[row][1] for row in range(3)]
        np.testing.assert_array_almost_equal(
            [0.0201313834, 0.0233862213, 0.9564823953],
            observed_importance, 10, 'incorrect feature_importance')

        predict_out = random_forest_regression_predict(
            table=iris, model=model, prediction_col='prediction')
        first_predictions = predict_out['out_table']['prediction'].values[:5]
        np.testing.assert_array_almost_equal(
            [0.24708333333333332, 0.19000000000000009, 0.20000000000000004,
             0.19166666666666674, 0.23875000000000002],
            first_predictions, 10, 'incorrect prediction')
    def test_default(self):
        """Train a 10-tree random forest regressor on two sepal columns.

        Predicts petal_length from the load_iris() table and compares the
        two feature importances and the first five predictions with stored
        reference values (10 decimal places).
        """
        iris = load_iris()
        train_out = random_forest_regression_train(
            table=iris,
            feature_cols=['sepal_length', 'sepal_width'],
            label_col='petal_length',
            n_estimators=10,
            criterion="mse",
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0,
            max_features="None",
            max_leaf_nodes=None,
            min_impurity_decrease=0,
            random_state=12345)
        model = train_out['model']

        # Column 1 of each importance-table row is the value under test.
        importance_rows = model['feature_importance_table'].values
        observed_importance = [importance_rows[row][1] for row in range(2)]
        np.testing.assert_array_almost_equal(
            [0.8419393152, 0.1580606848],
            observed_importance, 10, 'incorrect feature_importance')

        predict_out = random_forest_regression_predict(
            table=iris, model=model, prediction_col='prediction')
        first_predictions = predict_out['out_table']['prediction'].values[:5]
        np.testing.assert_array_almost_equal(
            [1.3975, 1.4200000000000002, 1.446, 1.45, 1.41],
            first_predictions, 10, 'incorrect prediction')
Exemplo n.º 3
0
    def test_default(self):
        """Check bartletts_test output for four iris columns grouped by species.

        Verifies the row labels, the first two estimates and the last two
        p-values of the result table against stored reference values.
        """
        df_iris = load_iris()
        df_res = bartletts_test(table=df_iris,
                                response_cols=[
                                    'sepal_length', 'sepal_width',
                                    'petal_length', 'petal_width'
                                ],
                                factor_col='species')['result']['result_table']

        self.assertListEqual([
            'sepal_length by species', 'sepal_width by species',
            'petal_length by species', 'petal_width by species'
        ], df_res['data'].tolist(), 'incorrect data column')
        self.assertAlmostEqual(16.005701874401502,
                               df_res['estimate'].values[0], 10,
                               'sepal_length by species: incorrect estimate')
        # Row 1 is 'sepal_width by species' (see the label list above); the
        # failure message previously named sepal_length by mistake.
        self.assertAlmostEqual(2.2158125491551637,
                               df_res['estimate'].values[1], 10,
                               'sepal_width by species: incorrect estimate')
        # NOTE: the expected p-values are far below 1e-10, so with places=10
        # these two checks only confirm that the p-value rounds to zero.
        self.assertAlmostEqual(8.904503355816222e-13,
                               df_res['p_value'].values[2], 10,
                               'petal_length by species: incorrect p_value')
        self.assertAlmostEqual(5.615311140767724e-09,
                               df_res['p_value'].values[3], 10,
                               'petal_width by species: incorrect p_value')
Exemplo n.º 4
0
    def test_default(self):
        """Run one_sample_ttest on four iris columns with three alternatives.

        Compares slices of the t-value, p-value and confidence-limit columns
        of the result table with stored reference values (10 decimals).
        """
        iris = load_iris()
        ttest_out = one_sample_ttest(
            table=iris,
            input_cols=[
                'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
            ],
            alternatives=['Greater', 'Less', 'Two Sided'],
            hypothesized_mean=0,
            conf_level=0.95)
        result = ttest_out['model']['result']

        # (expected values, result column, row slice, failure message)
        checks = [
            ([86.4253746172] * 3, 't_value', slice(None, 3),
             'incorrect t-value'),
            ([2.1874883768175626e-129, 1., 4.374976753635125e-129],
             'p_value', slice(3, 6), 'incorrect p-value'),
            ([3.5202193882, -np.inf, 3.4739936640],
             'lower_confidence_interval', slice(6, 9),
             'incorrect lower confidence limit'),
            ([np.inf, 1.3018017244, 1.3217956315],
             'upper_confidence_interval', slice(9, 12),
             'incorrect upper confidence limit'),
        ]
        for expected, column, rows, message in checks:
            np.testing.assert_array_almost_equal(
                expected, result[column].values[rows], 10, message)
Exemplo n.º 5
0
    def test_default(self):
        """Pivot sepal columns by species with 'mean' and 'sum' aggregates.

        Checks the species labels and, per species, the mean sepal_length
        and the sum of sepal_width against stored reference values.
        """
        iris = load_iris()
        pivot_out = pivot2(table=iris,
                           values=['sepal_length', 'sepal_width'],
                           aggfunc=['mean', 'sum'],
                           index=['species'],
                           columns=None)
        pivoted = pivot_out['out_table']

        self.assertListEqual(['setosa', 'versicolor', 'virginica'],
                             pivoted['species'].tolist(), 'incorrect species')

        # (expected value, output column, row position, failure message)
        expected_cells = [
            (5.006, 'mean_sepal_length', 0, 'incorrect mean_sepal_length[0]'),
            (5.936, 'mean_sepal_length', 1, 'incorrect mean_sepal_length[1]'),
            (6.587999999999998, 'mean_sepal_length', 2,
             'incorrect mean_sepal_length[2]'),
            (170.9, 'sum_sepal_width', 0, 'incorrect sum_sepal_width[0]'),
            (138.50000000000003, 'sum_sepal_width', 1,
             'incorrect sum_sepal_width[1]'),
            (148.70000000000002, 'sum_sepal_width', 2,
             'incorrect sum_sepal_width[2]'),
        ]
        for expected, column, row, message in expected_cells:
            self.assertAlmostEqual(expected, pivoted[column].values[row], 10,
                                   message)
Exemplo n.º 6
0
    def test_optional(self):
        """Train and apply a depth-5 decision tree regressor on iris.

        Every optional training argument is passed explicitly; the first
        five predictions of petal_length are compared with stored values.
        """
        iris = load_iris()

        # Optional training arguments, kept in their original keyword order.
        train_options = dict(
            criterion='mae',
            splitter='best',
            max_depth=5,
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.0,
            max_features=None,
            random_state=12345,
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_impurity_split=None,
            presort=False,
            sample_weight=None,
            check_input=True,
            X_idx_sorted=None)
        train_out = decision_tree_regression_train(
            table=iris,
            feature_cols=['sepal_length', 'sepal_width'],
            label_col=['petal_length'],
            **train_options)
        model = train_out['model']

        predict_out = decision_tree_regression_predict(
            table=iris,
            model=model,
            prediction_col='prediction',
            check_input=True)
        first_predictions = predict_out['out_table']['prediction'].values[:5]

        np.testing.assert_array_almost_equal([1.5, 1.4, 1.5, 1.4, 1.5],
                                             first_predictions,
                                             10, 'incorrect prediction')
Exemplo n.º 7
0
    def test_optional(self):
        """Pivot sepal_length with species as columns and eight aggregates.

        Checks the 'values' label and each aggregate's setosa column in the
        first output row against stored reference values.
        """
        iris = load_iris()
        aggregates = [
            'max', 'min', 'var', 'std', 'count', 'median', 'q1', 'q3'
        ]
        pivot_out = pivot2(table=iris,
                           values=['sepal_length'],
                           aggfunc=aggregates,
                           index=None,
                           columns=['species'])
        pivoted = pivot_out['out_table']

        self.assertEqual('sepal_length', pivoted['values'].values[0],
                         'incorrect values')

        # (expected value, output column, failure message); all in row 0.
        expected_cells = [
            (5.8, 'max_setosa', 'incorrect max_setosa'),
            (4.3, 'min_setosa', 'incorrect min_setosa'),
            (0.12424897959183677, 'var_setosa', 'incorrect var_setosa'),
            (0.3489469873777391, 'std_setosa', 'incorrect std_setosa'),
            (50, 'count_setosa', 'incorrect count_setosa'),
            (5, 'median_setosa', 'incorrect median_setosa'),
            (4.8, 'q1_setosa', 'incorrect q1_setosa'),
            (5.2, 'q3_setosa', 'incorrect q3_setosa'),
        ]
        for expected, column, message in expected_cells:
            self.assertAlmostEqual(expected, pivoted[column].values[0], 10,
                                   message)
Exemplo n.º 8
0
 def test_default(self):
     """Sort the iris table by species and petal_length and print it.

     No assertions are made: the input and the sorted output are only
     printed, so the test passes as long as sort() does not raise.
     """
     iris = load_iris()
     sort_out = sort(iris,
                     input_cols=['species', 'petal_length'],
                     is_asc=['desc', 'asc'])
     sorted_iris = sort_out['out_table']
     print(iris)
     print(sorted_iris)
Exemplo n.º 9
0
 def setUp(self):
     """Prepare ``self.iris`` and the 54-row ``self.example_df`` fixture.

     example_df has one row per (year, genotype, replicate): three years,
     six genotypes 'A'..'F' and three replicates each, with a 'value'
     column holding the measurements listed below.
     """
     print("*** n-way Anova UnitTest Start ***")
     self.iris = load_iris()

     # Within each year the genotypes run A, A, A, B, B, B, ... F, F, F.
     genotypes_one_year = [letter for letter in 'ABCDEF' for _ in range(3)]
     genotype_col = genotypes_one_year * 3
     # 18 consecutive rows per year label.
     years_col = [
         label for label in ('year_1', 'year_2', 'year_3')
         for _ in range(18)
     ]
     # One line per genotype triple; six lines per year.
     value_col = [
         # year_1
         1.53, 1.83, 1.38,
         3.6, 2.94, 4.02,
         3.99, 3.3, 4.41,
         3.75, 3.63, 3.57,
         1.71, 2.01, 2.04,
         3.96, 4.77, 4.65,
         # year_2
         4.08, 3.84, 3.96,
         5.7, 5.07, 7.2,
         6.09, 5.88, 6.51,
         5.19, 5.37, 5.55,
         3.6, 5.1, 6.99,
         5.25, 5.28, 5.07,
         # year_3
         6.69, 5.97, 6.33,
         8.55, 7.95, 8.94,
         10.02, 9.63, 10.38,
         11.4, 9.66, 10.53,
         6.87, 6.93, 6.84,
         9.84, 9.87, 10.08,
     ]
     self.example_df = pd.DataFrame({
         'Genotype': genotype_col,
         'years': years_col,
         'value': value_col,
     })
Exemplo n.º 10
0
    def test_optional(self):
        """Classify a held-out iris split with knn_classification (k=10).

        Checks the first five predicted labels and the first five values of
        each of the three probability columns against stored values.
        """
        iris = load_iris()
        train_part, test_part = train_test_split(iris, random_state=12345)
        knn_out = knn_classification(
            train_table=train_part,
            test_table=test_part,
            feature_cols=['sepal_length', 'sepal_width', 'petal_length'],
            label_col='species',
            k=10,
            algorithm='auto',
            leaf_size=30,
            p=2)
        scored = knn_out['out_table']

        self.assertListEqual(
            ['versicolor', 'setosa', 'versicolor', 'setosa', 'setosa'],
            scored['prediction'].tolist()[:5], 'incorrect prediction')

        # (expected first five values, probability column, failure message)
        probability_checks = [
            ([0.0, 1.0, 0.0, 1.0, 1.0], 'probability_0',
             'incorrect probability_0'),
            ([1.0, 0.0, 0.7, 0.0, 0.0], 'probability_1',
             'incorrect probability_1'),
            ([0.0, 0.0, 0.3, 0.0, 0.0], 'probability_2',
             'incorrect probability_2'),
        ]
        for expected, column, message in probability_checks:
            np.testing.assert_array_almost_equal(
                expected, scored[column].values[:5], 10, message)
Exemplo n.º 11
0
 def test_optional(self):
     """Check capitalize_variable with replace='lower' on the species column.

     Rows 0, 50 and 100 of the 'species_lower' output column must hold the
     three lowercase species names.
     """
     iris = load_iris()
     capitalize_out = capitalize_variable(table=iris,
                                          input_cols=['species'],
                                          replace='lower',
                                          out_col_suffix=None)
     lowered = capitalize_out['out_table']

     # (expected value, row position, failure message)
     expectations = [
         ('setosa', 0, 'setosa: incorrect lowercase'),
         ('versicolor', 50, 'versicolor: incorrect lowercase'),
         ('virginica', 100, 'virginica: incorrect lowercase'),
     ]
     for expected, row, message in expectations:
         self.assertEqual(expected, lowered['species_lower'].values[row],
                          message)
Exemplo n.º 12
0
 def test_default(self):
     """Check capitalize_variable with replace='upper' on the species column.

     Rows 0, 50 and 100 of the 'species_upper' output column must hold the
     three uppercase species names.
     """
     iris = load_iris()
     capitalize_out = capitalize_variable(table=iris,
                                          input_cols=['species'],
                                          replace='upper',
                                          out_col_suffix=None)
     uppered = capitalize_out['out_table']

     # (expected value, row position, failure message)
     expectations = [
         ('SETOSA', 0, 'setosa: incorrect uppercase'),
         ('VERSICOLOR', 50, 'versicolor: incorrect uppercase'),
         ('VIRGINICA', 100, 'virginica: incorrect uppercase'),
     ]
     for expected, row, message in expectations:
         self.assertEqual(expected, uppered['species_upper'].values[row],
                          message)
Exemplo n.º 13
0
    def test_default(self):
        """Unpivot sepal_length and sepal_width and spot-check six rows.

        Rows 0, 50, 100, 150, 200 and 250 of the output table are compared
        as lists with stored expected rows.
        """
        iris = load_iris()
        unpivot_out = unpivot(table=iris,
                              value_vars=['sepal_length', 'sepal_width'],
                              var_name=None,
                              value_name='value',
                              col_level=None,
                              id_vars=None)
        melted = unpivot_out['out_table']

        # (row label, expected row contents, failure message)
        expected_rows = [
            (0, [1.4, 0.2, 'setosa', 'sepal_length', 5.1],
             'wrong result in the 1st row'),
            (50, [4.7, 1.4, 'versicolor', 'sepal_length', 7],
             'wrong result in the 51st row'),
            (100, [6, 2.5, 'virginica', 'sepal_length', 6.3],
             'wrong result in the 101st row'),
            (150, [1.4, 0.2, 'setosa', 'sepal_width', 3.5],
             'wrong result in the 151st row'),
            (200, [4.7, 1.4, 'versicolor', 'sepal_width', 3.2],
             'wrong result in the 201st row'),
            (250, [6, 2.5, 'virginica', 'sepal_width', 3.3],
             'wrong result in the 251st row'),
        ]
        for label, expected, message in expected_rows:
            self.assertListEqual(expected, melted.loc[label].tolist(),
                                 message)
Exemplo n.º 14
0
    def test_ratio(self):
        """Split iris with equal train/test ratios and check both halves.

        With train_ratio=1.0 and test_ratio=1.0 each output table is
        expected to hold 75 rows; the first three rows of each table are
        compared with stored expected rows.
        """
        df_iris = load_iris()
        res = split_data(table=df_iris,
                         train_ratio=1.0,
                         test_ratio=1.0,
                         random_state=12345,
                         shuffle=True,
                         stratify=None)
        df_train = res['train_table']
        df_test = res['test_table']

        self.assertEqual(75, len(df_train), 'wrong size of train table')
        self.assertEqual(75, len(df_test), 'wrong size of test table')

        self.assertListEqual([5.0, 3.4, 1.6, 0.4, 'setosa'],
                             df_train.loc[0].tolist(),
                             'incorrect train data in the 1st row')
        self.assertListEqual([7.1, 3.0, 5.9, 2.1, 'virginica'],
                             df_train.loc[1].tolist(),
                             'incorrect train data in the 2nd row')
        self.assertListEqual([6.7, 3.1, 4.7, 1.5, 'versicolor'],
                             df_train.loc[2].tolist(),
                             'incorrect train data in the 3rd row')

        # These rows come from the test table, so the failure messages say
        # "test" (they previously said "train", a copy-paste slip).
        self.assertListEqual([5.6, 2.5, 3.9, 1.1, 'versicolor'],
                             df_test.loc[0].tolist(),
                             'incorrect test data in the 1st row')
        self.assertListEqual([4.4, 3.2, 1.3, 0.2, 'setosa'],
                             df_test.loc[1].tolist(),
                             'incorrect test data in the 2nd row')
        self.assertListEqual([6.3, 3.3, 4.7, 1.6, 'versicolor'],
                             df_test.loc[2].tolist(),
                             'incorrect test data in the 3rd row')
Exemplo n.º 15
0
    def test_validation(self):
        """sort() with an empty input_cols must raise a validation error.

        Expects a BrighticsFunctionException whose ``errors`` collection
        contains ``{'0033': ['input_cols']}``.
        """
        df = load_iris()
        with self.assertRaises(BrighticsFunctionException) as bfe:
            # The call is expected to raise, so its result is not bound.
            sort(df, input_cols=[], is_asc=['desc'])['out_table']

        test_errors = bfe.exception.errors
        # assertIn reports both the member and the container on failure,
        # unlike assertTrue(... in ...), which only reports "False is not true".
        self.assertIn({'0033': ['input_cols']}, test_errors)
Exemplo n.º 16
0
    def test_default(self):
        """add_row_number must number the first ten rows 0 through 9."""
        iris = load_iris()
        numbered_out = add_row_number(table=iris, new_col='add_row_number')
        numbered = numbered_out['out_table']

        expected_head = list(range(10))
        actual_head = numbered['add_row_number'].tolist()[:10]
        self.assertListEqual(expected_head, actual_head,
                             'incorrect row index')
Exemplo n.º 17
0
 def test_groupby1(self):
     """Smoke-test xgb regression train/predict grouped by species.

     Nothing is asserted; the test passes if neither call raises.
     """
     iris = load_iris()
     feature_names = ['sepal_length', 'sepal_width', 'petal_length']
     fitted = xgb_regression_train(
         iris,
         feature_cols=feature_names,
         label_col='petal_width',
         group_by=['species'])
     # The prediction result is not inspected.
     predicted = xgb_regression_predict(iris, fitted['model'])
Exemplo n.º 18
0
 def test3(self):
     """Print an 'encoded_species' column built by add_expression_column_if.

     Nothing is asserted; the input table and rows 48..101 of the new
     column are printed for manual inspection.
     """
     iris = load_iris()
     print(iris)
     conditions = ['''species == 'setosa' ''', '''species == 'virginica' ''']
     encode_out = add_expression_column_if(
         iris, 'encoded_species', conditions, ['1.0', '2.0'], '0.0')
     encoded = encode_out['out_table']
     print(encoded['encoded_species'][48:102])
Exemplo n.º 19
0
 def test_frac_replace(self):
     """Sample by fraction with replacement and check the first five rows.

     Calls random_sampling with num_or_frac='frac', frac=50, replace=True
     and seed=12345, resets the index, then compares rows 0..4 with stored
     expected rows.
     """
     iris = load_iris()
     sampling_out = random_sampling(table=iris, num_or_frac='frac', num=10,
                                    frac=50, replace=True, seed=12345)
     sample = sampling_out['table'].reset_index(drop=True)

     # (expected row contents, failure message), in row order from 0.
     expected_rows = [
         ([4.7, 3.2, 1.6, 0.2, 'setosa'], 'incorrect sample[0]'),
         ([7.2, 3.0, 5.8, 1.6, 'virginica'], 'incorrect sample[1]'),
         ([6.2, 2.8, 4.8, 1.8, 'virginica'], 'incorrect sample[2]'),
         ([5.8, 2.7, 5.1, 1.9, 'virginica'], 'incorrect sample[3]'),
         ([4.9, 3.1, 1.5, 0.1, 'setosa'], 'incorrect sample[4]'),
     ]
     for position, (expected, message) in enumerate(expected_rows):
         self.assertListEqual(expected, sample.loc[position].tolist(),
                              message)
Exemplo n.º 20
0
 def test_kmeans_groupby1(self):
     """Smoke-test kmeans train/predict grouped by species.

     Nothing is asserted; the test passes if neither call raises.
     """
     iris = load_iris()
     measurement_cols = [
         'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
     ]
     fitted = kmeans_train_predict(iris,
                                   input_cols=measurement_cols,
                                   group_by=['species'])
     # The prediction result is not inspected.
     predicted = kmeans_predict(iris, fitted['model'])
Exemplo n.º 21
0
 def test_groupby1(self):
     """Run glm train/predict grouped by species and print the predictions.

     Nothing is asserted; the petal_width and prediction columns of the
     output table are printed for manual inspection.
     """
     iris = load_iris()
     feature_names = ['sepal_length', 'sepal_width', 'petal_length']
     fitted = glm_train(
         iris,
         feature_cols=feature_names,
         label_col='petal_width',
         group_by=['species'])
     predicted = glm_predict(iris, fitted['model'])
     comparison = predicted['out_table'][['petal_width', 'prediction']]
     print(comparison)
Exemplo n.º 22
0
 def test_groupby1(self):
     """Smoke-test xgb classification train/predict with a random group column.

     Each row gets a random group of 1 or 2 from the unseeded ``random``
     module. Nothing is asserted; the test passes if neither call raises.
     """
     iris = load_iris()
     # One random.randint draw per row, in row order.
     iris['random_group'] = [random.randint(1, 2) for _ in range(len(iris))]

     measurement_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
     fitted = xgb_classification_train(table=iris,
                                       feature_cols=measurement_cols,
                                       label_col='species',
                                       group_by=['random_group'])
     # The prediction result is not inspected.
     predicted = xgb_classification_predict(table=iris, model=fitted['model'])
Exemplo n.º 23
0
 def test_groupby1(self):
     """Run logistic regression train/predict with a random group column.

     Each row gets a random group of 1 or 2 from the unseeded ``random``
     module. Nothing is asserted; the species and prediction columns of
     the output table are printed for manual inspection.
     """
     iris = load_iris()
     # One random.randint draw per row, in row order.
     iris['random_group'] = [random.randint(1, 2) for _ in range(len(iris))]

     measurement_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
     fitted = logistic_regression_train(table=iris,
                                        feature_cols=measurement_cols,
                                        label_col='species',
                                        group_by=['random_group'])
     predicted = logistic_regression_predict(table=iris, model=fitted['model'])
     comparison = predicted['out_table'][['species', 'prediction']]
     print(comparison)
Exemplo n.º 24
0
 def test_predict_thresholds(self):
     """Smoke-test svm_classification_predict with explicit thresholds.

     Trains on a 0.7/0.3 split of iris and predicts the test part with
     thresholds=[0.1, 0.2, 0.3]. Nothing is asserted.
     """
     iris = load_iris()

     split_out = split_data(table=iris, train_ratio=0.7, test_ratio=0.3)
     train_part = split_out['train_table']
     test_part = split_out['test_table']

     measurement_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
     fitted = svm_classification_train(table=train_part,
                                       feature_cols=measurement_cols,
                                       label_col='species')

     # The prediction result is not inspected.
     predicted = svm_classification_predict(table=test_part,
                                            model=fitted['model'],
                                            thresholds=[0.1, 0.2, 0.3])
Exemplo n.º 25
0
 def setUp(self):
     """Build the fixture table stored on ``self.testdata``.

     Takes every 'setosa' row plus the 'versicolor' rows whose
     sepal_length exceeds 6.5 from load_iris(), renumbers the index from 0
     and adds a 'time' column equal to that index.
     """
     # Function-scope import so this block does not rely on a module-level
     # pandas import being present.
     import pandas as pd

     print("*** Spc ruleset AD with/without summary UnitTest Start ***")
     testdata = load_iris()
     setosa = testdata[testdata.species == 'setosa']
     versicolor = testdata[testdata.species == 'versicolor']
     versicolor_test = versicolor[versicolor.sepal_length > 6.5]
     # DataFrame.append was removed in pandas 2.0; pd.concat produces the
     # same row-wise concatenation and also works on older pandas versions.
     testset = pd.concat([setosa, versicolor_test])
     testset = testset.reset_index(drop=True)
     testset['time'] = testset.index.astype(int)
     self.testdata = testset
Exemplo n.º 26
0
 def test_default(self):
     """Discretize sepal_length into two quantile buckets and check the result.

     Checks the bucket numbers of rows 49..58 and the bucket summary
     (number, interval label, count) stored in the returned model.
     """
     iris = load_iris()
     discretize_out = discretize_quantile(table=iris,
                                          input_col='sepal_length',
                                          num_of_buckets=2,
                                          out_col_name='bucket_number')
     bucketed = discretize_out['out_table']
     model = discretize_out['model']

     self.assertListEqual([0, 1, 1, 1, 0, 1, 0, 1, 0, 1],
                          bucketed['bucket_number'].tolist()[49:59],
                          'incorrect quantization')

     # (expected values, summary column, failure message)
     summary_checks = [
         ([0, 1], 'bucket number', 'incorrect bucket number'),
         (['[4.3, 5.8]', '(5.8, 7.9]'], 'buckets', 'incorrect buckets'),
         ([80, 70], 'count', 'incorrect count'),
     ]
     for expected, column, message in summary_checks:
         self.assertListEqual(expected, list(model['result'][column]),
                              message)
Exemplo n.º 27
0
def get_iris_randomgroup():
    """Return the load_iris() table with two random grouping columns added.

    'random_group1' holds a random integer 1 or 2 per row, and
    'random_group2' holds 'A' or 'B' per row. Both are drawn with
    ``random.randint`` from the module-level ``random`` state, so the
    result is not reproducible unless the caller seeds it.
    """
    iris = load_iris()
    letter_for = {1: 'A', 2: 'B'}
    # Draw both values for a row together so the sequence of randint calls
    # alternates group1, group2, group1, group2, ... across the rows.
    draws = [(random.randint(1, 2), letter_for[random.randint(1, 2)])
             for _ in range(len(iris))]
    iris['random_group1'] = [number for number, _ in draws]
    iris['random_group2'] = [letter for _, letter in draws]
    return iris
Exemplo n.º 28
0
    def test_inner(self):
        """Inner self-join of iris on mismatched key columns must be empty.

        Joins the table to itself on (species, sepal_length) against
        (species, petal_width) and expects no rows in the result.
        """
        iris = load_iris()
        join_out = join(left_table=iris,
                        right_table=iris,
                        left_on=['species', 'sepal_length'],
                        right_on=['species', 'petal_width'],
                        how='inner',
                        lsuffix='_left',
                        rsuffix='_right',
                        sort=False)
        joined = join_out['table']

        self.assertTrue(joined.empty, 'inner: incorrect result')
Exemplo n.º 29
0
 def test1(self):
     """Train and apply svm classification on a 0.7/0.3 split of iris.

     Nothing is asserted; the species and prediction columns of the
     output table are printed for manual inspection.
     """
     iris = load_iris()

     split_out = split_data(table=iris, train_ratio=0.7, test_ratio=0.3)
     train_part = split_out['train_table']
     test_part = split_out['test_table']

     measurement_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
     fitted = svm_classification_train(table=train_part,
                                       feature_cols=measurement_cols,
                                       label_col='species')

     predicted = svm_classification_predict(table=test_part,
                                            model=fitted['model'])
     comparison = predicted['out_table'][['species', 'prediction']]
     print(comparison)
Exemplo n.º 30
0
 def test_optional(self):
     """Run a 'Less' one-sample t-test on sepal_length against mean 5.0.

     Uses conf_level=0.99 and compares the first row's t-value, p-value
     and confidence limits with stored reference values (10 decimals).
     """
     iris = load_iris()
     ttest_out = one_sample_ttest(table=iris,
                                  input_cols=['sepal_length'],
                                  alternatives=['Less'],
                                  hypothesized_mean=5.0,
                                  conf_level=0.99)
     result = ttest_out['model']['result']

     # (expected value, result column, failure message); all in row 0.
     expected_cells = [
         (12.4732571467, 't_value', 'incorrect t-value'),
         (1., 'p_value', 'incorrect p-value'),
         (-np.inf, 'lower_confidence_interval',
          'incorrect lower confidence limit'),
         (6.0023304639, 'upper_confidence_interval',
          'incorrect upper confidence limit'),
     ]
     for expected, column, message in expected_cells:
         self.assertAlmostEqual(expected, result[column].values[0], 10,
                                message)