def test_optional(self):
    """Random forest regression with explicit hyper-parameters on iris."""
    iris = load_iris()
    trained_model = random_forest_regression_train(
        table=iris,
        feature_cols=['sepal_length', 'sepal_width', 'petal_length'],
        label_col='petal_width',
        n_estimators=20,
        criterion="mse",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0,
        max_features="None",
        max_leaf_nodes=None,
        min_impurity_decrease=0,
        random_state=12345)['model']
    importance = trained_model['feature_importance_table']
    np.testing.assert_array_almost_equal(
        [0.0201313834, 0.0233862213, 0.9564823953],
        [importance.values[i][1] for i in range(3)], 10,
        'incorrect feature_importance')
    predicted = random_forest_regression_predict(
        table=iris, model=trained_model,
        prediction_col='prediction')['out_table']
    np.testing.assert_array_almost_equal(
        [0.24708333333333332, 0.19000000000000009, 0.20000000000000004,
         0.19166666666666674, 0.23875000000000002],
        predicted['prediction'].values[:5], 10, 'incorrect prediction')
def test_default(self):
    """Random forest regression with default settings (10 trees) on iris."""
    iris = load_iris()
    trained_model = random_forest_regression_train(
        table=iris,
        feature_cols=['sepal_length', 'sepal_width'],
        label_col='petal_length',
        n_estimators=10,
        criterion="mse",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0,
        max_features="None",
        max_leaf_nodes=None,
        min_impurity_decrease=0,
        random_state=12345)['model']
    importance = trained_model['feature_importance_table']
    np.testing.assert_array_almost_equal(
        [0.8419393152, 0.1580606848],
        [importance.values[i][1] for i in range(2)], 10,
        'incorrect feature_importance')
    predicted = random_forest_regression_predict(
        table=iris, model=trained_model,
        prediction_col='prediction')['out_table']
    np.testing.assert_array_almost_equal(
        [1.3975, 1.4200000000000002, 1.446, 1.45, 1.41],
        predicted['prediction'].values[:5], 10, 'incorrect prediction')
def test_default(self):
    """Bartlett's test on the four iris measurements grouped by species."""
    df_iris = load_iris()
    df_res = bartletts_test(
        table=df_iris,
        response_cols=['sepal_length', 'sepal_width', 'petal_length',
                       'petal_width'],
        factor_col='species')['result']['result_table']
    self.assertListEqual(
        ['sepal_length by species', 'sepal_width by species',
         'petal_length by species', 'petal_width by species'],
        df_res['data'].tolist(), 'incorrect data column')
    self.assertAlmostEqual(16.005701874401502,
                           df_res['estimate'].values[0], 10,
                           'sepal_length by species: incorrect estimate')
    # BUG FIX: values[1] is the sepal_width row, but the failure message
    # previously said 'sepal_length by species'.
    self.assertAlmostEqual(2.2158125491551637,
                           df_res['estimate'].values[1], 10,
                           'sepal_width by species: incorrect estimate')
    self.assertAlmostEqual(8.904503355816222e-13,
                           df_res['p_value'].values[2], 10,
                           'petal_length by species: incorrect p_value')
    self.assertAlmostEqual(5.615311140767724e-09,
                           df_res['p_value'].values[3], 10,
                           'petal_width by species: incorrect p_value')
def test_default(self):
    """One-sample t-test on all four iris columns with three alternatives."""
    iris = load_iris()
    result = one_sample_ttest(
        table=iris,
        input_cols=['sepal_length', 'sepal_width', 'petal_length',
                    'petal_width'],
        alternatives=['Greater', 'Less', 'Two Sided'],
        hypothesized_mean=0,
        conf_level=0.95)['model']['result']
    np.testing.assert_array_almost_equal(
        [86.4253746172] * 3, result['t_value'].values[:3], 10,
        'incorrect t-value')
    np.testing.assert_array_almost_equal(
        [2.1874883768175626e-129, 1., 4.374976753635125e-129],
        result['p_value'].values[3:6], 10, 'incorrect p-value')
    np.testing.assert_array_almost_equal(
        [3.5202193882, -np.inf, 3.4739936640],
        result['lower_confidence_interval'].values[6:9], 10,
        'incorrect lower confidence limit')
    np.testing.assert_array_almost_equal(
        [np.inf, 1.3018017244, 1.3217956315],
        result['upper_confidence_interval'].values[9:12], 10,
        'incorrect upper confidence limit')
def test_default(self):
    """pivot2 aggregating mean/sum of two columns indexed by species."""
    iris = load_iris()
    pivoted = pivot2(table=iris,
                     values=['sepal_length', 'sepal_width'],
                     aggfunc=['mean', 'sum'],
                     index=['species'],
                     columns=None)['out_table']
    self.assertListEqual(['setosa', 'versicolor', 'virginica'],
                         pivoted['species'].tolist(), 'incorrect species')
    expected_means = [5.006, 5.936, 6.587999999999998]
    for i, mean in enumerate(expected_means):
        self.assertAlmostEqual(mean, pivoted['mean_sepal_length'].values[i],
                               10, 'incorrect mean_sepal_length[%d]' % i)
    expected_sums = [170.9, 138.50000000000003, 148.70000000000002]
    for i, total in enumerate(expected_sums):
        self.assertAlmostEqual(total, pivoted['sum_sepal_width'].values[i],
                               10, 'incorrect sum_sepal_width[%d]' % i)
def test_optional(self):
    """Decision tree regression (MAE criterion) train + predict round trip."""
    iris = load_iris()
    model = decision_tree_regression_train(
        table=iris,
        feature_cols=['sepal_length', 'sepal_width'],
        label_col=['petal_length'],
        criterion='mae',
        splitter='best',
        max_depth=5,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=12345,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None)['model']
    predictions = decision_tree_regression_predict(
        table=iris, model=model, prediction_col='prediction',
        check_input=True)['out_table']
    np.testing.assert_array_almost_equal(
        [1.5, 1.4, 1.5, 1.4, 1.5],
        predictions['prediction'].values[:5], 10, 'incorrect prediction')
def test_optional(self):
    """pivot2 with the full set of aggregation functions, columns by species."""
    iris = load_iris()
    pivoted = pivot2(table=iris,
                     values=['sepal_length'],
                     aggfunc=['max', 'min', 'var', 'std', 'count', 'median',
                              'q1', 'q3'],
                     index=None,
                     columns=['species'])['out_table']
    self.assertEqual('sepal_length', pivoted['values'].values[0],
                     'incorrect values')
    expected = [('max_setosa', 5.8),
                ('min_setosa', 4.3),
                ('var_setosa', 0.12424897959183677),
                ('std_setosa', 0.3489469873777391),
                ('count_setosa', 50),
                ('median_setosa', 5),
                ('q1_setosa', 4.8),
                ('q3_setosa', 5.2)]
    for col, value in expected:
        self.assertAlmostEqual(value, pivoted[col].values[0], 10,
                               'incorrect %s' % col)
def test_default(self):
    """Sort iris by species (descending) then petal_length (ascending)."""
    df = load_iris()
    out_df = sort(df, input_cols=['species', 'petal_length'],
                  is_asc=['desc', 'asc'])['out_table']
    # BUG FIX: the test previously only printed the frames and asserted
    # nothing; verify the requested ordering instead.
    species = out_df['species'].tolist()
    self.assertListEqual(sorted(species, reverse=True), species,
                         'species is not sorted in descending order')
    # Within each species block, petal_length must be ascending.
    for name, group in out_df.groupby('species'):
        lengths = group['petal_length'].tolist()
        self.assertListEqual(sorted(lengths), lengths,
                             'petal_length is not ascending within %s' % name)
def setUp(self):
    """Build the iris frame and a 6-genotype x 3-year example frame."""
    print("*** n-way Anova UnitTest Start ***")
    self.iris = load_iris()
    # Genotypes A..F, three replicates each, repeated for three years.
    genotypes = [g for g in ['A', 'B', 'C', 'D', 'E', 'F']
                 for _ in range(3)] * 3
    # 18 observations per year, years 1..3.
    years = ['year_%d' % y for y in (1, 2, 3) for _ in range(18)]
    values = [
        1.53, 1.83, 1.38, 3.6, 2.94, 4.02, 3.99, 3.3, 4.41, 3.75, 3.63,
        3.57, 1.71, 2.01, 2.04, 3.96, 4.77, 4.65, 4.08, 3.84, 3.96, 5.7,
        5.07, 7.2, 6.09, 5.88, 6.51, 5.19, 5.37, 5.55, 3.6, 5.1, 6.99,
        5.25, 5.28, 5.07, 6.69, 5.97, 6.33, 8.55, 7.95, 8.94, 10.02, 9.63,
        10.38, 11.4, 9.66, 10.53, 6.87, 6.93, 6.84, 9.84, 9.87, 10.08
    ]
    self.example_df = pd.DataFrame({
        'Genotype': genotypes,
        'years': years,
        'value': values
    })
def test_optional(self):
    """kNN classification (k=10) on a seeded train/test split of iris."""
    iris = load_iris()
    train_df, test_df = train_test_split(iris, random_state=12345)
    out = knn_classification(
        train_table=train_df,
        test_table=test_df,
        feature_cols=['sepal_length', 'sepal_width', 'petal_length'],
        label_col='species',
        k=10, algorithm='auto', leaf_size=30, p=2)['out_table']
    self.assertListEqual(
        ['versicolor', 'setosa', 'versicolor', 'setosa', 'setosa'],
        out['prediction'].tolist()[:5], 'incorrect prediction')
    expected_probs = {
        'probability_0': [0.0, 1.0, 0.0, 1.0, 1.0],
        'probability_1': [1.0, 0.0, 0.7, 0.0, 0.0],
        'probability_2': [0.0, 0.0, 0.3, 0.0, 0.0],
    }
    for col in ('probability_0', 'probability_1', 'probability_2'):
        np.testing.assert_array_almost_equal(
            expected_probs[col], out[col].values[:5], 10,
            'incorrect %s' % col)
def test_optional(self):
    """Lower-casing the species column appends a species_lower column."""
    iris = load_iris()
    result = capitalize_variable(table=iris,
                                 input_cols=['species'],
                                 replace='lower',
                                 out_col_suffix=None)['out_table']
    for row, expected in ((0, 'setosa'), (50, 'versicolor'),
                          (100, 'virginica')):
        self.assertEqual(expected, result['species_lower'].values[row],
                         '%s: incorrect lowercase' % expected)
def test_default(self):
    """Upper-casing the species column appends a species_upper column."""
    iris = load_iris()
    result = capitalize_variable(table=iris,
                                 input_cols=['species'],
                                 replace='upper',
                                 out_col_suffix=None)['out_table']
    for row, name in ((0, 'setosa'), (50, 'versicolor'), (100, 'virginica')):
        self.assertEqual(name.upper(), result['species_upper'].values[row],
                         '%s: incorrect uppercase' % name)
def test_default(self):
    """Unpivot sepal_length/sepal_width into variable/value rows."""
    iris = load_iris()
    melted = unpivot(table=iris,
                     value_vars=['sepal_length', 'sepal_width'],
                     var_name=None,
                     value_name='value',
                     col_level=None,
                     id_vars=None)['out_table']
    expected_rows = [
        (0, [1.4, 0.2, 'setosa', 'sepal_length', 5.1], '1st'),
        (50, [4.7, 1.4, 'versicolor', 'sepal_length', 7], '51st'),
        (100, [6, 2.5, 'virginica', 'sepal_length', 6.3], '101st'),
        (150, [1.4, 0.2, 'setosa', 'sepal_width', 3.5], '151st'),
        (200, [4.7, 1.4, 'versicolor', 'sepal_width', 3.2], '201st'),
        (250, [6, 2.5, 'virginica', 'sepal_width', 3.3], '251st'),
    ]
    for idx, row, ordinal in expected_rows:
        self.assertListEqual(row, melted.loc[idx].tolist(),
                             'wrong result in the %s row' % ordinal)
def test_ratio(self):
    """split_data with equal ratios halves iris into 75/75 train/test."""
    df_iris = load_iris()
    res = split_data(table=df_iris,
                     train_ratio=1.0,
                     test_ratio=1.0,
                     random_state=12345,
                     shuffle=True,
                     stratify=None)
    df_train = res['train_table']
    df_test = res['test_table']
    self.assertEqual(75, len(df_train), 'wrong size of train table')
    self.assertEqual(75, len(df_test), 'wrong size of test table')
    self.assertListEqual([5.0, 3.4, 1.6, 0.4, 'setosa'],
                         df_train.loc[0].tolist(),
                         'incorrect train data in the 1st row')
    self.assertListEqual([7.1, 3.0, 5.9, 2.1, 'virginica'],
                         df_train.loc[1].tolist(),
                         'incorrect train data in the 2nd row')
    self.assertListEqual([6.7, 3.1, 4.7, 1.5, 'versicolor'],
                         df_train.loc[2].tolist(),
                         'incorrect train data in the 3rd row')
    # BUG FIX: the three messages below previously said 'train data'
    # although they check rows of the test table.
    self.assertListEqual([5.6, 2.5, 3.9, 1.1, 'versicolor'],
                         df_test.loc[0].tolist(),
                         'incorrect test data in the 1st row')
    self.assertListEqual([4.4, 3.2, 1.3, 0.2, 'setosa'],
                         df_test.loc[1].tolist(),
                         'incorrect test data in the 2nd row')
    self.assertListEqual([6.3, 3.3, 4.7, 1.6, 'versicolor'],
                         df_test.loc[2].tolist(),
                         'incorrect test data in the 3rd row')
def test_validation(self):
    """sort with empty input_cols must raise a validation error (code 0033)."""
    df = load_iris()
    with self.assertRaises(BrighticsFunctionException) as bfe:
        sort(df, input_cols=[], is_asc=['desc'])['out_table']
    # IDIOM: assertIn shows the actual error list on failure, unlike the
    # previous assertTrue(... in ...); also drops the unused out_df local.
    self.assertIn({'0033': ['input_cols']}, bfe.exception.errors)
def test_default(self):
    """add_row_number appends a 0-based row index column."""
    iris = load_iris()
    numbered = add_row_number(table=iris,
                              new_col='add_row_number')['out_table']
    self.assertListEqual(list(range(10)),
                         numbered['add_row_number'].tolist()[:10],
                         'incorrect row index')
def test_groupby1(self):
    """XGB regression train/predict grouped by species runs end to end."""
    df = load_iris()
    trained = xgb_regression_train(
        df,
        feature_cols=['sepal_length', 'sepal_width', 'petal_length'],
        label_col='petal_width',
        group_by=['species'])
    # Smoke test: prediction must not raise for a grouped model.
    xgb_regression_predict(df, trained['model'])
def test3(self):
    """Conditional expression column: setosa -> 1.0, virginica -> 2.0, else 0.0."""
    df = load_iris()
    print(df)
    conditions = ['''species == 'setosa' ''', '''species == 'virginica' ''']
    out = add_expression_column_if(df, 'encoded_species', conditions,
                                   ['1.0', '2.0'], '0.0')['out_table']
    # Print the slice spanning the setosa/versicolor boundary for inspection.
    print(out['encoded_species'][48:102])
def test_frac_replace(self):
    """Fraction-based sampling with replacement and a fixed seed."""
    iris = load_iris()
    sampled = random_sampling(table=iris,
                              num_or_frac='frac',
                              num=10,
                              frac=50,
                              replace=True,
                              seed=12345)['table'].reset_index(drop=True)
    expected = [
        [4.7, 3.2, 1.6, 0.2, 'setosa'],
        [7.2, 3.0, 5.8, 1.6, 'virginica'],
        [6.2, 2.8, 4.8, 1.8, 'virginica'],
        [5.8, 2.7, 5.1, 1.9, 'virginica'],
        [4.9, 3.1, 1.5, 0.1, 'setosa'],
    ]
    for i, row in enumerate(expected):
        self.assertListEqual(row, sampled.loc[i].tolist(),
                             'incorrect sample[%d]' % i)
def test_kmeans_groupby1(self):
    """K-means train/predict grouped by species runs end to end."""
    df = load_iris()
    features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    trained = kmeans_train_predict(df, input_cols=features,
                                   group_by=['species'])
    # Smoke test: prediction must not raise for a grouped model.
    kmeans_predict(df, trained['model'])
def test_groupby1(self):
    """GLM train/predict grouped by species; print label vs prediction."""
    df = load_iris()
    trained = glm_train(
        df,
        feature_cols=['sepal_length', 'sepal_width', 'petal_length'],
        label_col='petal_width',
        group_by=['species'])
    predicted = glm_predict(df, trained['model'])
    print(predicted['out_table'][['petal_width', 'prediction']])
def test_groupby1(self):
    """XGB classification grouped by a random binary column runs cleanly."""
    df = load_iris()
    # Assign each row to group 1 or 2 at random.
    df['random_group'] = [random.randint(1, 2) for _ in range(len(df))]
    trained = xgb_classification_train(
        table=df,
        feature_cols=['sepal_length', 'sepal_width', 'petal_length',
                      'petal_width'],
        label_col='species',
        group_by=['random_group'])
    # Smoke test: prediction must not raise for a grouped model.
    xgb_classification_predict(table=df, model=trained['model'])
def test_groupby1(self):
    """Logistic regression grouped by a random binary column; print preview."""
    df = load_iris()
    # Assign each row to group 1 or 2 at random.
    df['random_group'] = [random.randint(1, 2) for _ in range(len(df))]
    trained = logistic_regression_train(
        table=df,
        feature_cols=['sepal_length', 'sepal_width', 'petal_length',
                      'petal_width'],
        label_col='species',
        group_by=['random_group'])
    predicted = logistic_regression_predict(table=df,
                                            model=trained['model'])
    print(predicted['out_table'][['species', 'prediction']])
def test_predict_thresholds(self):
    """SVM prediction accepts per-class probability thresholds."""
    iris = load_iris()
    splitted = split_data(table=iris, train_ratio=0.7, test_ratio=0.3)
    trained = svm_classification_train(
        table=splitted['train_table'],
        feature_cols=['sepal_length', 'sepal_width', 'petal_length',
                      'petal_width'],
        label_col='species')
    # Smoke test: predicting with explicit thresholds must not raise.
    svm_classification_predict(table=splitted['test_table'],
                               model=trained['model'],
                               thresholds=[0.1, 0.2, 0.3])
def setUp(self):
    """Build a test frame: all setosa rows plus long-sepal versicolor rows."""
    print("*** Spc ruleset AD with/without summary UnitTest Start ***")
    testdata = load_iris()
    setosa = testdata[testdata.species == 'setosa']
    versicolor = testdata[testdata.species == 'versicolor']
    versicolor_test = versicolor[versicolor.sepal_length > 6.5]
    # FIX: DataFrame.append was deprecated in pandas 1.4 and removed in
    # pandas 2.0; pd.concat is the supported equivalent.
    testset = pd.concat([setosa, versicolor_test])
    testset = testset.reset_index(drop=True)
    # Synthesize a monotonically increasing time column from the row index.
    testset['time'] = testset.index.astype(int)
    self.testdata = testset
def test_default(self):
    """Quantile discretization of sepal_length into two buckets."""
    iris = load_iris()
    res = discretize_quantile(table=iris,
                              input_col='sepal_length',
                              num_of_buckets=2,
                              out_col_name='bucket_number')
    out_table = res['out_table']
    summary = res['model']['result']
    self.assertListEqual([0, 1, 1, 1, 0, 1, 0, 1, 0, 1],
                         out_table['bucket_number'].tolist()[49:59],
                         'incorrect quantization')
    self.assertListEqual([0, 1], list(summary['bucket number']),
                         'incorrect bucket number')
    self.assertListEqual(['[4.3, 5.8]', '(5.8, 7.9]'],
                         list(summary['buckets']), 'incorrect buckets')
    self.assertListEqual([80, 70], list(summary['count']),
                         'incorrect count')
def get_iris_randomgroup():
    """Return iris with two random group columns: ints {1,2} and {'A','B'}."""
    df = load_iris()
    label_map = {1: 'A', 2: 'B'}
    group1 = []
    group2 = []
    # Two independent draws per row, matching the original draw order.
    for _ in range(len(df)):
        group1.append(random.randint(1, 2))
        group2.append(label_map[random.randint(1, 2)])
    df['random_group1'] = group1
    df['random_group2'] = group2
    return df
def test_inner(self):
    """Inner self-join on mismatched key columns yields an empty table."""
    iris = load_iris()
    joined = join(left_table=iris,
                  right_table=iris,
                  left_on=['species', 'sepal_length'],
                  right_on=['species', 'petal_width'],
                  how='inner',
                  lsuffix='_left',
                  rsuffix='_right',
                  sort=False)['table']
    # sepal_length never equals petal_width within a species, so no matches.
    self.assertTrue(joined.empty, 'inner: incorrect result')
def test1(self):
    """SVM classification on a 70/30 iris split; print predictions."""
    iris = load_iris()
    splitted = split_data(table=iris, train_ratio=0.7, test_ratio=0.3)
    trained = svm_classification_train(
        table=splitted['train_table'],
        feature_cols=['sepal_length', 'sepal_width', 'petal_length',
                      'petal_width'],
        label_col='species')
    predicted = svm_classification_predict(table=splitted['test_table'],
                                           model=trained['model'])
    print(predicted['out_table'][['species', 'prediction']])
def test_optional(self):
    """One-sided ('Less') one-sample t-test on sepal_length at 99% confidence."""
    iris = load_iris()
    result = one_sample_ttest(table=iris,
                              input_cols=['sepal_length'],
                              alternatives=['Less'],
                              hypothesized_mean=5.0,
                              conf_level=0.99)['model']['result']
    self.assertAlmostEqual(12.4732571467, result['t_value'].values[0], 10,
                           'incorrect t-value')
    self.assertAlmostEqual(1., result['p_value'].values[0], 10,
                           'incorrect p-value')
    # A 'Less' alternative has an unbounded lower confidence limit.
    self.assertAlmostEqual(-np.inf,
                           result['lower_confidence_interval'].values[0], 10,
                           'incorrect lower confidence limit')
    self.assertAlmostEqual(6.0023304639,
                           result['upper_confidence_interval'].values[0], 10,
                           'incorrect upper confidence limit')