def test_analyze_dataset_empty_column(self):
    """Analyse a frame containing an all-``None`` column and check it is flagged empty.

    The frame has 100 rows: a populated integer column plus ``empty_column``,
    whose every value is ``None``. The analysis entry for ``empty_column`` must
    carry ``is_empty`` set to exactly ``True``.
    """
    predictor = Predictor(name='test')

    row_count = 100
    row_labels = list(range(row_count))
    frame = pd.DataFrame(
        {
            'numeric_int': list(range(row_count)),
            # The column under test: nothing but missing values.
            'empty_column': [None] * row_count,
        },
        index=row_labels,
    )

    analysis = predictor.analyse_dataset(from_data=frame)

    empty_report = analysis['data_analysis_v2']['empty_column']['empty']
    assert empty_report['is_empty'] is True
def test_analyze_dataset_empty_values(self):
    """Check that a half-missing column is reported with an empty percentage of 50.

    The frame has 100 rows and a single ``numeric_int`` column in which every
    second value (positions 0, 2, 4, ...) is missing and the remaining
    positions hold their own row number.
    """
    predictor = Predictor(name='test')

    n_points = 100
    # Put the gaps into the column at construction time rather than via
    # ``df['numeric_int'].iloc[::2] = None``. That form is a chained
    # assignment: it writes through an intermediate Series, triggers pandas'
    # chained-assignment warning, and leaves the frame unmodified when
    # Copy-on-Write is active -- which would silently turn this test into one
    # that never sees a missing value.
    half_missing = [None if i % 2 == 0 else i for i in range(n_points)]
    input_dataframe = pd.DataFrame(
        {'numeric_int': half_missing},
        index=list(range(n_points)),
    )

    model_data = predictor.analyse_dataset(from_data=input_dataframe)

    empty_report = model_data['data_analysis_v2']['numeric_int']['empty']
    assert empty_report['empty_percentage'] == 50
def test_analyze_dataset(self):
    """Analyse a frame with several differently-shaped columns and verify each report.

    For every column present in the analysis output, the reported data type and
    subtype must match the pair stored for that column in
    ``test_column_types``, the ``empty`` and ``histogram`` entries must be
    truthy, ``percentage_buckets`` and ``nr_warnings`` must be present, and
    ``is_foreign_key`` must be falsy.
    """
    predictor = Predictor(name='test')

    n_points = 100
    n_category_values = 4
    # 0..n_category_values-1 repeated until it spans the whole frame.
    category_cycle = list(range(n_category_values)) * (n_points // n_category_values)

    columns = {
        'numeric_int': list(range(n_points)),
        'numeric_float': np.linspace(0, n_points, n_points),
        'date_timestamp': [
            (datetime.now() - timedelta(minutes=int(offset))).isoformat()
            for offset in range(n_points)
        ],
        'date_date': [
            (datetime.now() - timedelta(days=offset)).strftime('%Y-%m-%d')
            for offset in range(n_points)
        ],
        'categorical_str': [f'a{value}' for value in category_cycle],
        'categorical_int': list(category_cycle),
        'categorical_binary': [0, 1] * (n_points // 2),
        'sequential_array': [f"1,2,3,4,5,{i}" for i in range(n_points)],
        'sequential_text': [f'lorem ipsum long text {i}' for i in range(n_points)],
    }
    input_dataframe = pd.DataFrame(columns, index=list(range(n_points)))

    model_data = predictor.analyse_dataset(from_data=input_dataframe)

    for column_name, report in model_data['data_analysis_v2'].items():
        # Position 0 is the expected type, position 1 the expected subtype.
        expected = test_column_types[column_name]
        expected_type = expected[0]
        expected_subtype = expected[1]

        typing_report = report['typing']
        assert typing_report['data_type'] == expected_type
        assert typing_report['data_subtype'] == expected_subtype

        assert report['empty']
        assert report['histogram']
        for required_key in ('percentage_buckets', 'nr_warnings'):
            assert required_key in report
        assert not report['is_foreign_key']