def test_get_column_statistics():
    """Check the key set returned by ``stats.get_column_statistics``.

    The result must expose either the four base keys alone or the base keys
    extended with the two uniques-related keys.
    """
    base_keys = {
        'missing_values_count', 'format', 'format_specific_statistics', 'type'
    }
    keys_with_uniques = base_keys | {'uniques_count', 'uniques_stats'}

    # One sample mixing NaNs with number-like strings, one with plain text.
    samples = (
        [float('NaN'), '1', '2.0', float('NaN')],
        ['a', 'cd', 'etx', 'a', '22', '22', 'n'],
    )
    for values in samples:
        result = stats.get_column_statistics(pandas.Series(values))
        assert set(result.keys()) in (base_keys, keys_with_uniques)
def test_get_column_statistics_check_missings():
    """Check ``missing_values_count`` for a column with NaNs and one without."""
    with_gaps = pandas.Series([float('NaN'), '1', '2.0', float('NaN')])
    assert stats.get_column_statistics(with_gaps)['missing_values_count'] == 2

    without_gaps = pandas.Series(['0', '1', '2.0', '-1'])
    assert stats.get_column_statistics(without_gaps)['missing_values_count'] == 0
def test_merge_text_column_partitions_stats():
    """Check merged text stats for a column split into two partitions."""
    values = pandas.Series(
        [float('NaN'), 'a', '2.0', 'abc', 'ddddd', '2'], dtype=str
    )
    # Statistics are computed per partition (first three rows, then the rest).
    head_stats = stats.get_column_statistics(values[:3])
    tail_stats = stats.get_column_statistics(values[3:])

    merged = text_column.merge_text_column_partitions_stats(
        [head_stats, tail_stats]
    )

    expected = {'min_len': 1, 'max_len': 5}
    assert merged == expected
def test_merge_column_partitions_stats():
    """Check ``merge.merge_column_partitions_stats`` on two-partition columns.

    Covers a number-like column and a text column; in both cases the column
    is split after the third row and the per-partition stats are merged.
    """

    def merge_in_two_parts(values):
        """Return merged stats of ``values[:3]`` and ``values[3:]``."""
        head_stats = stats.get_column_statistics(values[:3])
        tail_stats = stats.get_column_statistics(values[3:])
        return merge.merge_column_partitions_stats([head_stats, tail_stats])

    # --- number-like column -------------------------------------------------
    numeric_values = pandas.Series(
        [float('NaN'), '1', '2.0', '3', float('NaN'), '2', float('NaN')],
        dtype=str
    )
    numeric_merged = merge_in_two_parts(numeric_values)
    # The uniques Series is compared separately from the plain-dict part.
    numeric_uniques = numeric_merged.pop('uniques_stats')
    expected_numeric = {
        'format': DataFormats.numerical,
        'missing_values_count': 3,
        'type': DataTypes.categorical,
        'uniques_count': 3,
        'format_specific_statistics': {
            'sum': 8.0,
            'min': 1.0,
            'max': 3,
        },
    }
    assert numeric_merged == expected_numeric

    expected_numeric_uniques = pandas.Series(
        [1.0, 2.0, 1.0], index=pandas.Index([1.0, 2.0, 3.0])
    )
    assert all(
        numeric_uniques.sort_index() == expected_numeric_uniques.sort_index()
    )

    # --- text column --------------------------------------------------------
    text_values = pandas.Series(
        [float('NaN'), float('NaN'), float('NaN'), 'a', '2.0', 'abc', 'ddddd', '2'],
        dtype=str
    )
    text_merged = merge_in_two_parts(text_values)
    text_uniques = text_merged.pop('uniques_stats')
    expected_text = {
        'format': DataFormats.character,
        'missing_values_count': 3,
        'type': DataTypes.categorical,
        'uniques_count': 5,
        'format_specific_statistics': {
            'min_len': 1,
            'max_len': 5
        },
    }
    assert text_merged == expected_text

    expected_text_uniques = pandas.Series(
        [1.0, 1.0, 1.0, 1.0, 1.0],
        index=pandas.Index(['2', '2.0', 'a', 'abc', 'ddddd'])
    )
    assert all(
        text_uniques.sort_index() == expected_text_uniques.sort_index()
    )
def test_merge_numerical_column_partitions_stats():
    """Check merged min/max/sum for a number-like column in two partitions."""
    values = pandas.Series(
        [float('NaN'), '1', '2.0', '3', '2', float('NaN')], dtype=str
    )
    # Split after the third row and compute statistics for each half.
    per_partition = [
        stats.get_column_statistics(chunk)
        for chunk in (values[:3], values[3:])
    ]

    merged = numerical_column.merge_numerical_column_partitions_stats(
        per_partition
    )

    assert merged == {'min': 1, 'max': 3, 'sum': 8.0}
def test_merge_column_missing_values_count():
    """Check that missing-value counts of two partitions merge to 3."""
    values = pandas.Series(
        [float('NaN'), '1', '2.0', '3', float('NaN'), '2', float('NaN')],
        dtype=str
    )
    head_stats = stats.get_column_statistics(values[:3])
    tail_stats = stats.get_column_statistics(values[3:])

    total_missing = merge.merge_column_missing_values_count(
        [head_stats, tail_stats]
    )

    assert total_missing == 3
def test_get_column_statistics_check_format():
    """Check the ``format`` entry reported for several sample columns."""
    # (column values, expected DataFormats member)
    cases = (
        (['0', '1', '2', '-1'], DataFormats.numerical),
        (['0.8', '1', '2.0', '0'], DataFormats.numerical),
        (['s', '1', '2.0', float('NaN')], DataFormats.character),
    )
    for values, expected_format in cases:
        result = stats.get_column_statistics(pandas.Series(values))
        # Identity check: the result must be the enum member itself.
        assert result['format'] is expected_format
def test_merge_text_column_min_len_max_len():
    """Check merged min/max text length across two column partitions."""
    values = pandas.Series(
        ['a', '2.0', 'abc', 'ddddd', '2', float('NaN')], dtype=str
    )
    head_stats = stats.get_column_statistics(values[:3])
    tail_stats = stats.get_column_statistics(values[3:])

    # Only the format-specific part of each partition's stats is merged here.
    specific_stats = [
        head_stats['format_specific_statistics'],
        tail_stats['format_specific_statistics'],
    ]
    shortest, longest = text_column.merge_text_column_min_len_max_len(
        specific_stats
    )

    assert shortest == 1
    assert longest == 5
def test_merge_column_uniques_count():
    """Check merged uniques count and per-value counts for two partitions."""
    first_values = pandas.Series(
        [float('NaN'), '1', '2.0', float('NaN'), '3'], dtype=str
    )
    second_values = pandas.Series(
        [float('NaN'), '2', '3', float('NaN')], dtype=str
    )
    partition_stats = [
        stats.get_column_statistics(first_values),
        stats.get_column_statistics(second_values),
    ]

    merged = categorical_column.merge_column_uniques_count(partition_stats)
    merged_count = merged['uniques_count']
    merged_value_counts = merged['uniques_stats']

    assert merged_count == 3

    expected_value_counts = pandas.Series(
        [1.0, 2.0, 2.0], index=[1.0, 2.0, 3.0]
    )
    assert all(
        merged_value_counts.sort_index() == expected_value_counts.sort_index()
    )
def test_get_column_statistics_numeric_specific_statistics_sum():
    """Check the ``sum`` statistic of a number-like column."""
    values = pandas.Series(['0.8', '1', '2', '-1'])
    numeric_stats = stats.get_column_statistics(values)[
        'format_specific_statistics'
    ]

    # NOTE(review): exact equality against a float32 value; presumably the
    # implementation accumulates in float32 -- confirm before changing dtypes.
    expected_sum = numpy.float32(2.8)
    assert numeric_stats['sum'] == expected_sum
def test_get_column_statistics_numeric_specific_statistics_min_max():
    """Check the ``min`` and ``max`` statistics of a number-like column."""
    values = pandas.Series(['0.8', '1', '2', '-1'])
    numeric_stats = stats.get_column_statistics(values)[
        'format_specific_statistics'
    ]

    assert numeric_stats['min'] == -1
    assert numeric_stats['max'] == 2
def test_get_column_statistics_detect_categorical():
    """Check that a short column with repeated values is typed categorical."""
    values = pandas.Series(['a', 'cd', 'etx', 'a', '22', '22', 'n'])
    result = stats.get_column_statistics(values)

    assert result['type'] is DataTypes.categorical
    # Five distinct values: 'a', 'cd', 'etx', '22', 'n'.
    assert result['uniques_count'] == 5
def test_get_column_statistics_detect_continuous():
    """Check that a column of 1002 distinct number strings is typed continuous."""
    distinct_numbers = numpy.arange(1002).astype(str)
    result = stats.get_column_statistics(pandas.Series(distinct_numbers))

    assert result['type'] is DataTypes.continuous
def test_get_column_statistics_text_specific_statistics_min_len_max_len():
    """Check ``min_len``/``max_len`` for a text column containing a NaN."""
    values = pandas.Series(['a', float('NaN'), 'etx09', 'a', '22', '22', 'n'])
    text_stats = stats.get_column_statistics(values)[
        'format_specific_statistics'
    ]

    assert text_stats['min_len'] == 1
    assert text_stats['max_len'] == 5