def test_stats_gen_with_header_in_empty_csv_file(self):
    """Checks that an empty CSV file is rejected when the header is needed.

    An empty 'input_data.csv' is written to a temp directory and statistics
    generation is invoked with column_names=None. The call is expected to
    raise ValueError with a message starting with
    'Found empty file when reading the header'.
    """
    empty_csv_path = self._write_records_to_csv(
        [], self._get_temp_dir(), 'input_data.csv')
    # assertRaisesRegex is the supported spelling; the assertRaisesRegexp
    # alias was deprecated in Python 3.2 and removed in Python 3.12.
    with self.assertRaisesRegex(
            ValueError, 'Found empty file when reading the header.*'):
        _ = stats_gen_lib.generate_statistics_from_csv(
            data_location=empty_csv_path,
            column_names=None,
            delimiter=',')
def test_stats_gen_with_csv_tab_delimiter_no_header_in_file(self):
    """Generates statistics from a header-less, tab-delimited file.

    The fixture rows are written to 'input_data.tsv' and the column names
    returned by the fixture are passed explicitly. The resulting statistics
    are compared against the fixture's expected proto.
    """
    rows, column_names, want = self._get_csv_test(
        delimiter='\t', with_header=False)
    tsv_path = self._write_records_to_csv(
        rows, self._get_temp_dir(), 'input_data.tsv')

    got = stats_gen_lib.generate_statistics_from_csv(
        data_location=tsv_path,
        column_names=column_names,
        delimiter='\t',
        stats_options=self._default_stats_options)

    # Build the proto matcher from the expected result and apply it at once.
    test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, want)([got])
def test_stats_gen_with_invalid_csv_header_in_multiple_files(self):
    """Checks that CSV files with mismatched headers are rejected.

    Two files are written to the same temp directory: the first keeps the
    header row returned by the fixture, the second starts with an unrelated
    'random,header' row. Generating statistics over a glob that matches both
    files, with column_names=None, is expected to raise ValueError.
    """
    records, _, _ = self._get_csv_test(delimiter=',', with_header=True)
    header = records.pop(0)
    # Split the records into two subsets and write to separate files.
    records1 = [header] + records[0:3]
    records2 = ['random,header'] + records[3:]
    tmp_dir = self._get_temp_dir()
    self._write_records_to_csv(records1, tmp_dir, 'input_data1.csv')
    self._write_records_to_csv(records2, tmp_dir, 'input_data2.csv')
    # Build the glob with os.path.join instead of hard-coding a '/' separator.
    input_data_path = os.path.join(tmp_dir, 'input_data*')
    # assertRaisesRegex is the supported spelling; the assertRaisesRegexp
    # alias was deprecated in Python 3.2 and removed in Python 3.12.
    with self.assertRaisesRegex(
            ValueError, 'Files have different headers.'):
        _ = stats_gen_lib.generate_statistics_from_csv(
            data_location=input_data_path,
            column_names=None,
            delimiter=',')
def test_stats_gen_with_csv_missing_column(self):
    """Generates statistics for rows whose cells are all empty.

    Two rows of ',' are written and parsed with explicit column names
    'feature1' and 'feature2'. Both features are expected to be reported as
    STRING with num_missing equal to 2.
    """
    csv_path = self._write_records_to_csv(
        [',', ','], self._get_temp_dir(), 'input_data.csv')

    # Adjacent literals concatenate to the exact text-format proto string.
    want_text = (
        ' datasets {'
        ' num_examples: 2'
        ' features {'
        ' path { step: "feature1" }'
        ' type: STRING'
        ' string_stats { common_stats { num_missing: 2 } }'
        ' }'
        ' features {'
        ' path { step: "feature2" }'
        ' type: STRING'
        ' string_stats { common_stats { num_missing: 2 } }'
        ' }'
        ' } '
    )
    want = text_format.Parse(
        want_text, statistics_pb2.DatasetFeatureStatisticsList())

    got = stats_gen_lib.generate_statistics_from_csv(
        data_location=csv_path,
        column_names=['feature1', 'feature2'],
        delimiter=',',
        stats_options=self._default_stats_options)

    test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, want)([got])
def test_stats_gen_with_csv_header_in_multiple_files(self):
    """Generates statistics from two CSV files that share one header.

    The fixture rows are split across 'input_data1.csv' and
    'input_data2.csv', each starting with the same header row. Statistics
    are generated over a glob matching both files with column_names=None and
    compared against the fixture's expected proto.
    """
    rows, _, want = self._get_csv_test(delimiter=',', with_header=True)
    header_row = rows.pop(0)

    # First three data rows go to file 1, the remainder to file 2; both
    # files carry the header.
    shards = (
        ('input_data1.csv', [header_row] + rows[0:3]),
        ('input_data2.csv', [header_row] + rows[3:]),
    )
    out_dir = self._get_temp_dir()
    for file_name, shard_rows in shards:
        self._write_records_to_csv(shard_rows, out_dir, file_name)

    got = stats_gen_lib.generate_statistics_from_csv(
        data_location=os.path.join(out_dir, 'input_data*'),
        column_names=None,
        delimiter=',',
        stats_options=self._default_stats_options)

    test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, want)([got])
def test_stats_gen_with_csv_no_header_in_file(self, compression_type):
    """Generates statistics from a header-less CSV, optionally compressed.

    Args:
      compression_type: One of CompressionTypes.AUTO or CompressionTypes.GZIP;
        any other value fails the lookup below with KeyError. Presumably
        supplied by a parameterized-test decorator that is not visible in
        this chunk -- confirm against the surrounding file.
    """
    rows, column_names, want = self._get_csv_test(
        delimiter=',', with_header=False)

    # Translate the reader-side compression type into the argument the
    # CSV-writing helper takes.
    writer_compression_by_type = {
        CompressionTypes.AUTO: '',
        CompressionTypes.GZIP: 'gzip'
    }
    out_dir = self._get_temp_dir()
    writer_compression = writer_compression_by_type[compression_type]
    csv_path = self._write_records_to_csv(
        rows, out_dir, 'input_data.csv',
        compression_type=writer_compression)

    got = stats_gen_lib.generate_statistics_from_csv(
        data_location=csv_path,
        column_names=column_names,
        delimiter=',',
        stats_options=self._default_stats_options,
        compression_type=compression_type)

    test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, want)([got])
def test_stats_gen_with_csv_with_schema(self):
    """Generates statistics with the feature type taken from a schema.

    A CSV holding the header 'feature1' and the single value '1' is read
    with a schema that declares feature1 as BYTES and with
    infer_type_from_schema enabled. The feature is expected to be reported
    as STRING with string statistics.
    """
    csv_path = self._write_records_to_csv(
        ['feature1', '1'], self._get_temp_dir(), 'input_data.csv')

    bytes_schema = text_format.Parse(
        ' feature { name: "feature1" type: BYTES } ', schema_pb2.Schema())

    # Adjacent literals concatenate to the exact text-format proto string.
    want_text = (
        ' datasets {'
        ' num_examples: 1'
        ' features {'
        ' path { step: "feature1" }'
        ' type: STRING'
        ' string_stats {'
        ' common_stats {'
        ' num_non_missing: 1'
        ' min_num_values: 1'
        ' max_num_values: 1'
        ' avg_num_values: 1.0'
        ' num_values_histogram {'
        ' buckets { low_value: 1.0 high_value: 1.0 sample_count: 0.5 }'
        ' buckets { low_value: 1.0 high_value: 1.0 sample_count: 0.5 }'
        ' type: QUANTILES'
        ' }'
        ' tot_num_values: 1'
        ' }'
        ' unique: 1'
        ' top_values { value: "1" frequency: 1.0 }'
        ' avg_length: 1.0'
        ' rank_histogram { buckets { label: "1" sample_count: 1.0 } }'
        ' }'
        ' }'
        ' } '
    )
    want = text_format.Parse(
        want_text, statistics_pb2.DatasetFeatureStatisticsList())

    # These assignments modify the options object stored on self.
    options = self._default_stats_options
    options.schema = bytes_schema
    options.infer_type_from_schema = True

    got = stats_gen_lib.generate_statistics_from_csv(
        data_location=csv_path,
        delimiter=',',
        stats_options=self._default_stats_options)

    test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, want)([got])