def test_csv_decoder_with_schema(self):
  """DecodeCSV uses the schema's types when infer_type_from_schema is True."""
  input_lines = ['1,1,2.0,hello', '5,5,12.34,world']
  column_names = [
      'int_feature_parsed_as_float', 'int_feature', 'float_feature',
      'str_feature'
  ]
  schema = text_format.Parse(
      """
      feature { name: "int_feature_parsed_as_float" type: FLOAT }
      feature { name: "int_feature" type: INT }
      feature { name: "float_feature" type: FLOAT }
      feature { name: "str_feature" type: BYTES }
      """, schema_pb2.Schema())
  # The first column holds ints in the input but the schema forces FLOAT.
  expected_table = pa.Table.from_arrays([
      pa.array([[1], [5]], pa.list_(pa.float32())),
      pa.array([[1], [5]], pa.list_(pa.int64())),
      pa.array([[2.0], [12.34]], pa.list_(pa.float32())),
      pa.array([[b'hello'], [b'world']], pa.list_(pa.binary())),
  ], column_names)
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | beam.Create(input_lines)
        | csv_decoder.DecodeCSV(
            column_names=column_names,
            schema=schema,
            infer_type_from_schema=True))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, [expected_table]))
def test_batch_examples(self):
  """In-memory examples are batched into Arrow tables of size 2."""
  examples = [
      {
          'a': np.array([1.0, 2.0], dtype=np.float32),
          'b': np.array(['a', 'b', 'c', 'e']),
      },
      {
          'a': np.array([3.0, 4.0, 5.0], dtype=np.float32),
      },
      {
          'b': np.array(['d', 'e', 'f']),
          'd': np.array([10, 20, 30], dtype=np.int64),
      },
      {
          'b': np.array(['a', 'b', 'c']),
      },
      {
          'c': np.array(['d', 'e', 'f']),
      },
  ]
  # A feature absent from an example shows up as a null row in its batch.
  expected_tables = [
      pa.Table.from_arrays([
          pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]],
                   type=pa.list_(pa.float32())),
          pa.array([['a', 'b', 'c', 'e'], None]),
      ], ['a', 'b']),
      pa.Table.from_arrays([
          pa.array([['d', 'e', 'f'], ['a', 'b', 'c']]),
          pa.array([[10, 20, 30], None], type=pa.list_(pa.int64())),
      ], ['b', 'd']),
      pa.Table.from_arrays([pa.array([['d', 'e', 'f']])], ['c']),
  ]
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | beam.Create(examples)
        | batch_util.BatchExamplesToArrowTables(desired_batch_size=2))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, expected_tables))
def test_csv_decoder_with_int_and_float_in_same_column(self):
  """A column that mixes ints and floats decodes as float32."""
  input_lines = ['2,1.5', '1.5,2']
  column_names = ['float_feature1', 'float_feature2']
  expected_table = pa.Table.from_arrays([
      pa.array([[2.0], [1.5]], pa.list_(pa.float32())),
      pa.array([[1.5], [2.0]], pa.list_(pa.float32())),
  ], column_names)
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | beam.Create(input_lines)
        | csv_decoder.DecodeCSV(column_names=column_names))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, [expected_table]))
def test_csv_decoder_with_float_and_string_in_same_column(self):
  """A column that mixes floats and strings decodes as bytes."""
  input_lines = ['2.3,abc', 'abc,2.3']
  column_names = ['str_feature1', 'str_feature2']
  expected_table = pa.Table.from_arrays([
      pa.array([[b'2.3'], [b'abc']], pa.list_(pa.binary())),
      pa.array([[b'abc'], [b'2.3']], pa.list_(pa.binary())),
  ], column_names)
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | beam.Create(input_lines)
        | csv_decoder.DecodeCSV(column_names=column_names))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, [expected_table]))
def test_csv_decoder_with_tab_delimiter(self):
  """Tab-delimited input is split on tabs; quoted embedded tabs survive."""
  input_lines = ['1\t"this is a \ttext"', '5\t']
  column_names = ['int_feature', 'str_feature']
  expected_table = pa.Table.from_arrays([
      pa.array([[1], [5]], pa.list_(pa.int64())),
      # Second row's trailing empty field decodes to null.
      pa.array([[b'this is a \ttext'], None], pa.list_(pa.binary())),
  ], column_names)
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | beam.Create(input_lines)
        | csv_decoder.DecodeCSV(column_names=column_names, delimiter='\t'))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, [expected_table]))
def test_csv_decoder_skip_blank_line(self):
  """Blank input lines are dropped by default."""
  input_lines = ['', '1,2']
  column_names = ['int_feature1', 'int_feature2']
  expected_table = pa.Table.from_arrays([
      pa.array([[1]], pa.list_(pa.int64())),
      pa.array([[2]], pa.list_(pa.int64())),
  ], column_names)
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | beam.Create(input_lines)
        | csv_decoder.DecodeCSV(column_names=column_names))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, [expected_table]))
def test_csv_decoder_consider_blank_line(self):
  """With skip_blank_lines=False a blank line becomes an all-null row."""
  input_lines = ['', '1,2.0']
  column_names = ['int_feature', 'float_feature']
  expected_table = pa.Table.from_arrays([
      pa.array([None, [1]], pa.list_(pa.int64())),
      pa.array([None, [2.0]], pa.list_(pa.float32())),
  ], column_names)
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | beam.Create(input_lines)
        | csv_decoder.DecodeCSV(
            column_names=column_names, skip_blank_lines=False))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, [expected_table]))
def test_csv_decoder_empty_row(self):
  """A row of empty fields yields nulls; types come from the non-empty row."""
  input_lines = [',,', '1,2.0,hello']
  column_names = ['int_feature', 'float_feature', 'str_feature']
  # NOTE: the first column decodes as float32 here (value [1.0]) even
  # though it is named int_feature — preserved from the original test.
  expected_table = pa.Table.from_arrays([
      pa.array([None, [1.0]], pa.list_(pa.float32())),
      pa.array([None, [2.0]], pa.list_(pa.float32())),
      pa.array([None, [b'hello']], pa.list_(pa.binary())),
  ], column_names)
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | beam.Create(input_lines)
        | csv_decoder.DecodeCSV(column_names=column_names))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, [expected_table]))
def test_get_flattened_array_parent_indices(self):
  """Each flattened value maps back to the index of its source list."""
  empty_list_array = pa.array([], type=pa.list_(pa.int32()))
  self.assertTrue(
      arrow_util.GetFlattenedArrayParentIndices(empty_list_array).equals(
          pa.array([], type=pa.int32())))
  # The empty sub-list at index 2 contributes no parent index.
  nested = pa.array([[1.], [2.], [], [3.]])
  self.assertTrue(
      arrow_util.GetFlattenedArrayParentIndices(nested).equals(
          pa.array([0, 1, 3], type=pa.int32())))
def test_csv_decoder(self):
  """Basic end-to-end decoding with per-column type inference."""
  input_lines = ['1,2.0,hello', '5,12.34,world']
  column_names = ['int_feature', 'float_feature', 'str_feature']
  expected_table = pa.Table.from_arrays([
      pa.array([[1], [5]], pa.list_(pa.int64())),
      pa.array([[2.0], [12.34]], pa.list_(pa.float32())),
      pa.array([[b'hello'], [b'world']], pa.list_(pa.binary())),
  ], column_names)
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        # reshuffle=False keeps the element order deterministic.
        | beam.Create(input_lines, reshuffle=False)
        | csv_decoder.DecodeCSV(column_names=column_names))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, [expected_table]))
def test_csv_decoder_with_space_delimiter(self):
  """Space-delimited input keeps quoted commas inside one field."""
  input_lines = ['1 "ab,cd,ef"', '5 "wx,xy,yz"']
  column_names = ['int_feature', 'str_feature']
  expected_table = pa.Table.from_arrays([
      pa.array([[1], [5]], pa.list_(pa.int64())),
      pa.array([[b'ab,cd,ef'], [b'wx,xy,yz']], pa.list_(pa.binary())),
  ], column_names)
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | beam.Create(input_lines)
        | csv_decoder.DecodeCSV(column_names=column_names, delimiter=' '))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, [expected_table]))
def test_flatten_list_array(self):
  """FlattenListArray drops the list structure, keeping values in order."""
  flat_empty = arrow_util.FlattenListArray(
      pa.array([], type=pa.list_(pa.int64())))
  self.assertTrue(flat_empty.equals(pa.array([], type=pa.int64())))
  flat = arrow_util.FlattenListArray(pa.array([[1.], [2.], [], [3.]]))
  self.assertTrue(flat.equals(pa.array([1., 2., 3.])))
def test_csv_decoder_missing_values(self):
  """Empty CSV fields decode to null entries in the Arrow columns."""
  input_lines = ['1,,hello', ',12.34,']
  column_names = ['int_feature', 'float_feature', 'str_feature']
  expected_table = pa.Table.from_arrays([
      pa.array([[1], None], pa.list_(pa.int64())),
      pa.array([None, [12.34]], pa.list_(pa.float32())),
      pa.array([[b'hello'], None], pa.list_(pa.binary())),
  ], column_names)
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | beam.Create(input_lines)
        | csv_decoder.DecodeCSV(column_names=column_names))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, [expected_table]))
def test_basic_stats_generator_invalid_value_numpy_dtype(self):
  """An unsupported arrow value type (date32) raises TypeError."""
  batches = [
      pa.Table.from_arrays([pa.array([[]], type=pa.list_(pa.date32()))],
                           ['a'])
  ]
  generator = basic_stats_generator.BasicStatsGenerator()
  # assertRaisesRegexp was deprecated since Python 3.2 and removed in
  # Python 3.12; assertRaisesRegex is the supported spelling.
  with self.assertRaisesRegex(TypeError,
                              'Feature a has unsupported arrow type'):
    self.assertCombinerOutputEqual(batches, generator, None)
def _process_column_infos(self, column_infos: List[csv_decoder.ColumnInfo]):
  """Caches per-column value handlers, arrow types and names.

  For each column, picks a callable that converts a raw CSV cell into a
  single-element tuple of the column's statistics type (INT/FLOAT/STRING),
  or a callable returning None for unrecognized types.
  """
  handlers = []
  arrow_types = []
  for info in column_infos:
    if info.type == statistics_pb2.FeatureNameStatistics.INT:
      handler = lambda v: (int(v),)
      arrow_type = pa.list_(pa.int64())
    elif info.type == statistics_pb2.FeatureNameStatistics.FLOAT:
      handler = lambda v: (float(v),)
      arrow_type = pa.list_(pa.float32())
    elif info.type == statistics_pb2.FeatureNameStatistics.STRING:
      handler = lambda v: (v,)
      arrow_type = pa.list_(pa.binary())
    else:
      # Unknown column type: values are dropped (handler yields None).
      handler = lambda _: None
      arrow_type = pa.null()
    handlers.append(handler)
    arrow_types.append(arrow_type)
  self._column_handlers = handlers
  self._column_arrow_types = arrow_types
  self._column_names = [info.name for info in column_infos]
def test_list_lengths(self):
  """ListLengthsFromListArray gives per-row lengths; null rows count as 0."""
  lengths = arrow_util.ListLengthsFromListArray(
      pa.array([], type=pa.list_(pa.int64())))
  self.assertTrue(lengths.equals(pa.array([], type=pa.int32())))
  lengths = arrow_util.ListLengthsFromListArray(
      pa.array([[1., 2.], [], [3.]]))
  self.assertTrue(lengths.equals(pa.array([2, 0, 1], type=pa.int32())))
  # A null row is reported with length 0, same as an empty list.
  lengths = arrow_util.ListLengthsFromListArray(
      pa.array([[1., 2.], None, [3.]]))
  self.assertTrue(lengths.equals(pa.array([2, 0, 1], type=pa.int32())))
def test_topk_uniques_combiner_zero_row(self):
  """A zero-row batch produces no top-k/uniques statistics at all."""
  zero_row_batch = pa.Table.from_arrays(
      [pa.array([], type=pa.list_(pa.binary()))], ['f1'])
  generator = (
      top_k_uniques_combiner_stats_generator
      .TopKUniquesCombinerStatsGenerator(
          num_top_values=4, num_rank_histogram_buckets=3))
  self.assertCombinerOutputEqual([zero_row_batch], generator, {})
def _to_topk_tuples(sliced_table, categorical_features, weight_feature=None):
  """Generates tuples for computing top-k and uniques from input tables.

  Args:
    sliced_table: A (slice_key, pa.Table) pair.
    categorical_features: Container of feature paths (types.FeaturePath)
      to always treat as categorical, regardless of arrow value type.
    weight_feature: Optional name of a column holding per-example weights.

  Yields:
    With a weight feature and non-empty values:
      ((slice_key, feature_path_steps, value), (count, weight)) tuples.
    Otherwise:
      ((slice_key, feature_path_steps, value), count) tuples.
  """
  slice_key, table = sliced_table
  # NOTE(review): chunk(0) assumes every column is single-chunk —
  # presumably guaranteed by upstream batching; verify against callers.
  weight_column = table.column(weight_feature) if weight_feature else None
  weight_array = weight_column.data.chunk(0) if weight_column else []
  if weight_array:
    flattened_weights = arrow_util.FlattenListArray(
        weight_array).to_numpy()
  for feature_column in table.columns:
    feature_name = feature_column.name
    # Skip the weight feature.
    if feature_name == weight_feature:
      continue
    feature_path = types.FeaturePath([feature_name])
    # if it's not a categorical feature nor a string feature, we don't bother
    # with topk stats.
    if not (feature_path in categorical_features or
            feature_column.type.equals(pa.list_(pa.binary())) or
            feature_column.type.equals(pa.list_(pa.string()))):
      continue
    value_array = feature_column.data.chunk(0)
    flattened_values = arrow_util.FlattenListArray(value_array)
    if weight_array and flattened_values:
      if (pa.types.is_binary(flattened_values.type) or
          pa.types.is_string(flattened_values.type)):
        # no free conversion.
        flattened_values_np = flattened_values.to_pandas()
      else:
        flattened_values_np = flattened_values.to_numpy()
      # Align each flattened value with the weight of its parent row.
      indices = arrow_util.GetFlattenedArrayParentIndices(value_array)
      weights_ndarray = flattened_weights[indices.to_numpy()]
      for value, count, weight in _weighted_unique(
          flattened_values_np, weights_ndarray):
        yield (slice_key, feature_path.steps(), value), (count, weight)
    else:
      # Unweighted path: let arrow compute per-value frequencies.
      value_counts = arrow_util.ValueCounts(flattened_values)
      values = value_counts.field('values').to_pylist()
      counts = value_counts.field('counts').to_pylist()
      for value, count in six.moves.zip(values, counts):
        yield ((slice_key, feature_path.steps(), value), count)
def test_basic_stats_generator_no_value_in_batch(self):
  """Rows present but every value list empty: only common stats emitted."""
  batches = [
      pa.Table.from_arrays(
          [pa.array([[], [], []], type=pa.list_(pa.int64()))], ['a'])
  ]
  # All three rows are non-missing (the lists exist) but contain no values,
  # so each of the 10 quantile buckets gets 3/10 = 0.3 of the row count.
  expected_result = {
      types.FeaturePath(['a']): text_format.Parse(
          """
          path { step: 'a' }
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                type: QUANTILES
              }
            }
          }""", statistics_pb2.FeatureNameStatistics())
  }
  generator = basic_stats_generator.BasicStatsGenerator()
  self.assertCombinerOutputEqual(batches, generator, expected_result)
def test_basic_stats_generator_only_nan(self):
  """A feature whose only value is NaN gets common stats plus NaN counts."""
  # np.nan replaces np.NaN: the NaN alias was removed in NumPy 2.0.
  b1 = pa.Table.from_arrays(
      [pa.array([[np.nan]], type=pa.list_(pa.float32()))], ['a'])
  batches = [b1]
  expected_result = {
      types.FeaturePath(['a']): text_format.Parse(
          """
          path { step: 'a' }
          type: FLOAT
          num_stats {
            common_stats {
              num_non_missing: 1
              min_num_values: 1
              max_num_values: 1
              avg_num_values: 1.0
              tot_num_values: 1
              num_values_histogram {
                buckets { low_value: 1.0 high_value: 1.0 sample_count: 0.5 }
                buckets { low_value: 1.0 high_value: 1.0 sample_count: 0.5 }
                type: QUANTILES
              }
            }
            histograms { num_nan: 1 type: STANDARD }
            histograms { num_nan: 1 type: QUANTILES }
          }
          """, statistics_pb2.FeatureNameStatistics())
  }
  generator = basic_stats_generator.BasicStatsGenerator(
      num_values_histogram_buckets=2,
      num_histogram_buckets=3,
      num_quantiles_histogram_buckets=4)
  self.assertCombinerOutputEqual(batches, generator, expected_result)
def test_csv_decoder_negative_values(self):
  """Negative integers are parsed as int64."""
  input_lines = ['-34', '45']
  column_names = ['feature']
  expected_table = pa.Table.from_arrays(
      [pa.array([[-34], [45]], pa.list_(pa.int64()))], column_names)
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | beam.Create(input_lines)
        | csv_decoder.DecodeCSV(column_names=column_names))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, [expected_table]))
def test_csv_decoder_with_unicode(self):
  """Non-ASCII CSV values decode to their UTF-8 encoded bytes."""
  input_lines = [u'1,שקרכלשהו,22.34,text field']
  column_names = [
      'int_feature', 'unicode_feature', 'float_feature', 'str_feature'
  ]
  # NOTE: the expected table's column-name order differs from the CSV
  # column order — preserved exactly from the original test.
  expected_table = pa.Table.from_arrays([
      pa.array([[1]], pa.list_(pa.int64())),
      pa.array([[22.34]], pa.list_(pa.float32())),
      pa.array([[u'שקרכלשהו'.encode('utf-8')]], pa.list_(pa.binary())),
      pa.array([[b'text field']], pa.list_(pa.binary())),
  ], ['int_feature', 'float_feature', 'unicode_feature', 'str_feature'])
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | beam.Create(input_lines)
        | csv_decoder.DecodeCSV(column_names=column_names))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, [expected_table]))
def test_csv_decoder_int64_max(self):
  """Values up to sys.maxsize still decode as int64."""
  input_lines = ['34', str(sys.maxsize)]
  column_names = ['feature']
  expected_table = pa.Table.from_arrays(
      [pa.array([[34], [sys.maxsize]], pa.list_(pa.int64()))], column_names)
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | beam.Create(input_lines)
        | csv_decoder.DecodeCSV(column_names=column_names))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, [expected_table]))
def test_csv_decoder_large_int_categorical_neg(self):
  """Integers below the int64 range fall back to bytes (categorical)."""
  input_lines = ['34', str(-(sys.maxsize + 2))]
  column_names = ['feature']
  expected_table = pa.Table.from_arrays([
      pa.array([[b'34'], [str(-(sys.maxsize + 2)).encode('utf-8')]],
               pa.list_(pa.binary())),
  ], column_names)
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | beam.Create(input_lines)
        | csv_decoder.DecodeCSV(column_names=column_names))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, [expected_table]))
def test_basic_stats_generator_empty_batch(self):
  """A zero-row batch yields zeroed common stats for the feature."""
  batches = [
      pa.Table.from_arrays([pa.array([], type=pa.list_(pa.binary()))], ['a'])
  ]
  # The binary value type still fixes the feature type to STRING even
  # though there are no rows at all.
  expected_result = {
      types.FeaturePath(['a']): text_format.Parse(
          """
          path { step: 'a' }
          type: STRING
          string_stats {
            common_stats {
              num_non_missing: 0
              tot_num_values: 0
            }
          }
          """, statistics_pb2.FeatureNameStatistics())
  }
  generator = basic_stats_generator.BasicStatsGenerator()
  self.assertCombinerOutputEqual(batches, generator, expected_result)
testcase_name="num_parents_too_small", num_parents=1, parent_indices=np.array([1], dtype=np.int64), values=pa.array([1 ]), expected_error=RuntimeError, expected_error_regexp="Found a parent index 1 while num_parents was 1") ] _MAKE_LIST_ARRAY_TEST_CASES = [ dict(testcase_name="parents_are_all_empty", num_parents=5, parent_indices=np.array([], dtype=np.int64), values=pa.array([], type=pa.int64()), expected=pa.array([None, None, None, None, None], type=pa.list_(pa.int64()))), dict(testcase_name="long_num_parent", num_parents=(long(1) if six.PY2 else 1), parent_indices=np.array([0], dtype=np.int64), values=pa.array([1]), expected=pa.array([[1]])), dict( testcase_name="leading nones", num_parents=3, parent_indices=np.array([2], dtype=np.int64), values=pa.array([1]), expected=pa.array([None, None, [1]]), ), dict(testcase_name="same_parent_and_holes", num_parents=4, parent_indices=np.array([0, 0, 0, 3, 3], dtype=np.int64),
value { float_list { value: [ 4.0 ] } } } feature { key: "float_feature_2" value { float_list { value: [ 5.0, 6.0 ] } } } feature { key: "str_feature_1" value { bytes_list { value: [ 'female' ] } } } feature { key: "str_feature_2" value { bytes_list { value: [ 'string', 'list' ] } } } } ''', 'decoded_table': pa.Table.from_arrays([ pa.array([[0]], pa.list_(pa.int64())), pa.array([[1, 2, 3]], pa.list_(pa.int64())), pa.array([[4.0]], pa.list_(pa.float32())), pa.array([[5.0, 6.0]], pa.list_(pa.float32())), pa.array([[b'female']], pa.list_(pa.binary())), pa.array([[b'string', b'list']], pa.list_(pa.binary())) ], [ 'int_feature_1', 'int_feature_2', 'float_feature_1', 'float_feature_2', 'str_feature_1', 'str_feature_2' ]) }, ]
def test_topk_uniques_combiner_with_numeric_feature(self):
  """Top-k/uniques are computed for the string feature only; the numeric
  feature 'fb' produces no output from this generator."""
  # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
  batches = [
      pa.Table.from_arrays([
          pa.array([['a', 'b', 'c', 'e'], None, ['a', 'c', 'd']]),
          pa.array([[1.0, 2.0, 3.0], [4.0, 5.0], None]),
      ], ['fa', 'fb']),
      pa.Table.from_arrays([
          pa.array([['a', 'a', 'b', 'c', 'd']]),
          # All-null column needs an explicit type; it cannot be inferred.
          pa.array([None], type=pa.list_(pa.float32())),
      ], ['fa', 'fb']),
  ]
  expected_result = {
      types.FeaturePath(['fa']): text_format.Parse(
          """
          path { step: 'fa' }
          type: STRING
          string_stats {
            unique: 5
            top_values { value: 'a' frequency: 4 }
            top_values { value: 'c' frequency: 3 }
            top_values { value: 'd' frequency: 2 }
            top_values { value: 'b' frequency: 2 }
            rank_histogram {
              buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0 }
              buckets { low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0 }
              buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 }
            }
          }""", statistics_pb2.FeatureNameStatistics())
  }
  generator = (top_k_uniques_combiner_stats_generator.
               TopKUniquesCombinerStatsGenerator(
                   num_top_values=4, num_rank_histogram_buckets=3))
  self.assertCombinerOutputEqual(batches, generator, expected_result)
def test_topk_uniques_combiner_with_single_bytes_feature(self):
  """Top-k and uniques over a single bytes feature spanning two batches."""
  # 'fa': 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
  batches = [
      pa.Table.from_arrays([
          pa.array([['a', 'b', 'c', 'e'], ['a', 'c', 'd', 'a']],
                   type=pa.list_(pa.binary()))
      ], ['fa']),
      pa.Table.from_arrays(
          [pa.array([['a', 'b', 'c', 'd']], type=pa.list_(pa.binary()))],
          ['fa'])
  ]
  # Note that if two feature values have the same frequency, the one with the
  # lexicographically larger feature value will be higher in the order.
  expected_result = {
      types.FeaturePath(['fa']): text_format.Parse(
          """
          path { step: 'fa' }
          type: STRING
          string_stats {
            unique: 5
            top_values { value: 'a' frequency: 4 }
            top_values { value: 'c' frequency: 3 }
            top_values { value: 'd' frequency: 2 }
            top_values { value: 'b' frequency: 2 }
            rank_histogram {
              buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0 }
              buckets { low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0 }
              buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 }
            }
          }""", statistics_pb2.FeatureNameStatistics())
  }
  generator = (top_k_uniques_combiner_stats_generator.
               TopKUniquesCombinerStatsGenerator(
                   num_top_values=4, num_rank_histogram_buckets=3))
  self.assertCombinerOutputEqual(batches, generator, expected_result)
def test_batch_serialized_examples(self):
  """Serialized tf.Examples are decoded and batched into Arrow tables."""
  examples = [
      """
      features {
        feature { key: "a" value { float_list { value: [ 1.0, 2.0 ] } } }
        feature { key: "b"
                  value { bytes_list { value: [ 'a', 'b', 'c', 'e' ] } } }
      }""",
      """
      features {
        feature { key: "a"
                  value { float_list { value: [ 3.0, 4.0, 5.0 ] } } }
      }""",
      """
      features {
        feature { key: "b"
                  value { bytes_list { value: [ 'd', 'e', 'f' ] } } }
        feature { key: "d"
                  value { int64_list { value: [ 10, 20, 30 ] } } }
      }""",
      """
      features {
        feature { key: "b"
                  value { bytes_list { value: [ 'a', 'b', 'c' ] } } }
      }""",
      """
      features {
        feature { key: "c"
                  value { bytes_list { value: [ 'd', 'e', 'f' ] } } }
      }""",
  ]
  # Serialize each pbtxt example; the transform under test consumes bytes.
  serialized_examples = [
      text_format.Merge(example_pbtxt, tf.train.Example()).SerializeToString()
      for example_pbtxt in examples
  ]
  # A feature absent from one example in a batch becomes a null row.
  expected_tables = [
      pa.Table.from_arrays([
          pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]],
                   type=pa.list_(pa.float32())),
          pa.array([['a', 'b', 'c', 'e'], None], type=pa.list_(pa.binary()))
      ], ['a', 'b']),
      pa.Table.from_arrays([
          pa.array([['d', 'e', 'f'], ['a', 'b', 'c']],
                   type=pa.list_(pa.binary())),
          pa.array([[10, 20, 30], None], type=pa.list_(pa.int64()))
      ], ['b', 'd']),
      pa.Table.from_arrays(
          [pa.array([['d', 'e', 'f']], type=pa.list_(pa.binary()))], ['c']),
  ]
  with beam.Pipeline() as p:
    result = (p
              | beam.Create(serialized_examples)
              | batch_util.BatchSerializedExamplesToArrowTables(
                  desired_batch_size=2))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, expected_tables))