def test_csv_decoder_consider_blank_line_single_column(self):
  """With skip_blank_lines=False, a blank line becomes a null row."""
  raw_records = ['', '1']
  columns = ['int_feature']
  # The blank first line must surface as a null entry, not be dropped.
  int_column = pa.array([None, [1]], pa.list_(pa.int64()))
  expected = [pa.Table.from_arrays([int_column], ['int_feature'])]
  with beam.Pipeline() as p:
    decoded = (
        p
        | beam.Create(raw_records, reshuffle=False)
        | csv_decoder.DecodeCSV(
            column_names=columns, skip_blank_lines=False))
    util.assert_that(
        decoded, test_util.make_arrow_tables_equal_fn(self, expected))
def test_csv_decoder_with_tab_delimiter(self):
  """A tab delimiter is honored, including tabs inside quoted fields."""
  raw_records = ['1\t"this is a \ttext"', '5\t']
  columns = ['int_feature', 'str_feature']
  int_column = pa.array([[1], [5]], pa.list_(pa.int64()))
  # The quoted tab stays in the value; the trailing empty field is null.
  str_column = pa.array([[b'this is a \ttext'], None], pa.list_(pa.binary()))
  expected = [pa.Table.from_arrays([int_column, str_column], columns)]
  with beam.Pipeline() as p:
    decoded = (
        p
        | beam.Create(raw_records)
        | csv_decoder.DecodeCSV(column_names=columns, delimiter='\t'))
    util.assert_that(
        decoded, test_util.make_arrow_tables_equal_fn(self, expected))
def test_csv_decoder_missing_values(self):
  """Empty CSV fields decode to null entries in every column type."""
  raw_records = ['1,,hello', ',12.34,']
  columns = ['int_feature', 'float_feature', 'str_feature']
  expected = [
      pa.Table.from_arrays([
          pa.array([[1], None], pa.list_(pa.int64())),
          pa.array([None, [12.34]], pa.list_(pa.float32())),
          pa.array([[b'hello'], None], pa.list_(pa.binary())),
      ], columns)
  ]
  with beam.Pipeline() as p:
    decoded = (
        p
        | beam.Create(raw_records)
        | csv_decoder.DecodeCSV(column_names=columns))
    util.assert_that(
        decoded, test_util.make_arrow_tables_equal_fn(self, expected))
def test_csv_decoder_with_space_delimiter(self):
  """A space delimiter leaves commas inside quoted fields untouched."""
  raw_records = ['1 "ab,cd,ef"', '5 "wx,xy,yz"']
  columns = ['int_feature', 'str_feature']
  int_column = pa.array([[1], [5]], pa.list_(pa.int64()))
  str_column = pa.array(
      [[b'ab,cd,ef'], [b'wx,xy,yz']], pa.list_(pa.binary()))
  expected = [pa.Table.from_arrays([int_column, str_column], columns)]
  with beam.Pipeline() as p:
    decoded = (
        p
        | beam.Create(raw_records)
        | csv_decoder.DecodeCSV(column_names=columns, delimiter=' '))
    util.assert_that(
        decoded, test_util.make_arrow_tables_equal_fn(self, expected))
def test_csv_decoder(self):
  """Decodes a simple CSV with int, float and string columns."""
  raw_records = ['1,2.0,hello', '5,12.34,world']
  columns = ['int_feature', 'float_feature', 'str_feature']
  expected = [
      pa.Table.from_arrays([
          pa.array([[1], [5]], pa.list_(pa.int64())),
          pa.array([[2.0], [12.34]], pa.list_(pa.float32())),
          pa.array([[b'hello'], [b'world']], pa.list_(pa.binary())),
      ], columns)
  ]
  with beam.Pipeline() as p:
    decoded = (
        p
        | beam.Create(raw_records, reshuffle=False)
        | csv_decoder.DecodeCSV(column_names=columns))
    util.assert_that(
        decoded, test_util.make_arrow_tables_equal_fn(self, expected))
def _process_column_infos(self, column_infos: List[csv_decoder.ColumnInfo]):
  """Derives per-column value handlers and Arrow types from column infos.

  Populates self._column_handlers, self._column_arrow_types and
  self._column_names, keeping all three aligned by column position.
  """
  # Dispatch table: inferred statistics type -> (value handler, arrow type).
  dispatch = {
      statistics_pb2.FeatureNameStatistics.INT:
          (lambda v: (int(v),), pa.list_(pa.int64())),
      statistics_pb2.FeatureNameStatistics.FLOAT:
          (lambda v: (float(v),), pa.list_(pa.float32())),
      statistics_pb2.FeatureNameStatistics.STRING:
          (lambda v: (v,), pa.list_(pa.binary())),
  }
  handlers = []
  arrow_types = []
  for info in column_infos:
    # Unknown types get a handler that drops the value and a null type.
    handler, arrow_type = dispatch.get(info.type, (lambda _: None, pa.null()))
    handlers.append(handler)
    arrow_types.append(arrow_type)
  self._column_handlers = handlers
  self._column_arrow_types = arrow_types
  self._column_names = [info.name for info in column_infos]
def test_csv_decoder_with_unicode(self):
  """Non-ASCII text decodes to its UTF-8 bytes."""
  raw_records = [u'1,שקרכלשהו,22.34,text field']
  columns = [
      'int_feature', 'unicode_feature', 'float_feature', 'str_feature'
  ]
  # NOTE(review): the expected table lists columns in a different order than
  # `columns` — presumably the decoder's output ordering; confirm.
  expected = [
      pa.Table.from_arrays([
          pa.array([[1]], pa.list_(pa.int64())),
          pa.array([[22.34]], pa.list_(pa.float32())),
          pa.array([[u'שקרכלשהו'.encode('utf-8')]], pa.list_(pa.binary())),
          pa.array([[b'text field']], pa.list_(pa.binary())),
      ], [
          'int_feature', 'float_feature', 'unicode_feature', 'str_feature'
      ])
  ]
  with beam.Pipeline() as p:
    decoded = (
        p
        | beam.Create(raw_records)
        | csv_decoder.DecodeCSV(column_names=columns))
    util.assert_that(
        decoded, test_util.make_arrow_tables_equal_fn(self, expected))
"values array and parent indices array must be of the same length"), dict( testcase_name="num_parents_too_small", num_parents=1, parent_indices=np.array([1], dtype=np.int64), values=pa.array([1 ]), expected_error=RuntimeError, expected_error_regexp="Found a parent index 1 while num_parents was 1") ] _MAKE_LIST_ARRAY_TEST_CASES = [ dict(testcase_name="parents_are_all_empty", num_parents=5, parent_indices=np.array([], dtype=np.int64), values=pa.array([], type=pa.int64()), expected=pa.array([None, None, None, None, None], type=pa.list_(pa.int64()))), dict(testcase_name="long_num_parent", num_parents=(long(1) if six.PY2 else 1), parent_indices=np.array([0], dtype=np.int64), values=pa.array([1]), expected=pa.array([[1]])), dict( testcase_name="leading nones", num_parents=3, parent_indices=np.array([2], dtype=np.int64), values=pa.array([1]), expected=pa.array([None, None, [1]]), ), dict(testcase_name="same_parent_and_holes",
value { float_list { value: [ 4.0 ] } } } feature { key: "float_feature_2" value { float_list { value: [ 5.0, 6.0 ] } } } feature { key: "str_feature_1" value { bytes_list { value: [ 'female' ] } } } feature { key: "str_feature_2" value { bytes_list { value: [ 'string', 'list' ] } } } } ''', 'decoded_table': pa.Table.from_arrays([ pa.array([[0]], pa.list_(pa.int64())), pa.array([[1, 2, 3]], pa.list_(pa.int64())), pa.array([[4.0]], pa.list_(pa.float32())), pa.array([[5.0, 6.0]], pa.list_(pa.float32())), pa.array([[b'female']], pa.list_(pa.binary())), pa.array([[b'string', b'list']], pa.list_(pa.binary())) ], [ 'int_feature_1', 'int_feature_2', 'float_feature_1', 'float_feature_2', 'str_feature_1', 'str_feature_2' ]) }, ]
def test_batch_serialized_examples(self):
  """Batches serialized tf.Examples into Arrow tables of the desired size.

  Five examples with desired_batch_size=2 should yield three tables
  (2 + 2 + 1 rows); a feature absent from an example in a batch appears
  as a null entry in that batch's column.
  """
  examples = [
      """
        features {
          feature {
            key: "a"
            value { float_list { value: [ 1.0, 2.0 ] } }
          }
          feature {
            key: "b"
            value { bytes_list { value: [ 'a', 'b', 'c', 'e' ] } }
          }
        }""",
      """
        features {
          feature {
            key: "a"
            value { float_list { value: [ 3.0, 4.0, 5.0 ] } }
          }
        }""",
      """
        features {
          feature {
            key: "b"
            value { bytes_list { value: [ 'd', 'e', 'f' ] } }
          }
          feature {
            key: "d"
            value { int64_list { value: [ 10, 20, 30 ] } }
          }
        }""",
      """
        features {
          feature {
            key: "b"
            value { bytes_list { value: [ 'a', 'b', 'c' ] } }
          }
        }""",
      """
        features {
          feature {
            key: "c"
            value { bytes_list { value: [ 'd', 'e', 'f' ] } }
          }
        }""",
  ]
  serialized_examples = [
      text_format.Merge(example_pbtxt, tf.train.Example()).SerializeToString()
      for example_pbtxt in examples
  ]
  expected_tables = [
      # Batch 1: examples 0 and 1; example 1 lacks feature 'b' -> null.
      pa.Table.from_arrays([
          pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]],
                   type=pa.list_(pa.float32())),
          pa.array([['a', 'b', 'c', 'e'], None], type=pa.list_(pa.binary()))
      ], ['a', 'b']),
      # Batch 2: examples 2 and 3; example 3 lacks feature 'd' -> null.
      pa.Table.from_arrays([
          pa.array([['d', 'e', 'f'], ['a', 'b', 'c']],
                   type=pa.list_(pa.binary())),
          pa.array([[10, 20, 30], None], type=pa.list_(pa.int64()))
      ], ['b', 'd']),
      # Batch 3: the single remaining example.
      pa.Table.from_arrays(
          [pa.array([['d', 'e', 'f']], type=pa.list_(pa.binary()))],
          ['c']),
  ]
  with beam.Pipeline() as p:
    result = (
        p
        | beam.Create(serialized_examples)
        | batch_util.BatchSerializedExamplesToArrowTables(
            desired_batch_size=2))
    util.assert_that(
        result, test_util.make_arrow_tables_equal_fn(self, expected_tables))
expected_error_regexp="Expected dict key of type str or bytes"), dict(testcase_name="unsupported_ndarray_type", test_input=[{ "a": np.array([1j, 2j, 3j], dtype=np.complex64) }], expected_error=RuntimeError, expected_error_regexp="Unsupported numpy type"), ] _CONVERSION_TEST_CASES = [ dict(testcase_name="unicode_feature_name", input_examples=[{ u"\U0001f951": np.array([1, 2, 3], dtype=np.int64), }], expected_output={ u"\U0001f951": pa.array([[1, 2, 3]], type=pa.list_(pa.int64())), }), dict(testcase_name="supported_ndarray_types", input_examples=[ { "int64_feature": np.array([1, 2, 3], dtype=np.int64), "uint64_feature": np.array([1, 2, 3], dtype=np.uint64), "int32_feature": np.array([1, 2, 3], dtype=np.int32), "uint32_feature": np.array([1, 2, 3], dtype=np.uint32), "float_feature": np.array([1.], dtype=np.float32), "double_feature": np.array([1.], dtype=np.float64), "bytes_feature": np.array([b"abc", b"def"], dtype=np.object), "unicode_feature": np.array([u"abc", u"def"], dtype=np.object), }, {
def test_basic_stats_generator_handle_null_column(self):
  """Null-typed columns are handled whether they precede or follow real data.

  Across the two input batches, feature 'a' is null-typed in the first batch
  and has values in the second; feature 'b' is the reverse.
  """
  # Feature 'a' covers null coming before non-null.
  # Feature 'b' covers null coming after non-null.
  b1 = pa.Table.from_arrays([
      pa.array([None, None, None], type=pa.null()),
      pa.array([[1.0, 2.0, 3.0], [4.0], [5.0]]),
  ], ['a', 'b'])
  b2 = pa.Table.from_arrays([
      pa.array([[1, 2], None], type=pa.list_(pa.int64())),
      pa.array([None, None], type=pa.null()),
  ], ['a', 'b'])
  batches = [b1, b2]
  expected_result = {
      types.FeaturePath(['a']):
          text_format.Parse(
              """
              path { step: "a" }
              num_stats {
                common_stats {
                  num_non_missing: 1
                  min_num_values: 2
                  max_num_values: 2
                  avg_num_values: 2.0
                  num_values_histogram {
                    buckets {
                      low_value: 2.0 high_value: 2.0 sample_count: 0.25 }
                    buckets {
                      low_value: 2.0 high_value: 2.0 sample_count: 0.25 }
                    buckets {
                      low_value: 2.0 high_value: 2.0 sample_count: 0.25 }
                    buckets {
                      low_value: 2.0 high_value: 2.0 sample_count: 0.25 }
                    type: QUANTILES
                  }
                  tot_num_values: 2
                }
                mean: 1.5
                std_dev: 0.5
                min: 1.0
                median: 2.0
                max: 2.0
                histograms {
                  buckets {
                    low_value: 1.0
                    high_value: 1.3333333
                    sample_count: 0.9955556
                  }
                  buckets {
                    low_value: 1.3333333
                    high_value: 1.6666667
                    sample_count: 0.0022222
                  }
                  buckets {
                    low_value: 1.6666667
                    high_value: 2.0
                    sample_count: 1.0022222
                  }
                }
                histograms {
                  buckets {
                    low_value: 1.0 high_value: 1.0 sample_count: 0.5 }
                  buckets {
                    low_value: 1.0 high_value: 2.0 sample_count: 0.5 }
                  buckets {
                    low_value: 2.0 high_value: 2.0 sample_count: 0.5 }
                  buckets {
                    low_value: 2.0 high_value: 2.0 sample_count: 0.5 }
                  type: QUANTILES
                }
              }
              """, statistics_pb2.FeatureNameStatistics()),
      types.FeaturePath(['b']):
          text_format.Parse(
              """
              path { step: 'b' }
              type: FLOAT
              num_stats {
                common_stats {
                  num_non_missing: 3
                  min_num_values: 1
                  max_num_values: 3
                  avg_num_values: 1.66666698456
                  num_values_histogram {
                    buckets {
                      low_value: 1.0 high_value: 1.0 sample_count: 0.75 }
                    buckets {
                      low_value: 1.0 high_value: 1.0 sample_count: 0.75 }
                    buckets {
                      low_value: 1.0 high_value: 3.0 sample_count: 0.75 }
                    buckets {
                      low_value: 3.0 high_value: 3.0 sample_count: 0.75 }
                    type: QUANTILES
                  }
                  tot_num_values: 5
                }
                mean: 3.0
                std_dev: 1.4142136
                min: 1.0
                median: 3.0
                max: 5.0
                histograms {
                  buckets {
                    low_value: 1.0
                    high_value: 2.3333333
                    sample_count: 1.9888889
                  }
                  buckets {
                    low_value: 2.3333333
                    high_value: 3.6666667
                    sample_count: 1.0055556
                  }
                  buckets {
                    low_value: 3.6666667
                    high_value: 5.0
                    sample_count: 2.0055556
                  }
                }
                histograms {
                  buckets {
                    low_value: 1.0 high_value: 2.0 sample_count: 1.25 }
                  buckets {
                    low_value: 2.0 high_value: 3.0 sample_count: 1.25 }
                  buckets {
                    low_value: 3.0 high_value: 4.0 sample_count: 1.25 }
                  buckets {
                    low_value: 4.0 high_value: 5.0 sample_count: 1.25 }
                  type: QUANTILES
                }
              }
              """, statistics_pb2.FeatureNameStatistics()),
  }
  generator = basic_stats_generator.BasicStatsGenerator(
      num_values_histogram_buckets=4,
      num_histogram_buckets=3,
      num_quantiles_histogram_buckets=4)
  self.assertCombinerOutputEqual(batches, generator, expected_result)