    def test_csv_decoder_consider_blank_line_single_column(self):
        input_lines = ['', '1']
        column_names = ['int_feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([None, [1]], pa.list_(pa.int64())),
            ], ['int_feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines, reshuffle=False)
                      | csv_decoder.DecodeCSV(column_names=column_names,
                                              skip_blank_lines=False))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
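For reference, the expected encoding above can be reproduced with pyarrow alone: the blank input line surfaces as a null element and the parsed "1" becomes a single-element list. A minimal standalone sketch:

import pyarrow as pa

# The blank CSV line maps to a null row; the parsed '1' maps to the list [1].
col = pa.array([None, [1]], type=pa.list_(pa.int64()))
table = pa.Table.from_arrays([col], ['int_feature'])
print(table.column('int_feature').to_pylist())  # [None, [1]]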
Example #2
    def test_csv_decoder_with_tab_delimiter(self):
        input_lines = ['1\t"this is a \ttext"', '5\t']
        column_names = ['int_feature', 'str_feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[1], [5]], pa.list_(pa.int64())),
                pa.array([[b'this is a \ttext'], None], pa.list_(pa.binary())),
            ], ['int_feature', 'str_feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines) | csv_decoder.DecodeCSV(
                column_names=column_names, delimiter='\t'))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
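The quoting behavior this test exercises can be reproduced with the standard csv module (shown only as a sketch of csv-style parsing, not necessarily the decoder's implementation): the tab inside the quoted field is preserved, while the unquoted tab splits the columns.

import csv

line = '1\t"this is a \ttext"'
row = next(csv.reader([line], delimiter='\t'))
assert row == ['1', 'this is a \ttext']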
Example #3
    def test_csv_decoder_missing_values(self):
        input_lines = ['1,,hello', ',12.34,']
        column_names = ['int_feature', 'float_feature', 'str_feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[1], None], pa.list_(pa.int64())),
                pa.array([None, [12.34]], pa.list_(pa.float32())),
                pa.array([[b'hello'], None], pa.list_(pa.binary())),
            ], ['int_feature', 'float_feature', 'str_feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #4
    def test_csv_decoder_with_space_delimiter(self):
        input_lines = ['1 "ab,cd,ef"', '5 "wx,xy,yz"']
        column_names = ['int_feature', 'str_feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[1], [5]], pa.list_(pa.int64())),
                pa.array([[b'ab,cd,ef'], [b'wx,xy,yz']], pa.list_(
                    pa.binary())),
            ], ['int_feature', 'str_feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines) | csv_decoder.DecodeCSV(
                column_names=column_names, delimiter=' '))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
    def test_csv_decoder(self):
        input_lines = ['1,2.0,hello', '5,12.34,world']
        column_names = ['int_feature', 'float_feature', 'str_feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[1], [5]], pa.list_(pa.int64())),
                pa.array([[2.0], [12.34]], pa.list_(pa.float32())),
                pa.array([[b'hello'], [b'world']], pa.list_(pa.binary())),
            ], ['int_feature', 'float_feature', 'str_feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines, reshuffle=False)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #6
 def _process_column_infos(self, column_infos: List[csv_decoder.ColumnInfo]):
   column_handlers = []
   column_arrow_types = []
   for c in column_infos:
     if c.type == statistics_pb2.FeatureNameStatistics.INT:
       column_handlers.append(lambda v: (int(v),))
       column_arrow_types.append(pa.list_(pa.int64()))
     elif c.type == statistics_pb2.FeatureNameStatistics.FLOAT:
       column_handlers.append(lambda v: (float(v),))
       column_arrow_types.append(pa.list_(pa.float32()))
     elif c.type == statistics_pb2.FeatureNameStatistics.STRING:
       column_handlers.append(lambda v: (v,))
       column_arrow_types.append(pa.list_(pa.binary()))
     else:
       column_handlers.append(lambda _: None)
       column_arrow_types.append(pa.null())
   self._column_handlers = column_handlers
   self._column_arrow_types = column_arrow_types
   self._column_names = [c.name for c in column_infos]
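A hypothetical illustration of how these handlers would be applied: each handler wraps its cell in a 1-tuple, so the resulting column holds one single-element list per row, matching the pa.list_ types collected alongside. The row-application loop below is an assumption for illustration, not the decoder's actual code.

import pyarrow as pa

handlers = [lambda v: (int(v),), lambda v: (float(v),)]
arrow_types = [pa.list_(pa.int64()), pa.list_(pa.float32())]
rows = [['1', '2.5'], ['3', '4.5']]

columns = [
    # Apply the i-th handler to the i-th cell of every row.
    pa.array([handler(row[i]) for row in rows], type=arrow_type)
    for i, (handler, arrow_type) in enumerate(zip(handlers, arrow_types))
]
table = pa.Table.from_arrays(columns, ['int_feature', 'float_feature'])
assert table.column('int_feature').to_pylist() == [[1], [3]]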
Example #7
    def test_csv_decoder_with_unicode(self):
        input_lines = [u'1,שקרכלשהו,22.34,text field']
        column_names = [
            'int_feature', 'unicode_feature', 'float_feature', 'str_feature'
        ]
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[1]], pa.list_(pa.int64())),
                pa.array([[22.34]], pa.list_(pa.float32())),
                pa.array([[u'שקרכלשהו'.encode('utf-8')]], pa.list_(
                    pa.binary())),
                pa.array([[b'text field']], pa.list_(pa.binary())),
            ], [
                'int_feature', 'float_feature', 'unicode_feature',
                'str_feature'
            ])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
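String columns are decoded as binary, so the unicode value is expected as its UTF-8 encoding; a quick pyarrow-only check:

import pyarrow as pa

value = u'שקרכלשהו'.encode('utf-8')
arr = pa.array([[value]], type=pa.list_(pa.binary()))
assert arr.to_pylist() == [[value]]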
         "values array and parent indices array must be of the same length"),
    dict(
        testcase_name="num_parents_too_small",
        num_parents=1,
        parent_indices=np.array([1], dtype=np.int64),
        values=pa.array([1]),
        expected_error=RuntimeError,
        expected_error_regexp="Found a parent index 1 while num_parents was 1")
]

_MAKE_LIST_ARRAY_TEST_CASES = [
    dict(testcase_name="parents_are_all_empty",
         num_parents=5,
         parent_indices=np.array([], dtype=np.int64),
         values=pa.array([], type=pa.int64()),
         expected=pa.array([None, None, None, None, None],
                           type=pa.list_(pa.int64()))),
    dict(testcase_name="long_num_parent",
         num_parents=(long(1) if six.PY2 else 1),
         parent_indices=np.array([0], dtype=np.int64),
         values=pa.array([1]),
         expected=pa.array([[1]])),
    dict(
        testcase_name="leading nones",
        num_parents=3,
        parent_indices=np.array([2], dtype=np.int64),
        values=pa.array([1]),
        expected=pa.array([None, None, [1]]),
    ),
    dict(testcase_name="same_parent_and_holes",
Example #9
              value { float_list { value: [ 4.0 ] } }
            }
            feature {
              key: "float_feature_2"
              value { float_list { value: [ 5.0, 6.0 ] } }
            }
            feature {
              key: "str_feature_1"
              value { bytes_list { value: [ 'female' ] } }
            }
            feature {
              key: "str_feature_2"
              value { bytes_list { value: [ 'string', 'list' ] } }
            }
          }
        ''',
        'decoded_table':
        pa.Table.from_arrays([
            pa.array([[0]], pa.list_(pa.int64())),
            pa.array([[1, 2, 3]], pa.list_(pa.int64())),
            pa.array([[4.0]], pa.list_(pa.float32())),
            pa.array([[5.0, 6.0]], pa.list_(pa.float32())),
            pa.array([[b'female']], pa.list_(pa.binary())),
            pa.array([[b'string', b'list']], pa.list_(pa.binary()))
        ], [
            'int_feature_1', 'int_feature_2', 'float_feature_1',
            'float_feature_2', 'str_feature_1', 'str_feature_2'
        ])
    },
]
Example #10
    def test_batch_serialized_examples(self):
        examples = [
            """
        features {
          feature {
            key: "a"
            value { float_list { value: [ 1.0, 2.0 ] } }
          }
          feature {
            key: "b"
            value { bytes_list { value: [ 'a', 'b', 'c', 'e' ] } }
          }
        }""",
            """
        features {
          feature {
            key: "a"
            value { float_list { value: [ 3.0, 4.0, 5.0 ] } }
          }
        }""",
            """
        features {
          feature {
            key: "b"
            value { bytes_list { value: [ 'd', 'e', 'f' ] } }
          }
          feature {
            key: "d"
            value { int64_list { value: [ 10, 20, 30 ] } }
          }
        }""",
            """
        features {
          feature {
            key: "b"
            value { bytes_list { value: [ 'a', 'b', 'c' ] } }
          }
        }""",
            """
        features {
          feature {
            key: "c"
            value { bytes_list { value: [ 'd', 'e', 'f' ] } }
          }
        }""",
        ]
        serialized_examples = [
            text_format.Merge(example_pbtxt,
                              tf.train.Example()).SerializeToString()
            for example_pbtxt in examples
        ]
        expected_tables = [
            pa.Table.from_arrays([
                pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]],
                         type=pa.list_(pa.float32())),
                pa.array([[b'a', b'b', b'c', b'e'], None],
                         type=pa.list_(pa.binary()))
            ], ['a', 'b']),
            pa.Table.from_arrays([
                pa.array([[b'd', b'e', b'f'], [b'a', b'b', b'c']],
                         type=pa.list_(pa.binary())),
                pa.array([[10, 20, 30], None], type=pa.list_(pa.int64()))
            ], ['b', 'd']),
            pa.Table.from_arrays(
                [pa.array([[b'd', b'e', b'f']], type=pa.list_(pa.binary()))],
                ['c']),
        ]

        with beam.Pipeline() as p:
            result = (p
                      | beam.Create(serialized_examples)
                      | batch_util.BatchSerializedExamplesToArrowTables(
                          desired_batch_size=2))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
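The shape of the expected tables follows from the batching arithmetic alone: five serialized examples with desired_batch_size=2 yield batches of 2, 2, and 1 rows, hence three expected tables. A minimal sketch of that arithmetic:

def batch_sizes(num_examples, desired_batch_size):
    # Full batches first, then one trailing partial batch if anything remains.
    full, remainder = divmod(num_examples, desired_batch_size)
    return [desired_batch_size] * full + ([remainder] if remainder else [])

assert batch_sizes(5, 2) == [2, 2, 1]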
Example #11
         expected_error_regexp="Expected dict key of type str or bytes"),
    dict(testcase_name="unsupported_ndarray_type",
         test_input=[{
             "a": np.array([1j, 2j, 3j], dtype=np.complex64)
         }],
         expected_error=RuntimeError,
         expected_error_regexp="Unsupported numpy type"),
]

_CONVERSION_TEST_CASES = [
    dict(testcase_name="unicode_feature_name",
         input_examples=[{
             u"\U0001f951": np.array([1, 2, 3], dtype=np.int64),
         }],
         expected_output={
             u"\U0001f951": pa.array([[1, 2, 3]], type=pa.list_(pa.int64())),
         }),
    dict(testcase_name="supported_ndarray_types",
         input_examples=[
             {
                 "int64_feature": np.array([1, 2, 3], dtype=np.int64),
                 "uint64_feature": np.array([1, 2, 3], dtype=np.uint64),
                 "int32_feature": np.array([1, 2, 3], dtype=np.int32),
                 "uint32_feature": np.array([1, 2, 3], dtype=np.uint32),
                 "float_feature": np.array([1.], dtype=np.float32),
                 "double_feature": np.array([1.], dtype=np.float64),
                 "bytes_feature": np.array([b"abc", b"def"], dtype=np.object),
                 "unicode_feature": np.array([u"abc", u"def"],
                                             dtype=np.object),
             },
             {
 def test_basic_stats_generator_handle_null_column(self):
     # Feature 'a' covers null coming before non-null.
     # Feature 'b' covers null coming after non-null.
     b1 = pa.Table.from_arrays([
         pa.array([None, None, None], type=pa.null()),
         pa.array([[1.0, 2.0, 3.0], [4.0], [5.0]]),
     ], ['a', 'b'])
     b2 = pa.Table.from_arrays([
         pa.array([[1, 2], None], type=pa.list_(pa.int64())),
         pa.array([None, None], type=pa.null()),
     ], ['a', 'b'])
     batches = [b1, b2]
     expected_result = {
         types.FeaturePath(['a']):
         text_format.Parse(
             """
         path {
           step: "a"
         }
         num_stats {
           common_stats {
             num_non_missing: 1
             min_num_values: 2
             max_num_values: 2
             avg_num_values: 2.0
             num_values_histogram {
               buckets {
                 low_value: 2.0
                 high_value: 2.0
                 sample_count: 0.25
               }
               buckets {
                 low_value: 2.0
                 high_value: 2.0
                 sample_count: 0.25
               }
               buckets {
                 low_value: 2.0
                 high_value: 2.0
                 sample_count: 0.25
               }
               buckets {
                 low_value: 2.0
                 high_value: 2.0
                 sample_count: 0.25
               }
               type: QUANTILES
             }
             tot_num_values: 2
           }
           mean: 1.5
           std_dev: 0.5
           min: 1.0
           median: 2.0
           max: 2.0
           histograms {
             buckets {
               low_value: 1.0
               high_value: 1.3333333
               sample_count: 0.9955556
             }
             buckets {
               low_value: 1.3333333
               high_value: 1.6666667
               sample_count: 0.0022222
             }
             buckets {
               low_value: 1.6666667
               high_value: 2.0
               sample_count: 1.0022222
             }
           }
           histograms {
             buckets {
               low_value: 1.0
               high_value: 1.0
               sample_count: 0.5
             }
             buckets {
               low_value: 1.0
               high_value: 2.0
               sample_count: 0.5
             }
             buckets {
               low_value: 2.0
               high_value: 2.0
               sample_count: 0.5
             }
             buckets {
               low_value: 2.0
               high_value: 2.0
               sample_count: 0.5
             }
             type: QUANTILES
           }
         }
         """, statistics_pb2.FeatureNameStatistics()),
         types.FeaturePath(['b']):
         text_format.Parse(
             """
         path {
           step: 'b'
         }
         type: FLOAT
         num_stats {
           common_stats {
             num_non_missing: 3
             min_num_values: 1
             max_num_values: 3
             avg_num_values: 1.66666698456
             num_values_histogram {
               buckets {
                 low_value: 1.0
                 high_value: 1.0
                 sample_count: 0.75
               }
               buckets {
                 low_value: 1.0
                 high_value: 1.0
                 sample_count: 0.75
               }
               buckets {
                 low_value: 1.0
                 high_value: 3.0
                 sample_count: 0.75
               }
               buckets {
                 low_value: 3.0
                 high_value: 3.0
                 sample_count: 0.75
               }
               type: QUANTILES
             }
             tot_num_values: 5
           }
           mean: 3.0
           std_dev: 1.4142136
           min: 1.0
           median: 3.0
           max: 5.0
           histograms {
             buckets {
               low_value: 1.0
               high_value: 2.3333333
               sample_count: 1.9888889
             }
             buckets {
               low_value: 2.3333333
               high_value: 3.6666667
               sample_count: 1.0055556
             }
             buckets {
               low_value: 3.6666667
               high_value: 5.0
               sample_count: 2.0055556
             }
           }
           histograms {
             buckets {
               low_value: 1.0
               high_value: 2.0
               sample_count: 1.25
             }
             buckets {
               low_value: 2.0
               high_value: 3.0
               sample_count: 1.25
             }
             buckets {
               low_value: 3.0
               high_value: 4.0
               sample_count: 1.25
             }
             buckets {
               low_value: 4.0
               high_value: 5.0
               sample_count: 1.25
             }
             type: QUANTILES
           }
         }
         """, statistics_pb2.FeatureNameStatistics()),
     }
     generator = basic_stats_generator.BasicStatsGenerator(
         num_values_histogram_buckets=4,
         num_histogram_buckets=3,
         num_quantiles_histogram_buckets=4)
     self.assertCombinerOutputEqual(batches, generator, expected_result)
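The headline numeric stats can be verified by hand with numpy: feature 'a' has a single non-null row with values [1, 2] (mean 1.5, population std dev 0.5), and feature 'b' contributes [1, 2, 3, 4, 5] across its three non-null rows (mean 3.0, std dev sqrt(2)):

import numpy as np

a_values = np.array([1.0, 2.0])  # the single non-null row of feature 'a'
assert a_values.mean() == 1.5
assert np.std(a_values) == 0.5  # population std dev, as reported above

b_values = np.array([1.0, 2.0, 3.0, 4.0, 5.0])  # all values of feature 'b'
assert b_values.mean() == 3.0
assert np.isclose(np.std(b_values), 1.4142136)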