def test_csv_decoder_with_schema(self):
    """Decode CSV rows with column types taken from a supplied schema.

    The schema declares the first column as FLOAT although its values are
    written as integers in the input, so the expected output for that
    column is a float32 array. DecodeCSV is invoked with
    ``infer_type_from_schema=True``.
    """
    input_lines = ['1,1,2.0,hello', '5,5,12.34,world']
    column_names = [
        'int_feature_parsed_as_float',
        'int_feature',
        'float_feature',
        'str_feature',
    ]
    schema = text_format.Parse(
        """ feature { name: "int_feature_parsed_as_float" type: FLOAT } feature { name: "int_feature" type: INT } feature { name: "float_feature" type: FLOAT } feature { name: "str_feature" type: BYTES } """,
        schema_pb2.Schema())
    expected_result = [
        {
            'int_feature_parsed_as_float': np.array([1], dtype=np.float32),
            'int_feature': np.array([1], dtype=np.int64),
            'float_feature': np.array([2.0], dtype=np.float32),
            'str_feature': np.array([b'hello'], dtype=object),
        },
        {
            'int_feature_parsed_as_float': np.array([5], dtype=np.float32),
            'int_feature': np.array([5], dtype=np.int64),
            'float_feature': np.array([12.34], dtype=np.float32),
            'str_feature': np.array([b'world'], dtype=object),
        },
    ]

    with beam.Pipeline() as p:
        result = (
            p
            | beam.Create(input_lines)
            | csv_decoder.DecodeCSV(
                column_names=column_names,
                schema=schema,
                infer_type_from_schema=True))
        util.assert_that(
            result,
            test_util.make_example_dict_equal_fn(self, expected_result))
def test_decode_example_with_beam_pipeline(self):
    """Decode one serialized tf.train.Example through a Beam pipeline.

    The example carries single- and multi-valued int64, float and bytes
    features; the decoded output is compared against one dict of NumPy
    arrays keyed by feature name.
    """
    example_proto_text = """ features { feature { key: "int_feature_1" value { int64_list { value: [ 0 ] } } } feature { key: "int_feature_2" value { int64_list { value: [ 1, 2, 3 ] } } } feature { key: "float_feature_1" value { float_list { value: [ 4.0 ] } } } feature { key: "float_feature_2" value { float_list { value: [ 5.0, 6.0 ] } } } feature { key: "str_feature_1" value { bytes_list { value: [ 'female' ] } } } feature { key: "str_feature_2" value { bytes_list { value: [ 'string', 'list' ] } } } } """
    expected_decoded = {
        'int_feature_1': np.array([0], dtype=np.int64),
        'int_feature_2': np.array([1, 2, 3], dtype=np.int64),
        'float_feature_1': np.array([4.0], dtype=np.float64),
        'float_feature_2': np.array([5.0, 6.0], dtype=np.float64),
        'str_feature_1': np.array([b'female'], dtype=object),
        'str_feature_2': np.array([b'string', b'list'], dtype=object),
    }

    # Build the proto from its text form, then feed its wire format to the
    # decoder, which is what DecodeTFExample receives in this pipeline.
    example = tf.train.Example()
    text_format.Merge(example_proto_text, example)

    with beam.Pipeline() as p:
        result = (
            p
            | beam.Create([example.SerializeToString()])
            | tf_example_decoder.DecodeTFExample())
        util.assert_that(
            result,
            test_util.make_example_dict_equal_fn(self, [expected_decoded]))
def test_csv_decoder_empty_csv(self):
    """Run DecodeCSV over no input lines with no column names.

    The decoded collection is compared against an empty expected list.
    """
    with beam.Pipeline() as pipeline:
        no_lines = pipeline | beam.Create([])
        decoded = no_lines | csv_decoder.DecodeCSV(column_names=[])
        matcher = test_util.make_example_dict_equal_fn(self, [])
        util.assert_that(decoded, matcher)
def test_decode_example_with_beam_pipeline(self, example_proto_text,
                                           decoded_example):
    """Decode a text-format tf.train.Example and compare with the expectation.

    Args:
      example_proto_text: Text-format representation of a tf.train.Example.
      decoded_example: The value the decoder output is compared against; it
        is wrapped in a one-element list before comparison.
    """
    proto = tf.train.Example()
    text_format.Merge(example_proto_text, proto)

    with beam.Pipeline() as pipeline:
        serialized = pipeline | beam.Create([proto.SerializeToString()])
        decoded = serialized | tf_example_decoder.DecodeTFExample()
        matcher = test_util.make_example_dict_equal_fn(
            self, [decoded_example])
        util.assert_that(decoded, matcher)
def test_csv_decoder_invalid_row(self):
    """Expect a ValueError when a row has fewer fields than column names.

    The second input line has two fields while three column names are
    given. The pipeline context is nested inside the assertion context so
    that an error raised while the pipeline is active is seen by the
    assertion.
    """
    input_lines = ['1,2.0,hello', '5,12.34']
    column_names = ['int_feature', 'float_feature', 'str_feature']

    with self.assertRaisesRegex(
            ValueError, '.*Columns do not match specified csv headers.*'):
        with beam.Pipeline() as p:
            result = (
                p
                | beam.Create(input_lines)
                | csv_decoder.DecodeCSV(column_names=column_names))
            # No output is expected; the matcher is built with None.
            util.assert_that(
                result, test_util.make_example_dict_equal_fn(self, None))
def test_csv_decoder_skip_blank_line_single_column(self):
    """Decode a single-column CSV whose first line is blank.

    DecodeCSV is called without ``skip_blank_lines``, and only one decoded
    row (for the line '1') is expected, i.e. the blank line yields no
    output.
    """
    input_lines = ['', '1']
    column_names = ['int_feature']
    expected_result = [{'int_feature': np.array([1], dtype=np.int64)}]

    with beam.Pipeline() as p:
        result = (
            p
            | beam.Create(input_lines)
            | csv_decoder.DecodeCSV(column_names=column_names))
        util.assert_that(
            result,
            test_util.make_example_dict_equal_fn(self, expected_result))
def test_csv_decoder_int64_max(self):
    """Decode a column containing 34 and sys.maxsize.

    Both rows are expected back as one-element int64 arrays under the
    'feature' key.
    """
    values = (34, sys.maxsize)
    rows = [str(value) for value in values]
    want = [{'feature': np.array([value], dtype=np.int64)}
            for value in values]

    with beam.Pipeline() as pipeline:
        lines = pipeline | beam.Create(rows)
        decoded = lines | csv_decoder.DecodeCSV(column_names=['feature'])
        matcher = test_util.make_example_dict_equal_fn(self, want)
        util.assert_that(decoded, matcher)
def test_csv_decoder_negative_values(self):
    """Decode a column containing -34 and 45.

    Both rows are expected back as one-element int64 arrays under the
    'feature' key.
    """
    values = (-34, 45)
    rows = [str(value) for value in values]
    want = [{'feature': np.array([value], dtype=np.int64)}
            for value in values]

    with beam.Pipeline() as pipeline:
        lines = pipeline | beam.Create(rows)
        decoded = lines | csv_decoder.DecodeCSV(column_names=['feature'])
        matcher = test_util.make_example_dict_equal_fn(self, want)
        util.assert_that(decoded, matcher)
def test_csv_decoder_consider_blank_line_single_column(self):
    """Decode a single-column CSV with ``skip_blank_lines=False``.

    The blank first line is expected to produce a row whose feature value
    is None, followed by the row decoded from the line '1'.
    """
    input_lines = ['', '1']
    column_names = ['float_feature']
    expected_result = [
        {'float_feature': None},
        {'float_feature': np.array([1.0], dtype=np.float64)},
    ]

    with beam.Pipeline() as p:
        result = (
            p
            | beam.Create(input_lines)
            | csv_decoder.DecodeCSV(
                column_names=column_names, skip_blank_lines=False))
        util.assert_that(
            result,
            test_util.make_example_dict_equal_fn(self, expected_result))
def test_csv_decoder_with_int_and_float_in_same_column(self):
    """Decode two columns that each mix an integer and a float literal.

    Every value in both columns is expected back as a one-element float32
    array.
    """
    def as_float32(value):
        # One-element float32 array, the shape used for each expected cell.
        return np.array([value], dtype=np.float32)

    rows = ['2,1.5', '1.5,2']
    names = ['float_feature1', 'float_feature2']
    want = [
        {'float_feature1': as_float32(2.0),
         'float_feature2': as_float32(1.5)},
        {'float_feature1': as_float32(1.5),
         'float_feature2': as_float32(2.0)},
    ]

    with beam.Pipeline() as pipeline:
        lines = pipeline | beam.Create(rows)
        decoded = lines | csv_decoder.DecodeCSV(column_names=names)
        matcher = test_util.make_example_dict_equal_fn(self, want)
        util.assert_that(decoded, matcher)
def test_csv_decoder_large_int_categorical_pos(self):
    """Decode a column holding 34 and ``sys.maxsize + 1``.

    Both rows are expected back as UTF-8 encoded byte strings in object
    arrays rather than as integers.
    """
    input_lines = ['34', str(sys.maxsize + 1)]
    column_names = ['feature']
    expected_result = [
        {'feature': np.array([b'34'], dtype=object)},
        {'feature': np.array(
            [str(sys.maxsize + 1).encode('utf-8')], dtype=object)},
    ]

    with beam.Pipeline() as p:
        result = (
            p
            | beam.Create(input_lines)
            | csv_decoder.DecodeCSV(column_names=column_names))
        util.assert_that(
            result,
            test_util.make_example_dict_equal_fn(self, expected_result))
def test_csv_decoder_with_float_and_string_in_same_column(self):
    """Decode two columns that each mix a float literal and a string.

    Every value, including '2.3', is expected back as a byte string in an
    object array.
    """
    input_lines = ['2.3,abc', 'abc,2.3']
    column_names = ['str_feature1', 'str_feature2']
    expected_result = [
        {
            'str_feature1': np.array([b'2.3'], dtype=object),
            'str_feature2': np.array([b'abc'], dtype=object),
        },
        {
            'str_feature1': np.array([b'abc'], dtype=object),
            'str_feature2': np.array([b'2.3'], dtype=object),
        },
    ]

    with beam.Pipeline() as p:
        result = (
            p
            | beam.Create(input_lines)
            | csv_decoder.DecodeCSV(column_names=column_names))
        util.assert_that(
            result,
            test_util.make_example_dict_equal_fn(self, expected_result))
def test_csv_decoder_csv_record_with_quotes(self):
    """Decode rows whose second field is quoted and contains commas.

    The quoted field is expected back as a single byte string with the
    embedded commas kept and the surrounding quotes removed.
    """
    input_lines = ['1,"ab,cd,ef"', '5,"wx,xy,yz"']
    column_names = ['int_feature', 'str_feature']
    expected_result = [
        {
            'int_feature': np.array([1], dtype=np.int64),
            'str_feature': np.array([b'ab,cd,ef'], dtype=object),
        },
        {
            'int_feature': np.array([5], dtype=np.int64),
            'str_feature': np.array([b'wx,xy,yz'], dtype=object),
        },
    ]

    with beam.Pipeline() as p:
        result = (
            p
            | beam.Create(input_lines)
            | csv_decoder.DecodeCSV(column_names=column_names))
        util.assert_that(
            result,
            test_util.make_example_dict_equal_fn(self, expected_result))
def test_csv_decoder_with_tab_delimiter(self):
    r"""Decode tab-delimited rows by passing ``delimiter='\t'``.

    The first row's quoted field contains an embedded tab that is expected
    to be kept in the decoded value; the second row's trailing empty field
    is expected to decode to None.
    """
    input_lines = ['1\t"this is a \ttext"', '5\t']
    column_names = ['int_feature', 'str_feature']
    expected_result = [
        {
            'int_feature': np.array([1], dtype=np.int64),
            'str_feature': np.array([b'this is a \ttext'], dtype=object),
        },
        {
            'int_feature': np.array([5], dtype=np.int64),
            'str_feature': None,
        },
    ]

    with beam.Pipeline() as p:
        result = (
            p
            | beam.Create(input_lines)
            | csv_decoder.DecodeCSV(
                column_names=column_names, delimiter='\t'))
        util.assert_that(
            result,
            test_util.make_example_dict_equal_fn(self, expected_result))
def test_csv_decoder_with_unicode(self):
    """Decode a row that contains a non-ASCII (Hebrew) text field.

    The unicode field is expected back as its UTF-8 encoded bytes in an
    object array; the other fields are expected as int64, float32 and
    bytes respectively.
    """
    input_lines = [u'1,שקרכלשהו,22.34,text field']
    column_names = [
        'int_feature',
        'unicode_feature',
        'float_feature',
        'str_feature',
    ]
    expected_result = [
        {
            'int_feature': np.array([1], dtype=np.int64),
            'unicode_feature': np.array(
                [u'שקרכלשהו'.encode('utf-8')], dtype=object),
            'float_feature': np.array([22.34], dtype=np.float32),
            'str_feature': np.array([b'text field'], dtype=object),
        },
    ]

    with beam.Pipeline() as p:
        result = (
            p
            | beam.Create(input_lines)
            | csv_decoder.DecodeCSV(column_names=column_names))
        util.assert_that(
            result,
            test_util.make_example_dict_equal_fn(self, expected_result))
def test_csv_decoder_missing_values(self):
    """Decode rows in which some fields are empty.

    Each empty field is expected to decode to None. The present value of
    'int_feature' is expected as a float32 array, not an integer array.
    """
    input_lines = ['1,,hello', ',12.34,']
    column_names = ['int_feature', 'float_feature', 'str_feature']
    expected_result = [
        {
            'int_feature': np.array([1.0], dtype=np.float32),
            'float_feature': None,
            'str_feature': np.array([b'hello'], dtype=object),
        },
        {
            'int_feature': None,
            'float_feature': np.array([12.34], dtype=np.float32),
            'str_feature': None,
        },
    ]

    with beam.Pipeline() as p:
        result = (
            p
            | beam.Create(input_lines)
            | csv_decoder.DecodeCSV(column_names=column_names))
        util.assert_that(
            result,
            test_util.make_example_dict_equal_fn(self, expected_result))
def test_csv_decoder_empty_row(self):
    """Decode a CSV whose first row consists only of empty fields.

    The all-empty row is expected to decode to None for every column. In
    the following row, 'int_feature' is expected as a floating-point
    array and the string field as a byte string.
    """
    input_lines = [',,', '1,2.0,hello']
    column_names = ['int_feature', 'float_feature', 'str_feature']
    expected_result = [
        {
            'int_feature': None,
            'float_feature': None,
            'str_feature': None,
        },
        {
            'int_feature': np.array([1.0], dtype=np.float64),
            'float_feature': np.array([2.0], dtype=np.float64),
            # Bytes, not str: the two are never equal on Python 3.
            'str_feature': np.array([b'hello'], dtype=object),
        },
    ]

    with beam.Pipeline() as p:
        result = (
            p
            | beam.Create(input_lines)
            | csv_decoder.DecodeCSV(column_names=column_names))
        util.assert_that(
            result,
            test_util.make_example_dict_equal_fn(self, expected_result))
def test_csv_decoder(self):
    """Decode two well-formed rows with int, float and string columns.

    Each decoded row is compared against a dict that maps every column
    name to a one-element NumPy array.
    """
    input_lines = ['1,2.0,hello', '5,12.34,world']
    column_names = ['int_feature', 'float_feature', 'str_feature']
    expected_result = [
        {
            'int_feature': np.array([1], dtype=np.int64),
            'float_feature': np.array([2.0], dtype=np.float64),
            'str_feature': np.array([b'hello'], dtype=object),
        },
        {
            'int_feature': np.array([5], dtype=np.int64),
            'float_feature': np.array([12.34], dtype=np.float64),
            'str_feature': np.array([b'world'], dtype=object),
        },
    ]

    with beam.Pipeline() as p:
        result = (
            p
            | beam.Create(input_lines)
            | csv_decoder.DecodeCSV(column_names=column_names))
        util.assert_that(
            result,
            test_util.make_example_dict_equal_fn(self, expected_result))