def test_decode_errors(self):
    """Check that malformed input makes decode() raise DecodeError."""
    feature_spec = {
        'b': tf.FixedLenFeature(shape=[], dtype=tf.float32),
        'a': tf.FixedLenFeature(shape=[], dtype=tf.string),
    }
    coder = csv_coder.CsvCoder(
        column_names=['a', 'b'],
        schema=dataset_schema.from_feature_spec(feature_spec))

    # Bad csv: the input is an int rather than a string.
    with self.assertRaisesRegexp(
            csv_coder.DecodeError,
            '\'int\' object has no attribute \'encode\': 123'):
        coder.decode(123)

    # Rows with an extra column, a missing column and no content at all are
    # all expected to report the same header mismatch.
    header_mismatch = 'Columns do not match specified csv headers'
    for bad_row in ('1,2,', 'a_value', ''):
        with self.assertRaisesRegexp(csv_coder.DecodeError, header_mismatch):
            coder.decode(bad_row)
def test_missing_data(self):
    """A row with an empty second field must fail, naming column "text1"."""
    row = '12,,female,1,89.0,12.0'
    decoder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)
    expected_error = 'expected a value on column "text1"'
    with self.assertRaisesRegexp(ValueError, expected_error):
        decoder.decode(row)
def test_bad_boolean_data(self):
    """Decoding this row must fail with the "True"/"False" input error."""
    row = '12,text value,categorical_value,1,89.0,12.0,0'
    decoder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)
    expected_error = 'expected "True" or "False" as inputs'
    with self.assertRaisesRegexp(ValueError, expected_error):
        decoder.decode(row)
def test_csv_coder_with_unicode(self):
    """Run the encode/decode check on a row containing non-ASCII text.

    The same CSV row is checked against two expectations: one built from
    plain Python values and one built from numpy arrays.
    """
    coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)
    data = u'12,"this is a ,text",Hello κόσμε,1,89.0,12.0,20'
    # The category is expected as UTF-8 encoded bytes.
    category_bytes = u'Hello κόσμε'.encode('utf-8')

    python_expected = {
        'category1': [category_bytes],
        'numeric1': 12,
        'numeric2': [89.0],
        'numeric3': [20],
        'text1': 'this is a ,text',
        'y': ([1], [12.0])
    }
    numpy_expected = {
        'category1': np.array([category_bytes]),
        'numeric1': np.array(12),
        'numeric2': np.array([89.0]),
        'numeric3': np.array([20]),
        'text1': np.array(['this is a ,text']),
        'y': (np.array(1), np.array([12.0]))
    }

    # Python types first, then numpy types.
    for expected_decoded in (python_expected, numpy_expected):
        self._assert_encode_decode(coder, data, expected_decoded)
def test_csv_coder(self):
    """Run the encode/decode check on a plain ASCII row.

    The same CSV row is checked against two expectations: one built from
    plain Python values and one built from numpy arrays.
    """
    coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)
    data = '12,"this is a ,text",categorical_value,1,89.0,12.0,20'

    python_expected = {
        'category1': ['categorical_value'],
        'numeric1': 12,
        'numeric2': [89.0],
        'numeric3': [20],
        'text1': 'this is a ,text',
        'y': ([1], [12.0])
    }
    numpy_expected = {
        'category1': np.array(['categorical_value']),
        'numeric1': np.array(12),
        'numeric2': np.array([89.0]),
        'numeric3': np.array([20]),
        'text1': np.array(['this is a ,text']),
        'y': (np.array(1), np.array([12.0]))
    }

    # Python types first, then numpy types.
    for expected_decoded in (python_expected, numpy_expected):
        self._assert_encode_decode(coder, data, expected_decoded)
def test_missing_data(self):
    """A row with an empty second field must fail, naming column 'text1'."""
    row = '12,,categorical_value,1,89.0,12.0,20'
    decoder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)
    expected_error = 'expected a value on column \'text1\''
    with self.assertRaisesRegexp(ValueError, expected_error):
        decoder.decode(row)
def test_missing_numeric_data(self):
    """A row with an empty first field must fail, naming column "numeric1"."""
    # The leading number is missing from this row.
    row = ',"this is a ,text",female,1,89.0,12.0'
    decoder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)
    expected_error = 'expected a value on column "numeric1"'
    with self.assertRaisesRegexp(ValueError, expected_error):
        decoder.decode(row)
def test_constructor_error(self, columns, feature_spec, error_msg,
                           error_type=ValueError, **kwargs):
    """Assert that building a CsvCoder fails with the expected error.

    Args:
      columns: Column names passed to the CsvCoder constructor.
      feature_spec: Feature spec from which the schema is built.
      error_msg: Regex the raised error message must match.
      error_type: Expected exception class (defaults to ValueError).
      **kwargs: Extra keyword arguments forwarded to the constructor.
    """
    bad_schema = dataset_schema.from_feature_spec(feature_spec)
    with self.assertRaisesRegexp(error_type, error_msg):
        csv_coder.CsvCoder(columns, bad_schema, **kwargs)
def testDecode(self):
    """Decode every encode/decode and decode-only case and check the result.

    Each case is a (csv_line, value, multivalent, feature_spec) tuple. The
    feature is always registered in the schema under the name 'x'; for a
    tf.SparseFeature the CSV columns are its index and value keys, otherwise
    there is a single column named 'x'.
    """
    cases = self._ENCODE_DECODE_CASES + self._DECODE_ONLY_CASES
    for csv_line, value, multivalent, feature_spec in cases:
        schema = dataset_schema.from_feature_spec({'x': feature_spec})
        if isinstance(feature_spec, tf.SparseFeature):
            columns = [feature_spec.index_key, feature_spec.value_key]
        else:
            # Use a list rather than the bare string 'x': a string only
            # behaves like a sequence of column names because this name
            # happens to be a single character.
            columns = ['x']

        if multivalent:
            coder = csv_coder.CsvCoder(columns, schema,
                                       secondary_delimiter='|',
                                       multivalent_columns=columns)
        else:
            coder = csv_coder.CsvCoder(columns, schema)

        np.testing.assert_equal(
            coder.decode(csv_line), {'x': value},
            self._msg_for_decode_case(csv_line, feature_spec))
def testEncode(self):
    """Encode every encode/decode case and check the resulting CSV line.

    Each case is a (csv_line, value, multivalent, feature_spec) tuple. The
    feature is always registered in the schema under the name 'x'; for a
    tf.SparseFeature the CSV columns are its index and value keys, otherwise
    there is a single column named 'x'.
    """
    for csv_line, value, multivalent, feature_spec in (
            self._ENCODE_DECODE_CASES):
        schema = dataset_schema.from_feature_spec({'x': feature_spec})
        if isinstance(feature_spec, tf.SparseFeature):
            columns = [feature_spec.index_key, feature_spec.value_key]
        else:
            # Use a list rather than the bare string 'x': a string only
            # behaves like a sequence of column names because this name
            # happens to be a single character.
            columns = ['x']

        if multivalent:
            coder = csv_coder.CsvCoder(columns, schema,
                                       secondary_delimiter='|',
                                       multivalent_columns=columns)
        else:
            coder = csv_coder.CsvCoder(columns, schema)

        self.assertEqual(
            coder.encode({'x': value}), csv_line,
            msg=self._msg_for_encode_case(value, feature_spec))
def test_decode_error(self, columns, feature_spec, csv_line, error_msg,
                      error_type=ValueError, **kwargs):
    """Assert that decoding `csv_line` fails with the expected error.

    Args:
      columns: Column names passed to the CsvCoder constructor.
      feature_spec: Feature spec from which the schema is built.
      csv_line: Input handed to decode().
      error_msg: Regex the raised error message must match.
      error_type: Expected exception class (defaults to ValueError).
      **kwargs: Extra keyword arguments forwarded to the constructor.
    """
    decoder = csv_coder.CsvCoder(
        columns, schema_utils.schema_from_feature_spec(feature_spec),
        **kwargs)
    # Only decode() runs inside the context; the coder is built beforehand.
    with self.assertRaisesRegexp(error_type, error_msg):
        decoder.decode(csv_line)
def test_encode_error(self, columns, feature_spec, instance, error_msg,
                      error_type=ValueError, **kwargs):
    """Assert that encoding `instance` fails with the expected error.

    Args:
      columns: Column names passed to the CsvCoder constructor.
      feature_spec: Feature spec from which the schema is built.
      instance: Value handed to encode().
      error_msg: Regex the raised error message must match.
      error_type: Expected exception class (defaults to ValueError).
      **kwargs: Extra keyword arguments forwarded to the constructor.
    """
    encoder = csv_coder.CsvCoder(
        columns, dataset_schema.from_feature_spec(feature_spec), **kwargs)
    # Only encode() runs inside the context; the coder is built beforehand.
    with self.assertRaisesRegexp(error_type, error_msg):
        encoder.encode(instance)
def test_fixed_length_missing_values(self):
    """Empty fields of fixed-length features decode to their defaults."""
    feature_spec = {
        'b': tf.FixedLenFeature(shape=[], dtype=tf.float32, default_value=-1),
        'a': tf.FixedLenFeature(shape=[], dtype=tf.string, default_value=''),
    }
    coder = csv_coder.CsvCoder(
        column_names=['a', 'b'],
        schema=dataset_schema.from_feature_spec(feature_spec))

    # (csv line, expected decoded instance)
    cases = [
        ('a_value,', {'a': 'a_value', 'b': -1.0}),
        (',1.0', {'a': '', 'b': 1.0}),
        (',', {'a': '', 'b': -1.0}),
    ]
    for csv_line, expected in cases:
        self.assertEqual(coder.decode(csv_line), expected)
def testEncodeErrors(self):
    """Check that every encode error case raises the expected exception.

    Each case is a (value, error_type, error_msg, multivalent, feature_spec)
    tuple. The feature is always registered in the schema under the name
    'x'; for a tf.SparseFeature the CSV columns are its index and value
    keys, otherwise there is a single column named 'x'.
    """
    for value, error_type, error_msg, multivalent, feature_spec in (
            self._ENCODE_ERROR_CASES):
        schema = dataset_schema.from_feature_spec({'x': feature_spec})
        if isinstance(feature_spec, tf.SparseFeature):
            columns = [feature_spec.index_key, feature_spec.value_key]
        else:
            # Use a list rather than the bare string 'x': a string only
            # behaves like a sequence of column names because this name
            # happens to be a single character.
            columns = ['x']

        with self.assertRaisesRegexp(
                error_type, error_msg,
                msg=self._msg_for_encode_case(value, feature_spec)):
            # The coder is built inside the context, so an error raised by
            # the constructor satisfies the assertion just like one raised
            # by encode().
            if multivalent:
                coder = csv_coder.CsvCoder(columns, schema,
                                           secondary_delimiter='|',
                                           multivalent_columns=columns)
            else:
                coder = csv_coder.CsvCoder(columns, schema)
            coder.encode({'x': value})
def test_data_types(self):
    """Decode a row whose numeric fields are written as quoted strings."""
    coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)
    # Every number in this row is wrapped in double quotes.
    quoted_row = '"12","this is a ,text",female,"1","89.0","12.0"'
    decoded = {
        'category1': ['female'],
        'numeric1': 12,
        'numeric2': [89.0],
        'text1': 'this is a ,text',
        'y': ([12.0], [1])
    }
    # NOTE(review): the helper presumably checks that decoding matches while
    # re-encoding does not reproduce the quoted input -- confirm against its
    # definition.
    self._assert_encode_not_equal_decode(coder, quoted_row, decoded)
def test_fixed_length_missing_values_no_default(self):
    """An empty field with no default value must raise a ValueError."""
    feature_spec = {
        'b': tf.FixedLenFeature(shape=[], dtype=tf.float32),
        'a': tf.FixedLenFeature(shape=[], dtype=tf.string),
    }
    coder = csv_coder.CsvCoder(
        column_names=['a', 'b'],
        schema=dataset_schema.from_feature_spec(feature_spec))
    expected_error = 'expected a value on column "b"'
    with self.assertRaisesRegexp(ValueError, expected_error):
        coder.decode('a_value,')
def test_bad_row(self):
    """Rows with too many or too few columns must be rejected."""
    coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)
    header_mismatch = 'Columns do not match specified csv headers'
    bad_rows = (
        # The data has more columns than expected.
        '12,"this is a ,text",female,1,89.0,12.0,"oh no, I\'m an error"',
        # The data has fewer columns than expected.
        '12,"this is a ,text",female"',
    )
    for data in bad_rows:
        with self.assertRaisesRegexp(Exception, header_mismatch):
            coder.decode(data)
def test_var_length_missing_values(self):
    """Empty fields of variable-length features decode to empty lists."""
    feature_spec = {
        'b': tf.VarLenFeature(dtype=tf.float32),
        'a': tf.VarLenFeature(dtype=tf.string),
    }
    coder = csv_coder.CsvCoder(
        column_names=['a', 'b'],
        schema=dataset_schema.from_feature_spec(feature_spec))

    # (csv line, expected decoded instance)
    cases = [
        ('a_value,', {'a': ['a_value'], 'b': []}),
        (',0', {'a': [], 'b': [0.0]}),
        (',1.0', {'a': [], 'b': [1.0]}),
        (',', {'a': [], 'b': []}),
    ]
    for csv_line, expected in cases:
        self.assertEqual(coder.decode(csv_line), expected)
def test_tsv_coder(self):
    """Run the encode/decode check with a tab as the field delimiter."""
    tsv_row = '12\t"this is a \ttext"\tfemale\t1\t89.0\t12.0'
    decoded = {
        'category1': ['female'],
        'numeric1': 12,
        'numeric2': [89.0],
        'text1': 'this is a \ttext',
        'y': ([12.0], [1])
    }
    tsv_coder = csv_coder.CsvCoder(
        self._COLUMNS, self._INPUT_SCHEMA, delimiter='\t')
    self._assert_encode_decode(tsv_coder, tsv_row, decoded)
def testDecodeErrors(self):
    """Check that every decode error case raises the expected exception.

    Each case is a (csv_line, error_type, error_msg, multivalent,
    feature_spec) tuple. The feature is always registered in the schema
    under the name 'x'; for a tf.SparseFeature the CSV columns are its index
    and value keys, otherwise there is a single column named 'x'.
    """
    for csv_line, error_type, error_msg, multivalent, feature_spec in (
            self._DECODE_ERROR_CASES):
        schema = dataset_schema.from_feature_spec({'x': feature_spec})
        if isinstance(feature_spec, tf.SparseFeature):
            columns = [feature_spec.index_key, feature_spec.value_key]
        else:
            # Use a list rather than the bare string 'x': a string only
            # behaves like a sequence of column names because this name
            # happens to be a single character.
            columns = ['x']

        with self.assertRaisesRegexp(
                error_type, error_msg,
                msg=self._msg_for_decode_case(csv_line, feature_spec)):
            # We don't distinguish between errors in the coder constructor
            # and in the decode method, so both run inside the context.
            if multivalent:
                coder = csv_coder.CsvCoder(columns, schema,
                                           secondary_delimiter='|',
                                           multivalent_columns=columns)
            else:
                coder = csv_coder.CsvCoder(columns, schema)
            coder.decode(csv_line)
def test_picklable(self):
    """A CsvCoder must keep working after pickle round trips."""
    encoded_line = '12,"this is a ,text",categorical_value,1,89.0,12.0,20'
    decoded_instance = {
        'category1': [b'categorical_value'],
        'numeric1': 12,
        'numeric2': [89.0],
        'numeric3': [20],
        'text1': b'this is a ,text',
        'y': ([1], [12.0])
    }
    coder = csv_coder.CsvCoder(
        _COLUMNS, dataset_schema.from_feature_spec(_FEATURE_SPEC))

    # Two passes: the second one pickles a coder that has already been used,
    # ensuring the act of encoding/decoding doesn't break pickling.
    for _ in range(2):
        coder = pickle.loads(pickle.dumps(coder))
        self.assertEqual(coder.decode(encoded_line), decoded_instance)
        self.assertEqual(coder.encode(decoded_instance),
                         encoded_line.encode('utf-8'))
def test_sparse_feature_incorrect_values(self):
    """Sparse indices outside the range of a size-10 feature are rejected."""
    feature_spec = {
        'a': tf.SparseFeature('idx', 'value', tf.float32, 10),
    }
    coder = csv_coder.CsvCoder(
        column_names=['idx', 'value'],
        schema=dataset_schema.from_feature_spec(feature_spec))

    # (csv line, expected error message)
    cases = [
        # Index negative.
        ('-1,12.0', 'has index -1 out of range'),
        # Index equal to size.
        ('10,12.0', 'has index 10 out of range'),
        # Index greater than size.
        ('11,12.0', 'has index 11 out of range'),
    ]
    for csv_line, expected_error in cases:
        with self.assertRaisesRegexp(ValueError, expected_error):
            coder.decode(csv_line)
def test_valency(self):
    """Run the encode/decode check with '|' separating multiple values."""
    # Start from the shared feature spec and widen 'numeric1' to two values.
    spec = self._INPUT_SCHEMA.as_feature_spec().copy()
    spec['numeric1'] = tf.FixedLenFeature(shape=[2], dtype=tf.int32)

    coder = csv_coder.CsvCoder(
        self._COLUMNS,
        dataset_schema.from_feature_spec(spec),
        delimiter=',',
        secondary_delimiter='|',
        multivalent_columns=['numeric1', 'numeric2', 'y'])

    row = '11|12,"this is a ,text",female|male,1|3,89.0|91.0,12.0|15.0'
    # 'category1' is not listed as multivalent, so its '|' is expected to
    # stay inside the single decoded value.
    decoded = {
        'category1': ['female|male'],
        'numeric1': [11, 12],
        'numeric2': [89.0, 91.0],
        'text1': 'this is a ,text',
        'y': ([12.0, 15.0], [1, 3])
    }
    self._assert_encode_decode(coder, row, decoded)
def test_picklable(self):
    """A CsvCoder must keep working after pickle round trips."""
    row = '12,"this is a ,text",female,1,89.0,12.0'
    decoded = {
        'category1': ['female'],
        'numeric1': 12,
        'numeric2': [89.0],
        'text1': 'this is a ,text',
        'y': ([12.0], [1])
    }
    coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)

    # First pass: ensure we can pickle right away. Second pass: ensure we
    # can still pickle after the coder has been used.
    for _ in range(2):
        coder = pickle.loads(pickle.dumps(coder))
        self._assert_encode_decode(coder, row, decoded)
def test_sparse_feature_missing_values(self):
    """A sparse feature may omit both index and value, but not just one."""
    feature_spec = {
        'a': tf.SparseFeature('idx', 'value', tf.float32, 10),
    }
    coder = csv_coder.CsvCoder(
        column_names=['idx', 'value'],
        schema=dataset_schema.from_feature_spec(feature_spec))

    # Missing both value and index (which is allowed).
    self.assertEqual(coder.decode(','), {'a': ([], [])})

    # (csv line, expected error message) for the disallowed combinations.
    partial_rows = [
        # Missing index only (not allowed).
        (',12.0', 'expected an index in column "idx"'),
        # Missing value only (not allowed).
        ('1,', 'expected a value in column "value"'),
    ]
    for csv_line, expected_error in partial_rows:
        with self.assertRaisesRegexp(ValueError, expected_error):
            coder.decode(csv_line)
def test_all_values_present(self):
    """Decode a fully populated row with fixed, var-length and sparse data."""
    feature_spec = {
        'b': tf.FixedLenFeature(shape=[], dtype=tf.float32),
        'a': tf.FixedLenFeature(shape=[], dtype=tf.string),
        'c': tf.VarLenFeature(dtype=tf.string),
        'y': tf.SparseFeature('d', 'e', tf.float32, 10),
    }
    column_names = ['a', 'b', 'c', 'd', 'e']
    coder = csv_coder.CsvCoder(
        column_names=column_names,
        schema=dataset_schema.from_feature_spec(feature_spec))

    # Column 'c' is specified as a string, so its value '0' is not cast to a
    # number.
    expected = {
        'a': 'a_value',
        'b': 1.0,
        'c': ['0'],
        'y': ([12.0], [1])
    }
    self.assertEqual(coder.decode('a_value,1.0,0,1,12.0'), expected)
def test_decode_errors(self):
    """Check the errors raised by decode() for several kinds of bad input."""
    feature_spec = {
        'b': tf.FixedLenFeature(shape=[], dtype=tf.float32),
        'a': tf.FixedLenFeature(shape=[], dtype=tf.string),
    }
    coder = csv_coder.CsvCoder(
        column_names=['a', 'b'],
        schema=dataset_schema.from_feature_spec(feature_spec))

    # Non-numerical value in the float column.
    with self.assertRaisesRegexp(
            ValueError, 'could not convert string to float: b_value'):
        coder.decode('a_value, b_value')

    # Bad csv: the input is an int rather than a string.
    with self.assertRaisesRegexp(
            csv_coder.DecodeError, 'string or Unicode object, int found'):
        coder.decode(123)

    # Rows with an extra column, a missing column and no content at all are
    # all expected to report the same header mismatch.
    header_mismatch = 'Columns do not match specified csv headers'
    for bad_row in ('1,2,', 'a_value', ''):
        with self.assertRaisesRegexp(csv_coder.DecodeError, header_mismatch):
            coder.decode(bad_row)
def test_column_not_found(self):
    """Building a coder with no column names must raise a ValueError."""
    no_columns = []
    expected_error = 'Column not found: '
    with self.assertRaisesRegexp(ValueError, expected_error):
        csv_coder.CsvCoder(no_columns, self._INPUT_SCHEMA)
def test_encode(self, columns, feature_spec, csv_line, instance, **kwargs):
    """Assert that encoding `instance` yields `csv_line` as UTF-8 bytes.

    Args:
      columns: Column names passed to the CsvCoder constructor.
      feature_spec: Feature spec from which the schema is built.
      csv_line: Expected CSV text; compared after encoding to UTF-8.
      instance: Value handed to encode().
      **kwargs: Extra keyword arguments forwarded to the constructor.
    """
    encoder = csv_coder.CsvCoder(
        columns, dataset_schema.from_feature_spec(feature_spec), **kwargs)
    actual = encoder.encode(instance)
    self.assertEqual(actual, csv_line.encode('utf-8'))
def test_decode(self, columns, feature_spec, csv_line, instance, **kwargs):
    """Assert that decoding `csv_line` yields `instance`.

    Args:
      columns: Column names passed to the CsvCoder constructor.
      feature_spec: Feature spec from which the schema is built.
      csv_line: Input handed to decode().
      instance: Expected decoded value, compared with
        np.testing.assert_equal.
      **kwargs: Extra keyword arguments forwarded to the constructor.
    """
    decoder = csv_coder.CsvCoder(
        columns, dataset_schema.from_feature_spec(feature_spec), **kwargs)
    actual = decoder.decode(csv_line)
    np.testing.assert_equal(actual, instance)