def test_unsupported_schema(self):
  """Test supported schema values.

  Note that not all valid BQ schema values are valid/used in the
  structured data package. Each unsupported column type must be
  rejected by analyze_async in both local and cloud mode.
  """
  unsupported_col_types = ['bytes', 'boolean', 'timestamp', 'date',
                           'time', 'datetime', 'record']
  for col_type in unsupported_col_types:
    # subTest reports each failing column type independently instead of
    # aborting the whole test at the first failure.
    with self.subTest(col_type=col_type):
      schema = 'col_name:%s' % col_type
      # Local (cloud=False) run must flag the unsupported type.
      job = core_sd.analyze_async(
          'some_dir',
          dlml.CsvDataSet(file_pattern=['file1.txt'], schema=schema),
          cloud=False).wait()
      self.assertIn('Schema contains an unsupported type %s.' % col_type,
                    job.fatal_error.message)
      # Cloud (cloud=True) run must flag it as well.
      job = core_sd.analyze_async(
          'gs://some_dir',
          dlml.CsvDataSet(file_pattern=['gs://file1.txt'], schema=schema),
          cloud=True,
          project_id='junk_project_id').wait()
      self.assertIn('Schema contains an unsupported type %s.' % col_type,
                    job.fatal_error.message)
def test_cloud_but_local_files(self):
  """Cloud analysis with a local (non-GCS) file pattern must fail."""
  local_dataset = dlml.CsvDataSet(
      file_pattern=['file1.txt'],
      schema='col1:STRING,col2:INTEGER,col3:FLOAT')
  job = core_sd.analyze_async(
      'gs://some_dir', local_dataset,
      project_id='project_id', cloud=True).wait()
  self.assertIn('File file1.txt is not a gcs path',
                job.fatal_error.message)
def test_csvdataset_one_file(self):
  """Test CsvDataSet has only one file/pattern."""
  # TODO(brandondutra) remove this restriction
  two_file_dataset = dlml.CsvDataSet(
      file_pattern=['file1.txt', 'file2.txt'],
      schema='col1:STRING,col2:INTEGER,col3:FLOAT')
  job = core_sd.analyze_async('some_dir', two_file_dataset).wait()
  self.assertIn('should be built with a file pattern',
                job.fatal_error.message)
def test_projectid(self):
  """Test passing project id but cloud is false."""
  dataset = dlml.CsvDataSet(
      file_pattern=['file1.txt'],
      schema='col1:STRING,col2:INTEGER,col3:FLOAT')
  # project_id given without cloud=True should produce a fatal error.
  job = core_sd.analyze_async(
      'some_dir', dataset, project_id='project_id').wait()
  self.assertIn('project_id only needed if cloud is True',
                job.fatal_error.message)
def test_not_csvdataset(self):
  """Test csvdataset is used."""
  # Pass a plain string instead of a CsvDataSet instance.
  job = core_sd.analyze_async('some_dir', 'some_file.txt').wait()
  self.assertIn('Only CsvDataSet is supported',
                job.fatal_error.message)