def test_raise_error_if_configs_is_not_a_list(self):
    config = LoadConfig(source='bucket', destination='local',
                        data_name='a1')
    with self.assertRaises(ValueError) as cm:
        create_loader().multi_load(configs={config})
    self.assertEqual('configs must be a list', str(cm.exception))

def test_list_local_file_paths(self):
    populate_local()
    gpl20 = create_loader(bucket_dir_path=constants.bucket_subdir_path)
    gpl21 = create_loader(bucket_dir_path=constants.bucket_subdir_path,
                          local_dir_path=constants.local_subdir_path)
    local_file_paths = sorted(
        ids.build_local_file_path_0(f'a{i}') for i in range(7, 12))
    expected = [os.path.normpath(p) for p in local_file_paths]
    computed = [os.path.normpath(p)
                for p in gpl20.list_local_file_paths('a')]
    self.assertEqual(expected, computed)
    local_file_paths = sorted(
        ids.build_local_file_path_1(f'a{i}') for i in range(10, 13))
    expected = [os.path.normpath(p) for p in local_file_paths]
    computed = [os.path.normpath(p)
                for p in gpl21.list_local_file_paths('a1')]
    self.assertEqual(expected, computed)
    self.assertEqual([], gpl21.list_local_file_paths('sub'))

def test_list_blob_uris(self):
    populate_bucket()
    gpl00 = create_loader()
    gpl10 = create_loader(bucket_dir_path=constants.bucket_dir_path)
    gpl20 = create_loader_quick_setup(
        bucket_dir_path=constants.bucket_subdir_path, separator='#')
    blob_names = [ids.build_blob_name_0(f'a{i}') for i in range(7, 12)]
    blob_uris = sorted(ids.build_bucket_uri(n) for n in blob_names)
    self.assertEqual(blob_uris, gpl00.list_blob_uris('a'))
    blob_names = [ids.build_blob_name_1(f'a{i}') for i in range(10, 13)]
    blob_uris = sorted(ids.build_bucket_uri(n) for n in blob_names)
    self.assertEqual(blob_uris, gpl10.list_blob_uris('a1'))
    blob_names = [ids.build_blob_name_2(f'a{i}') for i in range(9, 14)]
    blob_uris = sorted(ids.build_bucket_uri(n) for n in blob_names)
    self.assertEqual(blob_uris, gpl20.list_blob_uris('a'))
    self.assertEqual([], gpl00.list_blob_uris('dir'))
    self.assertEqual([], gpl10.list_blob_uris('su'))

def test_raise_error_if_dataset_id_not_contain_exactly_one_dot(self):
    msg = 'dataset_id must contain exactly one dot'
    with self.assertRaises(ValueError) as cm:
        create_loader(dataset_id='ab')
    self.assertEqual(msg, str(cm.exception))
    with self.assertRaises(ValueError) as cm:
        create_loader(dataset_id='a.b.c')
    self.assertEqual(msg, str(cm.exception))

def test_raise_error_if_write_empty_and_already_exists(self):
    populate_dataset()
    populate_local()
    with self.assertRaises(Conflict) as cm:
        create_loader().load(source='local', destination='dataset',
                             data_name='a10',
                             write_disposition='WRITE_EMPTY')
    self.assertEqual(
        '409 Already Exists: Table dmp-y-tests:test_gpl.a10',
        str(cm.exception))

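# Note: BigQuery's WRITE_EMPTY disposition only writes to a table that
# does not already exist, so loading into the pre-populated table 'a10'
# surfaces a 409 Conflict from the API, as asserted above.
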
def test_raise_error_if_prefix(self):
    config1 = LoadConfig(source='dataframe', destination='dataset',
                         dataframe=pandas.DataFrame(data={'x': [3]}),
                         data_name='a')
    config2 = LoadConfig(source='query', destination='dataframe',
                         query='select 4 as y', data_name='aa')
    with self.assertRaises(ValueError) as cm:
        create_loader().multi_load(configs=[config1, config2])
    self.assertEqual('a is a prefix of aa', str(cm.exception))

def test_exist_in_local(self):
    gpl00 = create_loader()
    gpl01 = create_loader(bq_client=None, dataset_id=None,
                          gs_client=None, bucket_name=None,
                          bucket_dir_path='bucket_dir_path',
                          local_dir_path=constants.local_subdir_path)
    self.assertFalse(gpl00.exist_in_local('a'))
    self.assertFalse(gpl01.exist_in_local('a9'))
    populate_local()
    self.assertTrue(gpl00.exist_in_local('a'))
    self.assertTrue(gpl01.exist_in_local('a9'))

def test_exist_in_bucket(self):
    gpl01 = create_loader(local_dir_path=constants.local_subdir_path)
    gpl11 = create_loader(bucket_dir_path=constants.bucket_dir_path,
                          local_dir_path=constants.local_subdir_path)
    gpl21 = create_loader_quick_setup(
        dataset_name=None,
        bucket_dir_path=constants.bucket_subdir_path,
        local_dir_path=constants.local_subdir_path)
    self.assertFalse(gpl01.exist_in_bucket('a1'))
    self.assertFalse(gpl11.exist_in_bucket('a10'))
    self.assertFalse(gpl21.exist_in_bucket('a'))
    populate_bucket()
    self.assertTrue(gpl01.exist_in_bucket('a1'))
    self.assertTrue(gpl11.exist_in_bucket('a10'))
    self.assertTrue(gpl21.exist_in_bucket('a'))

def test_heterogeneous_configs(self):
    expected1 = pandas.DataFrame(data={'x': [3, 10]})
    expected2 = pandas.DataFrame(data={'y': [4]})
    expected3 = pandas.DataFrame(data={'x': ['b'], 'y': ['a']})
    populate()
    config1 = LoadConfig(source='dataframe', destination='dataset',
                         dataframe=expected1, data_name='a10')
    config2 = LoadConfig(source='query', destination='dataframe',
                         query='select 4 as y')
    config3 = LoadConfig(source='query', destination='bucket',
                         query="select 'b' as x, 'a' as y",
                         data_name='a11')
    gpl = create_loader(bucket_dir_path=constants.bucket_subdir_path)
    load_results = gpl.multi_load([config1, config2, config3])
    self.assertEqual(3, len(load_results))
    self.assertIsNone(load_results[0])
    self.assertIsNone(load_results[2])
    computed1 = load.dataset_to_dataframe('a10')
    self.assert_pandas_equal(expected1, computed1)
    computed2 = load_results[1]
    self.assert_pandas_equal(expected2, computed2)
    blob_name = ids.build_blob_name_2('a11-000000000000.csv.gz')
    computed3 = load.bucket_to_dataframe(blob_name, decompress=True)
    self.assert_pandas_equal(expected3, computed3)

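# For context: a minimal sketch of what the assert_pandas_equal helper
# used throughout these tests might look like (hypothetical; the real
# helper is defined on the base test class elsewhere, and this sketch
# assumes frames are compared up to row order):
#
#     def assert_pandas_equal(self, expected, computed):
#         expected = expected.sort_values(
#             list(expected.columns)).reset_index(drop=True)
#         computed = computed.sort_values(
#             list(computed.columns)).reset_index(drop=True)
#         pandas.testing.assert_frame_equal(expected, computed)
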
def test_dataset_to_local(self):
    expected = pandas.DataFrame(data={'x': [1, 2, 3, 4]})
    load.multi_dataframe_to_dataset([expected], ['b1'])
    gpl = create_loader(bucket_dir_path=constants.bucket_subdir_path,
                        local_dir_path=constants.local_subdir_path)
    gpl.load(source='dataset', destination='local', data_name='b1')
    local_file_path = ids.build_local_file_path_1(
        'b1-000000000000.csv.gz')
    computed = load.local_to_dataframe(local_file_path)
    self.assert_pandas_equal(expected, computed)

def test_query_to_dataframe(self):
    expected = pandas.DataFrame(data={'x': [3, 2], 'y': ['a', 'b']})
    populate()
    gpl = create_loader(separator='#')
    computed = gpl.load(
        source='query', destination='dataframe',
        query="select 3 as x, 'a' as y union all select 2 as x, 'b' as y")
    self.assert_pandas_equal(expected, computed)

def test_dataset_to_bucket(self):
    expected = pandas.DataFrame(data={'x': ['a8_dataset']})
    populate_dataset()
    gpl = create_loader(bucket_dir_path=constants.bucket_subdir_path,
                        local_dir_path=None)
    gpl.load(source='dataset', destination='bucket', data_name='a8')
    blob_name = ids.build_blob_name_2('a8-000000000000.csv.gz')
    computed = load.bucket_to_dataframe(blob_name, decompress=True)
    self.assert_pandas_equal(expected, computed)

def test_local_to_dataframe(self):
    expected = pandas.DataFrame(
        data={'x': [f'a{i}_local' for i in range(10, 13)]})
    populate_local()
    gpl = create_loader(bucket_dir_path=constants.bucket_dir_path,
                        local_dir_path=constants.local_subdir_path)
    computed = gpl.load(source='local', destination='dataframe',
                        data_name='a1')
    self.assert_pandas_equal(expected, computed)

def test_call_loader_getters(self):
    gpl00 = create_loader()
    gpl10 = create_loader(bucket_dir_path=constants.bucket_dir_path)
    gpl20 = create_loader(bucket_dir_path=constants.bucket_subdir_path)
    gpl01 = create_loader_quick_setup(
        project_id=None, dataset_name=None, bucket_name=None,
        local_dir_path=constants.local_subdir_path)
    self.assertIsNotNone(gpl00.bq_client)
    self.assertIsNotNone(gpl00.gs_client)
    self.assertIsNotNone(gpl00.bucket)
    self.assertEqual(constants.dataset_id, gpl00.dataset_id)
    self.assertEqual(constants.dataset_name, gpl00.dataset_name)
    self.assertEqual(constants.bucket_name, gpl00.bucket_name)
    self.assertIsNone(gpl00.bucket_dir_path)
    self.assertEqual(constants.bucket_dir_path, gpl10.bucket_dir_path)
    self.assertEqual(constants.bucket_subdir_path, gpl20.bucket_dir_path)
    self.assertEqual(constants.local_dir_path, gpl00.local_dir_path)
    self.assertEqual(constants.local_subdir_path, gpl01.local_dir_path)

def test_dataframe_to_bucket(self):
    expected = pandas.DataFrame(data={'x': [1, 2, 3], 'y': [1, 2, 4]})
    populate()
    gpl = create_loader()
    gpl.load(source='dataframe', destination='bucket',
             dataframe=expected, data_name='a1')
    blob_name = ids.build_blob_name_0('a1.csv.gz')
    computed = load.bucket_to_dataframe(blob_name, decompress=True)
    self.assert_pandas_equal(expected, computed)

def test_write_empty_local_to_dataset(self):
    expected = pandas.DataFrame(data={'x': [1]})
    local_file_path = ids.build_local_file_path_1('s12')
    load.dataframe_to_local(expected, local_file_path)
    gpl = create_loader(local_dir_path=constants.local_subdir_path)
    gpl.load(source='local', destination='dataset', data_name='s12',
             write_disposition='WRITE_EMPTY')
    computed = load.dataset_to_dataframe('s12')
    self.assert_pandas_equal(expected, computed)

def test_query_to_dataset(self):
    expected = pandas.DataFrame(data={'x': [3, 2], 'y': ['a', 'b']})
    populate_dataset()
    gpl = create_loader(gs_client=None, bucket_name=None)
    gpl.load(
        source='query', destination='dataset',
        query="select 3 as x, 'a' as y union all select 2 as x, 'b' as y",
        data_name='a0')
    computed = load.dataset_to_dataframe('a0')
    self.assert_pandas_equal(expected, computed)

def test_compress_query_to_bucket(self):
    gpl = create_loader(bucket_dir_path=constants.bucket_subdir_path)
    gpl.load(source='query', destination='bucket', query='select 5',
             data_name='b100')
    blob_name = ids.build_blob_name_2('b100-000000000000.csv.gz')
    local_file_path = ids.build_local_file_path_1(
        'b100-000000000000.csv.gz')
    load.bucket_to_local(blob_name, local_file_path)
    self.assertTrue(is_gz_file(local_file_path))

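# For context: a minimal sketch of the is_gz_file helper used above
# (hypothetical reconstruction; the suite's actual helper lives
# elsewhere). A gzip stream always starts with the magic bytes
# 0x1f 0x8b, so checking the first two bytes is enough:
#
#     def is_gz_file(file_path):
#         with open(file_path, 'rb') as f:
#             return f.read(2) == b'\x1f\x8b'
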
def test_list_blobs(self):
    populate_bucket()
    gpl00 = create_loader_quick_setup()
    gpl10 = create_loader(bucket_dir_path=constants.bucket_dir_path)
    gpl20 = create_loader(bucket_dir_path=constants.bucket_subdir_path)
    self.assertEqual(
        sorted([ids.build_blob_name_0(f'a{i}') for i in range(7, 12)]),
        [b.name for b in gpl00.list_blobs('a')])
    self.assertEqual(
        sorted([ids.build_blob_name_1(f'a{i}') for i in range(10, 13)]),
        [b.name for b in gpl10.list_blobs('a1')])
    self.assertEqual(
        sorted([ids.build_blob_name_2(f'a{i}') for i in range(9, 14)]),
        [b.name for b in gpl20.list_blobs('')])
    self.assertEqual([], gpl00.list_blobs('dir'))
    self.assertEqual([], gpl10.list_blobs('su'))

def test_raise_error_if_missing_required_resources(self):
    with self.assertRaises(ValueError) as cm:
        create_loader(bq_client=None, dataset_id=None).load(
            source='query', destination='dataset',
            data_name='e0', query='select 3')
    self.assertEqual('bq_client must be provided if dataset is used',
                     str(cm.exception))
    with self.assertRaises(ValueError) as cm:
        create_loader(gs_client=None, bucket_name=None).load(
            source='bucket', destination='local', data_name='a')
    self.assertEqual('gs_client must be provided if bucket is used',
                     str(cm.exception))
    with self.assertRaises(ValueError) as cm:
        create_loader(local_dir_path=None).load(
            source='dataframe', destination='local',
            dataframe=pandas.DataFrame(data={'x': [1]}), data_name='a')
    self.assertEqual('local_dir_path must be provided if local is used',
                     str(cm.exception))

def test_bucket_to_dataframe(self):
    expected = pandas.DataFrame(data={'x': [3, 2], 'y': ['a', 'b']})
    populate()
    blob_name = ids.build_blob_name_2('a10')
    load.dataframe_to_bucket(expected, blob_name)
    gpl = create_loader(bq_client=None, dataset_id=None,
                        bucket_dir_path=constants.bucket_subdir_path,
                        local_dir_path=constants.local_subdir_path)
    computed = gpl.load(source='bucket', destination='dataframe',
                        data_name='a10')
    self.assert_pandas_equal(expected, computed)

def test_download_upload(self):
    expected = pandas.DataFrame(data={'x': [3, 2]})
    gpl = create_loader(bucket_dir_path=constants.bucket_dir_path,
                        local_dir_path=constants.local_subdir_path)
    df0 = gpl.load(source='query', destination='dataframe',
                   query='select 3 as x union all select 2 as x')
    gpl.load(source='dataframe', destination='dataset',
             dataframe=df0, data_name='b1')
    computed = load.dataset_to_dataframe('b1')
    self.assert_pandas_equal(expected, computed)

def test_query_to_bucket(self):
    with self.assertLogs('google_pandas_load.loader',
                         level='DEBUG') as cm:
        gpl = create_loader(bucket_dir_path=constants.bucket_dir_path,
                            local_dir_path=None)
        gpl.load(source='query', destination='bucket',
                 query='select 3', data_name='a0')
    records = cm.records
    self.assertEqual(4, len(records))
    regexp = (r'^google_pandas_load.loader # DEBUG # '
              r'Ended query to dataset \[[0-9]+s, [0-9]+\.[0-9]+\$\]$')
    pattern = re.compile(regexp)
    log = formatter.format(records[1])
    self.assertIsNotNone(pattern.search(log))

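# The regex above implies a log layout of name # level # message; the
# module-level `formatter` is presumably built along these lines
# (hypothetical, defined elsewhere in the suite):
#
#     formatter = logging.Formatter(
#         '%(name)s # %(levelname)s # %(message)s')
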
def test_post_clear_query_to_dataframe(self):
    populate()
    blob_name = ids.build_blob_name_0('a10')
    local_file_path = ids.build_local_file_path_1('a10')
    self.assertTrue(exist.table_exists('a10'))
    self.assertTrue(exist.blob_exists(blob_name))
    self.assertTrue(exist.local_file_exists(local_file_path))
    gpl = create_loader(local_dir_path=constants.local_subdir_path)
    gpl.load(source='query', destination='dataframe',
             query='select 3', data_name='a10')
    self.assertFalse(exist.table_exists('a10'))
    self.assertFalse(exist.blob_exists(blob_name))
    self.assertFalse(exist.local_file_exists(local_file_path))

def test_write_append_dataframe_to_dataset(self):
    expected = pandas.DataFrame(data={'x': [0, 1]})
    df00 = pandas.DataFrame(data={'x': [0]})
    df01 = pandas.DataFrame(data={'x': [1]})
    gpl = create_loader(chunk_size=2**18, timeout=5)
    gpl.load(source='dataframe', destination='dataset',
             dataframe=df00, data_name='s13')
    gpl.load(source='dataframe', destination='dataset',
             dataframe=df01, data_name='s13',
             write_disposition='WRITE_APPEND')
    computed = load.dataset_to_dataframe('s13')
    self.assert_pandas_equal(expected, computed)

def test_upload_download(self):
    expected = pandas.DataFrame(data={'x': [1], 'y': [3]})
    populate()
    gpl = create_loader(bucket_dir_path=constants.bucket_subdir_path,
                        separator='#', chunk_size=2**18, timeout=15)
    gpl.load(source='dataframe', destination='dataset',
             dataframe=expected, data_name='a9')
    query = f'select * from {constants.dataset_id}.a9'
    computed = gpl.load(source='query', destination='dataframe',
                        query=query)
    self.assert_pandas_equal(expected, computed)

def test_raise_error_if_data_name_contains_slash(self):
    msg = 'data_name=a/b must not contain a /'
    with self.assertRaises(ValueError) as cm:
        create_loader().list_blobs(data_name='a/b')
    self.assertEqual(msg, str(cm.exception))
    with self.assertRaises(ValueError) as cm:
        create_loader().list_blob_uris(data_name='a/b')
    self.assertEqual(msg, str(cm.exception))
    with self.assertRaises(ValueError) as cm:
        create_loader().list_local_file_paths(data_name='a/b')
    self.assertEqual(msg, str(cm.exception))

def test_no_skip_blank_lines(self):
    df0 = pandas.DataFrame(data={'x': [3, numpy.nan]})
    df1 = pandas.DataFrame(data={'x': [numpy.nan, 4]})
    df2 = pandas.DataFrame(data={'x': [numpy.nan, 5],
                                 'y': [numpy.nan, 6]})
    df3 = pandas.DataFrame(data={'x': [7, numpy.nan],
                                 'y': [8, numpy.nan]})
    expecteds = [df0, df1, df2, df3]
    populate()
    query0 = 'select 3 as x union all select null as x'
    query1 = 'select null as x union all select 4 as x'
    query2 = ('select null as x, null as y union all '
              'select 5 as x, 6 as y')
    query3 = ('select 7 as x, 8 as y union all '
              'select null as x, null as y')
    queries = [query0, query1, query2, query3]
    configs = []
    for query in queries:
        config = LoadConfig(source='query', destination='dataframe',
                            query=query)
        configs.append(config)
    gpl = create_loader()
    computeds = gpl.multi_load(configs)
    for expected, computed in zip(expecteds, computeds):
        self.assert_pandas_equal(expected, computed)

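# Note: a row whose columns are all NULL is exported to CSV as a blank
# line, and pandas.read_csv drops such lines by default
# (skip_blank_lines=True). The loader presumably reads with
# skip_blank_lines=False, which is what keeps the all-NaN rows asserted
# above (hedged: the loader's exact read options are defined elsewhere).
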
def test_raise_error_if_no_data(self):
    with self.assertRaises(ValueError) as cm:
        create_loader().load(source='dataset', destination='local',
                             data_name='e0')
    self.assertEqual('There is no data named e0 in dataset',
                     str(cm.exception))
    with self.assertRaises(ValueError) as cm:
        create_loader().load(source='bucket', destination='dataset',
                             data_name='e0')
    self.assertEqual('There is no data named e0 in bucket',
                     str(cm.exception))
    with self.assertRaises(ValueError) as cm:
        create_loader().load(source='local', destination='dataframe',
                             data_name='e0')
    self.assertEqual('There is no data named e0 in local',
                     str(cm.exception))

def test_keep_source_in_dataset(self):
    populate_dataset()
    gpl = create_loader(bucket_dir_path=constants.bucket_subdir_path,
                        local_dir_path=constants.local_subdir_path)
    gpl.load(source='dataset', destination='local', data_name='a7')
    self.assertTrue(exist.table_exists('a7'))