예제 #1
0
    def test_heterogeneous_configs(self):
        """Run three differently-shaped configs through one multi_load call.

        The configs cover dataframe -> dataset, query -> dataframe and
        query -> bucket. Only the config whose destination is 'dataframe'
        is expected to yield a result; the other two slots must be None.
        Each destination is then read back and compared with its expected
        dataframe.
        """
        expected1 = pandas.DataFrame(data={'x': [3, 10]})
        expected2 = pandas.DataFrame(data={'y': [4]})
        expected3 = pandas.DataFrame(data={'x': ['b'], 'y': ['a']})
        populate()
        config1 = LoadConfig(source='dataframe',
                             destination='dataset',
                             dataframe=expected1,
                             data_name='a10')
        config2 = LoadConfig(source='query',
                             destination='dataframe',
                             query='select 4 as y')
        config3 = LoadConfig(source='query',
                             destination='bucket',
                             query="select 'b' as x, 'a' as y",
                             data_name='a11')
        gpl = create_loader(bucket_dir_path=constants.bucket_subdir_path)
        load_results = gpl.multi_load([config1, config2, config3])
        self.assertEqual(len(load_results), 3)
        # assertIsNone reports the unexpected value on failure, which
        # assertTrue(... is None) does not.
        self.assertIsNone(load_results[0])
        self.assertIsNone(load_results[2])

        # dataframe -> dataset: read the table back.
        computed1 = load.dataset_to_dataframe('a10')
        self.assert_pandas_equal(expected1, computed1)

        # query -> dataframe: the result is returned directly.
        computed2 = load_results[1]
        self.assert_pandas_equal(expected2, computed2)

        # query -> bucket: read the blob back with decompress=True.
        blob_name = ids.build_blob_name_2('a11-000000000000.csv.gz')
        computed3 = load.bucket_to_dataframe(blob_name, decompress=True)
        self.assert_pandas_equal(expected3, computed3)
예제 #2
0
 def test_query_to_dataframe(self):
     """Load a two-row query result directly into a dataframe."""
     populate()
     loader = create_loader(separator='#')
     sql = "select 3 as x, 'a' as y union all select 2 as x, 'b' as y"
     result = loader.load(source='query', destination='dataframe', query=sql)
     wanted = pandas.DataFrame(data={'x': [3, 2], 'y': ['a', 'b']})
     self.assert_pandas_equal(wanted, result)
예제 #3
0
 def test_dataframe_to_dataset(self):
     """Push a dataframe into the dataset and read the same rows back."""
     frame = pandas.DataFrame(data={'x': [1, 2, 3], 'y': [1, 2, 4]})
     populate()
     load_kwargs = {
         'source': 'dataframe',
         'destination': 'dataset',
         'dataframe': frame,
         'data_name': 'a1',
     }
     create_loader_quick_setup().load(**load_kwargs)
     # Read table 'a1' back and compare it with what was sent.
     self.assert_pandas_equal(frame, load.dataset_to_dataframe('a1'))
예제 #4
0
 def test_config_repeated(self):
     """Submit one query config three times in a single multi_load call.

     The call must return one result per submitted config, and every
     returned dataframe must equal the single-row frame produced by the
     query.
     """
     expected = pandas.DataFrame(data={'x': [3]})
     populate()
     config = LoadConfig(source='query',
                         destination='dataframe',
                         query='select 3 as x')
     gpl = create_loader_quick_setup(
         local_dir_path=constants.local_subdir_path)
     repeats = 3
     computeds = gpl.multi_load(configs=[config] * repeats)
     # Without this check an empty or short result list would let the
     # loop below pass without comparing anything.
     self.assertEqual(len(computeds), repeats)
     for computed in computeds:
         self.assert_pandas_equal(expected, computed)
예제 #5
0
 def test_dataframe_to_bucket(self):
     """Write a dataframe to the bucket and read the resulting blob back."""
     frame = pandas.DataFrame(data={'x': [1, 2, 3], 'y': [1, 2, 4]})
     populate()
     loader = create_loader()
     loader.load(
         source='dataframe',
         destination='bucket',
         dataframe=frame,
         data_name='a1',
     )
     # The blob is looked up as 'a1.csv.gz' and read with decompress=True.
     stored = load.bucket_to_dataframe(
         ids.build_blob_name_0('a1.csv.gz'), decompress=True)
     self.assert_pandas_equal(frame, stored)
예제 #6
0
 def test_bucket_to_dataframe(self):
     """Read a blob uploaded beforehand back into a dataframe.

     The loader is created with bq_client=None and dataset_id=None.
     """
     frame = pandas.DataFrame(data={'x': [3, 2], 'y': ['a', 'b']})
     populate()
     # Seed the bucket with the frame the loader is expected to return.
     load.dataframe_to_bucket(frame, ids.build_blob_name_2('a10'))
     loader_options = {
         'bq_client': None,
         'dataset_id': None,
         'bucket_dir_path': constants.bucket_subdir_path,
         'local_dir_path': constants.local_subdir_path,
     }
     loader = create_loader(**loader_options)
     fetched = loader.load(
         source='bucket', destination='dataframe', data_name='a10')
     self.assert_pandas_equal(frame, fetched)
예제 #7
0
 def test_post_clear_dataframe_to_dataset(self):
     """Check that a dataframe -> dataset load leaves no 'a10' leftovers.

     The 'a10' blob and local file exist before the load and must both be
     gone once it has finished.
     """
     populate()
     blob = ids.build_blob_name_2('a10')
     local_path = ids.build_local_file_path_0('a10')

     # Precondition: both artefacts are present before loading.
     self.assertTrue(exist.blob_exists(blob))
     self.assertTrue(exist.local_file_exists(local_path))

     loader = create_loader_quick_setup(
         bucket_dir_path=constants.bucket_subdir_path)
     single_row = pandas.DataFrame(data={'x': [1]})
     loader.load(
         source='dataframe',
         destination='dataset',
         dataframe=single_row,
         data_name='a10',
     )

     # Postcondition: neither artefact is left behind.
     self.assertFalse(exist.blob_exists(blob))
     self.assertFalse(exist.local_file_exists(local_path))
예제 #8
0
 def test_post_clear_query_to_dataframe(self):
     """Check that a query -> dataframe load leaves no 'a10' leftovers.

     The 'a10' table, blob and local file exist before the load and must
     all be gone once it has finished.
     """
     populate()
     blob = ids.build_blob_name_0('a10')
     local_path = ids.build_local_file_path_1('a10')

     # Precondition: table, blob and local file are all present.
     self.assertTrue(exist.table_exists('a10'))
     self.assertTrue(exist.blob_exists(blob))
     self.assertTrue(exist.local_file_exists(local_path))

     loader = create_loader(local_dir_path=constants.local_subdir_path)
     loader.load(
         source='query',
         destination='dataframe',
         query='select 3',
         data_name='a10',
     )

     # Postcondition: nothing named 'a10' is left in any of the three places.
     self.assertFalse(exist.table_exists('a10'))
     self.assertFalse(exist.blob_exists(blob))
     self.assertFalse(exist.local_file_exists(local_path))
예제 #9
0
 def test_upload_download(self):
     """Round-trip a dataframe: upload it to table 'a9', then query it back.

     The loader is created with a custom separator, chunk size and timeout.
     """
     frame = pandas.DataFrame(data={'x': [1], 'y': [3]})
     populate()
     loader = create_loader(
         bucket_dir_path=constants.bucket_subdir_path,
         separator='#',
         chunk_size=2**18,
         timeout=15,
     )

     # Upload leg.
     loader.load(
         source='dataframe',
         destination='dataset',
         dataframe=frame,
         data_name='a9',
     )

     # Download leg: select everything from the table just written.
     select_all = f'select * from {constants.dataset_id}.a9'
     fetched = loader.load(
         source='query', destination='dataframe', query=select_all)
     self.assert_pandas_equal(frame, fetched)
예제 #10
0
 def test_no_skip_blank_lines(self):
     """Check that all-null rows in query results are kept as NaN rows.

     Each query returns two rows, one of which holds only nulls (first or
     last row, in one or two columns). The loaded dataframe must keep
     that row, with NaN in place of the nulls.
     """
     df0 = pandas.DataFrame(data={'x': [3, numpy.nan]})
     df1 = pandas.DataFrame(data={'x': [numpy.nan, 4]})
     df2 = pandas.DataFrame(data={'x': [numpy.nan, 5], 'y': [numpy.nan, 6]})
     df3 = pandas.DataFrame(data={'x': [7, numpy.nan], 'y': [8, numpy.nan]})
     expecteds = [df0, df1, df2, df3]
     populate()
     query0 = 'select 3 as x union all select null as x'
     query1 = 'select null as x union all select 4 as x'
     query2 = 'select null as x, null as y union all ' \
              'select 5 as x, 6 as y'
     query3 = 'select 7 as x, 8 as y union all ' \
              'select null as x, null as y'
     queries = [query0, query1, query2, query3]
     configs = [
         LoadConfig(source='query', destination='dataframe', query=query)
         for query in queries
     ]
     gpl = create_loader()
     computed = gpl.multi_load(configs)
     # zip() stops at the shorter input, so a missing result would go
     # unnoticed without an explicit length check.
     self.assertEqual(len(computed), len(expecteds))
     for df, dg in zip(expecteds, computed):
         self.assert_pandas_equal(df, dg)