def test_build_query_components_when_no_pcoll_queried(self):
    # A query that references no PCollection must come back unchanged, and the
    # returned SQL source must be a Pipeline.
    # NOTE(review): another test with this exact name is defined later in this
    # file; if both live in the same class, the later one replaces this one.
    sql = (
        "SELECT CAST(1 AS INT) AS `id`, "
        "CAST('foo' AS VARCHAR) AS `str`, "
        "CAST(3.14 AS DOUBLE) AS `flt`")

    rewritten, source = _build_query_components(sql, {})

    self.assertEqual(rewritten, sql)
    self.assertIsInstance(source, beam.Pipeline)
def test_build_query_components_when_no_pcoll_queried(self):
    # A query that references no PCollection must come back unchanged, the
    # returned SQL source must be a Pipeline, and chain.current must carry the
    # same source type, the requested output name, and the query text.
    sql = (
        "SELECT CAST(1 AS INT) AS `id`, "
        "CAST('foo' AS VARCHAR) AS `str`, "
        "CAST(3.14 AS DOUBLE) AS `flt`")
    output_name = 'output'

    rewritten, source, sql_chain = _build_query_components(
        sql, {}, output_name)

    # Direct return values.
    self.assertEqual(rewritten, sql)
    self.assertIsInstance(source, beam.Pipeline)

    # State recorded on the chain.
    self.assertIsInstance(sql_chain.current.source, beam.Pipeline)
    self.assertEqual(output_name, sql_chain.current.output_name)
    self.assertEqual(sql, sql_chain.current.query)
def test_build_query_components_when_unbounded_pcolls_queried(self):
    # Queries a PCollection produced by ReadFromPubSub. The streaming-cache
    # lookup is patched to hand back the same name -> PCollection mapping, and
    # the returned SQL source must then be that very PCollection.
    p = beam.Pipeline()
    pcoll = p | beam.io.ReadFromPubSub(
        subscription='projects/fake-project/subscriptions/fake_sub')
    # ib.watch receives locals(), so the variable names above are part of
    # what is being watched.
    ib.watch(locals())

    sql = 'SELECT * FROM pcoll'
    pcolls_by_name = {'pcoll': pcoll}
    streaming_cache_target = (
        'apache_beam.runners.interactive.sql.beam_sql_magics.'
        'pcolls_from_streaming_cache')

    with patch(streaming_cache_target, new=lambda a, b, c: pcolls_by_name):
        _, source = _build_query_components(sql, pcolls_by_name)
        self.assertIs(source, pcoll)
def test_build_query_components_when_single_pcoll_queried(self):
    # When exactly one PCollection is referenced, its name in the query is
    # expected to be replaced by PCOLLECTION and the returned SQL source is
    # expected to be a PCollection.
    # NOTE(review): another test with this exact name is defined later in this
    # file; if both live in the same class, the later one replaces this one.
    p = beam.Pipeline()
    target = p | beam.Create([1, 2, 3])
    # ib.watch receives locals(), so the variable names above are part of
    # what is being watched.
    ib.watch(locals())

    sql = 'SELECT * FROM target where a=1'
    pcolls_by_name = {'target': target}
    file_cache_target = (
        'apache_beam.runners.interactive.sql.beam_sql_magics.'
        'pcoll_from_file_cache')

    with patch(file_cache_target, new=lambda a, b, c, d: target):
        rewritten, source = _build_query_components(sql, pcolls_by_name)
        self.assertEqual(rewritten, 'SELECT * FROM PCOLLECTION where a=1')
        self.assertIsInstance(source, beam.PCollection)
def test_build_query_components_when_multiple_pcolls_queried(self):
    # When several PCollections are referenced, the query is expected to stay
    # untouched and the returned SQL source is expected to be a dict holding
    # every referenced name.
    # NOTE(review): another test with this exact name is defined later in this
    # file; if both live in the same class, the later one replaces this one.
    p = beam.Pipeline()
    pcoll_1 = p | 'Create 1' >> beam.Create([1, 2, 3])
    pcoll_2 = p | 'Create 2' >> beam.Create([4, 5, 6])
    # ib.watch receives locals(), so the variable names above are part of
    # what is being watched.
    ib.watch(locals())

    sql = 'SELECT * FROM pcoll_1 JOIN pcoll_2 USING (a)'
    pcolls_by_name = {'pcoll_1': pcoll_1, 'pcoll_2': pcoll_2}
    file_cache_target = (
        'apache_beam.runners.interactive.sql.beam_sql_magics.'
        'pcoll_from_file_cache')

    with patch(file_cache_target, new=lambda a, b, c, d: pcoll_1):
        rewritten, source = _build_query_components(sql, pcolls_by_name)
        self.assertEqual(rewritten, sql)
        self.assertIsInstance(source, dict)
        self.assertIn('pcoll_1', source)
        self.assertIn('pcoll_2', source)
def test_build_query_components_when_single_pcoll_queried(self):
    # When exactly one PCollection is referenced, its name in the query is
    # expected to be replaced by PCOLLECTION, the returned SQL source is
    # expected to be a PCollection, and chain.current is expected to record
    # the source name, the rewritten query and the requested output name.
    p = beam.Pipeline()
    target = p | beam.Create([1, 2, 3])
    # ib.watch receives locals(), so the variable names above are part of
    # what is being watched.
    ib.watch(locals())

    sql = 'SELECT * FROM target where a=1'
    pcolls_by_name = {'target': target}
    output_name = 'output'
    unreify_target = (
        'apache_beam.runners.interactive.sql.beam_sql_magics.'
        'unreify_from_cache')

    with patch(
            unreify_target,
            new=lambda pipeline, cache_key, cache_manager, element_type: target):
        rewritten, source, sql_chain = _build_query_components(
            sql, pcolls_by_name, output_name)
        expected_query = 'SELECT * FROM PCOLLECTION where a=1'

        # Direct return values.
        self.assertEqual(expected_query, rewritten)
        self.assertIsInstance(source, beam.PCollection)

        # State recorded on the chain.
        self.assertIn('target', sql_chain.current.source)
        self.assertEqual(expected_query, sql_chain.current.query)
        self.assertEqual(output_name, sql_chain.current.output_name)
def test_build_query_components_when_multiple_pcolls_queried(self):
    # When several PCollections are referenced, the query is expected to stay
    # untouched, the returned SQL source is expected to be a dict holding every
    # referenced name, and chain.current is expected to record those names,
    # the query and the requested output name.
    p = beam.Pipeline()
    pcoll_1 = p | 'Create 1' >> beam.Create([1, 2, 3])
    pcoll_2 = p | 'Create 2' >> beam.Create([4, 5, 6])
    # ib.watch receives locals(), so the variable names above are part of
    # what is being watched.
    ib.watch(locals())

    sql = 'SELECT * FROM pcoll_1 JOIN pcoll_2 USING (a)'
    pcolls_by_name = {'pcoll_1': pcoll_1, 'pcoll_2': pcoll_2}
    output_name = 'output'
    unreify_target = (
        'apache_beam.runners.interactive.sql.beam_sql_magics.'
        'unreify_from_cache')

    with patch(
            unreify_target,
            new=lambda pipeline, cache_key, cache_manager, element_type: pcoll_1):
        rewritten, source, sql_chain = _build_query_components(
            sql, pcolls_by_name, output_name)

        # Direct return values.
        self.assertEqual(rewritten, sql)
        self.assertIsInstance(source, dict)
        self.assertIn('pcoll_1', source)
        self.assertIn('pcoll_2', source)

        # State recorded on the chain.
        self.assertIn('pcoll_1', sql_chain.current.source)
        self.assertIn('pcoll_2', sql_chain.current.source)
        self.assertEqual(sql, sql_chain.current.query)
        self.assertEqual(output_name, sql_chain.current.output_name)