def test_sqlite_product_fetch_metadata_none_if_not_exists(tmp_directory):
    """fetch_metadata() returns None when the metadata table doesn't exist."""
    tmp = Path(tmp_directory)
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    numbers = SQLiteRelation((None, 'numbers', 'table'), client)
    numbers.render({})

    assert numbers.fetch_metadata() is None

    # fix: close the connection explicitly (the original leaked it)
    client.close()
def test_sqlite_product_fetch_metadata_none_if_empty_metadata(tmp_directory):
    """fetch_metadata() returns None when the table exists but has no metadata."""
    tmp = Path(tmp_directory)
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    # create the relation itself, but never save metadata for it
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', client.engine)

    numbers = SQLiteRelation((None, 'numbers', 'table'), client)
    numbers.render({})

    assert numbers.fetch_metadata() is None

    # fix: close the connection explicitly (the original leaked it)
    client.close()
def client_and_prod(request, sqlite_client_and_tmp_dir, pg_client_and_schema):
    """Yield (client, product, schema) for the requested backend, then clean up.

    Pattern based on:
    https://github.com/pytest-dev/pytest/issues/349#issue-88534390
    """
    if request.param == 'sqlite':
        client, _ = sqlite_client_and_tmp_dir
        schema = None
        product = SQLiteRelation((None, 'numbers', 'table'), client)
    else:
        client, schema = pg_client_and_schema
        product = PostgresRelation((schema, 'numbers', 'table'), client)

    yield client, product, schema

    # teardown: drop the relation created during the test
    product.delete()
def test_old_metadata_is_replaced(arg, sqlite_client_and_tmp_dir):
    """Saving metadata twice must leave exactly one row per product."""
    client, tmp = sqlite_client_and_tmp_dir
    schema, name, _ = arg

    product = SQLiteRelation(arg, client=client)
    product.render({})

    def fresh_metadata():
        # a new timestamp on every call, like the original inline dicts
        return {
            'timestamp': datetime.now().timestamp(),
            'stored_source_code': 'some code'
        }

    # the second save must overwrite the first, not add a row
    product.save_metadata(fresh_metadata())
    product.save_metadata(fresh_metadata())

    query = "SELECT COUNT(*) FROM _metadata WHERE name='{}'".format(name)

    if schema is not None:
        query += " AND schema='{}'".format(schema)

    result = list(client.engine.execute(query))[0][0]

    assert result == 1
def test_can_transfer_sqlite(tmp_directory):
    """SQLTransfer copies a table between two sqlite databases."""
    tmp = Path(tmp_directory)

    # connections to the two databases
    client_in = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database_in.db"))
    client_out = SQLAlchemyClient('sqlite:///{}'.format(tmp /
                                                       "database_out.db"))

    # seed the source database
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', client_in.engine, index=False)

    # build and run the transfer task
    dag = DAG()
    SQLTransfer('SELECT * FROM numbers',
                SQLiteRelation((None, 'numbers2', 'table'), client=client_out),
                dag,
                name='transfer',
                client=client_in,
                chunksize=10)
    dag.build()

    # compare the source table against the transferred copy
    original = pd.read_sql_query('SELECT * FROM numbers', client_in.engine)
    transfer = pd.read_sql_query('SELECT * FROM numbers2', client_out.engine)

    client_in.close()
    client_out.close()

    assert original.equals(transfer)
def test_custom_io_handler(tmp_directory):
    """SQLUpload uses a custom io_handler to read the input file."""
    dag = DAG()
    client = SQLAlchemyClient('sqlite:///database.db')
    dag.clients[SQLUpload] = client
    dag.clients[SQLiteRelation] = client

    expected = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]})
    expected.to_csv('some-file.tsv', sep='\t', index=False)

    def my_reading_fn(path):
        # tab-separated reader; the default reader would assume commas
        return pd.read_csv(path, sep='\t')

    SQLUpload('some-file.tsv',
              SQLiteRelation(('my-table', 'table')),
              dag=dag,
              name='task',
              io_handler=my_reading_fn,
              to_sql_kwargs=dict(index=False))

    dag.build()

    stored = pd.read_sql('SELECT * FROM "my-table"', con=client)
    client.close()

    assert stored.equals(expected)
def test_clients_are_closed_after_build(tmp_directory):
    """Every client registered anywhere in the DAG is closed after build()."""
    # TODO: same test but when the dag breaks (make sure clients are closed
    # even on that case)
    tmp = Path(tmp_directory)

    # seed a db with some data
    conn = sqlite3.connect(str(tmp / "database.db"))
    uri = 'sqlite:///{}'.format(tmp / "database.db")
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', conn)
    conn.close()

    dag = DAG()

    def mock_client():
        # wrap a real client so queries still work but close() is recorded
        m = Mock(wraps=SQLAlchemyClient(uri))
        m.split_source = ';'
        return m

    clients = [mock_client() for _ in range(4)]

    # register clients at both the dag level and the task/product level
    dag.clients[SQLScript] = clients[0]
    dag.clients[SQLiteRelation] = clients[1]

    t1 = SQLScript("""
    CREATE TABLE {{product}} AS SELECT * FROM numbers
    """,
                   SQLiteRelation(('another', 'table')),
                   dag=dag,
                   name='t1')
    t2 = SQLScript("""
    CREATE TABLE {{product}} AS SELECT * FROM {{upstream['t1']}}
    """,
                   SQLiteRelation(('yet_another', 'table'), client=clients[2]),
                   dag=dag,
                   name='t2',
                   client=clients[3])
    t1 >> t2

    dag.build()

    assert all(client.close.called for client in clients)
def test_doesnt_warn_if_relations_match(sql, split_source):
    """Rendering emits no warnings when script relations match the product."""
    import warnings

    product = SQLiteRelation((None, 'my_table', 'table'))
    source = SQLScriptSource(sql, split_source=split_source)

    # fix: pytest.warns(None) is deprecated (removed in pytest 8);
    # record warnings manually instead
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        source.render({'product': product})

    assert not record
def test_sqlite_product_save_metadata(tmp_directory):
    """Metadata saved with save_metadata() round-trips via fetch_metadata()."""
    tmp = Path(tmp_directory)
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    numbers = SQLiteRelation((None, 'numbers', 'table'), client)
    numbers.render({})

    numbers.metadata['timestamp'] = datetime.now().timestamp()
    numbers.metadata['stored_source_code'] = 'some code'
    numbers.save_metadata()

    fetched = numbers.fetch_metadata()

    assert fetched == numbers.metadata

    # fix: close the connection explicitly (the original leaked it)
    client.close()
def make(client):
    """Build a one-task DAG that creates table ``data2`` from ``data``."""
    dag = DAG()

    # single client serves both the script and its product
    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    SQLScript('CREATE TABLE {{product}} AS SELECT * FROM data',
              SQLiteRelation(['data2', 'table']),
              dag=dag,
              name='task')

    return dag
def test_warns_if_number_of_relations_does_not_match_number_of_products():
    """One CREATE statement but two declared products must warn."""
    product = [
        SQLiteRelation((None, 'my_table', 'table')),
        SQLiteRelation((None, 'another_table', 'table'))
    ]

    source = SQLScriptSource("""
    CREATE TABLE {{product[0]}} AS
    SELECT * FROM my_table
    GROUP BY some_column
    """)

    with pytest.warns(UserWarning) as record:
        source.render({'product': product})

    expected = ('It appears that your script will create 1 relation(s) '
                'but you declared 2 product(s): '
                "[SQLiteRelation(('my_table', 'table')), "
                "SQLiteRelation(('another_table', 'table'))]")

    assert len(record) == 1
    assert record[0].message.args[0] == expected
def test_warns_if_number_of_relations_does_not_match_products():
    """Task.render warns when the script creates fewer relations than declared."""
    dag = DAG()

    sql = """
    -- wrong sql, products must be used in CREATE statements
    CREATE TABLE {{product[0]}} AS
    SELECT * FROM my_table
    """

    products = [
        SQLiteRelation((None, 'my_table', 'table')),
        SQLiteRelation((None, 'another_table', 'table'))
    ]
    task = SQLScript(sql, products, dag=dag, client=Mock(), name='sql')

    match = r'.*will create 1 relation\(s\) but you declared 2 product\(s\).*'

    with pytest.warns(UserWarning, match=match):
        task.render()
def test_warns_if_sql_scipt_does_not_create_relation():
    """Render warns when a script has no CREATE statement."""
    # NOTE: the "scipt" typo in the name is kept on purpose — renaming
    # would change the collected test id
    dag = DAG()

    task = SQLScript('SELECT * FROM {{product}}',
                     SQLiteRelation((None, 'my_table', 'table')),
                     dag=dag,
                     client=Mock(),
                     name='sql')

    match = 'will not create any tables or views but the task has product'

    with pytest.warns(UserWarning, match=match):
        task.render()
def test_dag_reports_sub_select_cols(sqlite_client_and_tmp_dir):
    """status() and build() reports support selecting a subset of columns."""
    client, _ = sqlite_client_and_tmp_dir

    dag = DAG()
    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    PythonCallable(touch_root, File('some_file.txt'), dag, name='task')
    sql = 'CREATE TABLE {{product}} AS SELECT * FROM data'
    SQLScript(sql, SQLiteRelation(('data2', 'table')), dag, name='task2')

    # both report tables must allow column sub-selection
    assert dag.status()[['name', 'Last run']]
    assert dag.build()[['Ran?', 'Elapsed (s)']]
def test_source_loader_and_task(sqlite_client_and_tmp_dir):
    """SQLTransfer accepts a source obtained from a SourceLoader."""
    client, tmp_dir = sqlite_client_and_tmp_dir
    Path(tmp_dir, 'data_query.sql').write_text('SELECT * FROM data')

    loader = SourceLoader(str(tmp_dir))

    dag = DAG()
    dag.clients[SQLTransfer] = client
    dag.clients[SQLiteRelation] = client

    SQLTransfer(loader['data_query.sql'],
                product=SQLiteRelation((None, 'data2', 'table')),
                dag=dag,
                name='transfer')

    dag.build()
def test_warns_if_no_create_statement_found():
    """A script that only SELECTs must warn that no relation is created."""
    product = SQLiteRelation((None, 'my_table', 'table'))

    source = SQLScriptSource("""
    -- {{product}} without CREATE statement
    SELECT * FROM my_table
    GROUP BY some_column
    """)

    with pytest.warns(UserWarning) as record:
        source.render({'product': product})

    expected = ('It appears that your script will not create any tables/views '
                'but the product parameter is '
                "SQLiteRelation(('my_table', 'table'))")

    assert len(record) == 1
    assert record[0].message.args[0] == expected
def test_sqlite_product_delete(tmp_directory):
    """delete() drops the table so exists() reports False."""
    # fix: removed the stray doctest-style docstring (it would execute under
    # doctest collection and create orphan temp dirs) and close the client
    tmp = Path(tmp_directory)
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    # seed a table to delete
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', client.engine)

    numbers = SQLiteRelation((None, 'numbers', 'table'), client)
    numbers.render({})
    numbers.delete()

    assert not numbers.exists()

    client.close()
def test_sql_script_shows_executed_code_if_fails(tmp_directory, sample_data):
    """On failure, the build error includes the SQL and the db error message."""
    dag = DAG()
    client = SQLAlchemyClient('sqlite:///database.db')
    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    SQLScript('SOME INVALID SQL {{product}}',
              SQLiteRelation((None, 'another', 'table')),
              dag=dag,
              name='task')

    with pytest.raises(DAGBuildError) as excinfo:
        dag.build()

    # the traceback must show both the rendered SQL and sqlite's error
    error = str(excinfo.value)
    assert 'SOME INVALID SQL' in error
    assert 'near "SOME": syntax error' in error
def test_warns_if_inferred_relations_do_not_match_product():
    """Warn when the created relation differs from the declared product."""
    product = SQLiteRelation((None, 'my_table', 'table'))

    source = SQLScriptSource("""
    -- using {{product}} in the wrong place
    CREATE TABLE some_table AS
    SELECT * FROM another_table
    """)

    with pytest.warns(UserWarning) as record:
        source.render({'product': product})

    expected = ('It appears that your script will create relations '
                '{ParsedSQLRelation((\'some_table\', \'table\'))}, '
                'which doesn\'t match '
                'products: {SQLiteRelation((\'my_table\', \'table\'))}. '
                'Make sure schema, '
                'name and kind (table or view) match')

    assert len(record) == 1
    assert record[0].message.args[0] == expected
def test_hot_reload_sql_sources(class_, tmp_directory):
    """Sources with hot_reload pick up file changes on the next render."""
    path = Path(tmp_directory, 'script.sql')
    path.write_text('/*doc*/\n{{product}}')

    product = SQLiteRelation(('some_table', 'table'))
    source = class_(path, hot_reload=True)

    source.render({'product': product})

    assert str(source) == '/*doc*/\nsome_table'
    assert source.variables == {'product'}
    assert source.doc == 'doc'

    # modify the file on disk; the source must reload it when rendered
    path.write_text('/*new doc*/\n{{product}} {{new_tag}}')
    source.render({'product': product, 'new_tag': 'modified'})

    assert str(source) == '/*new doc*/\nsome_table modified'
    assert source.variables == {'product', 'new_tag'}
    assert source.doc == 'new doc'
def make(env, artist_name, language):
    """Build the word-cloud DAG for one artist.

    Downloads the raw lyrics/metadata/tags databases, computes the artist's
    top words with a SQL script and renders a word-cloud PNG.
    """
    # normalized artist name used in table/file names
    artist_name_ = artist_name.lower().replace(' ', '_')

    dag = DAG(executor=Serial(False))
    loader = SourceLoader(path='sql', module='ploomber_lyrics')

    db_location = env.path.data / 'clean.db'
    client = DBAPIClient(sqlite3.connect, {'database': db_location},
                         split_source=True)
    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    # download the three raw databases
    t = DownloadFromURL(env.source.lyrics,
                        File(env.path.data / 'lyrics.db'),
                        dag,
                        name='raw_db')
    t2 = DownloadFromURL(env.source.metadata,
                         File(env.path.data / 'metadata.db'),
                         dag,
                         name='raw_metadata_db')
    t3 = DownloadFromURL(env.source.artist_tags,
                         File(env.path.data / 'artist_tags.db'),
                         dag,
                         name='raw_artist_tags_db')

    # compute the artist's top words into a per-artist table
    top = SQLScript(loader['get_top_words.sql'],
                    SQLiteRelation(('top_words_{{artist_name_}}', 'table')),
                    dag,
                    params={
                        'artist_name': artist_name,
                        'artist_name_': artist_name_
                    })

    # render the word-cloud image from the top-words table
    wc = PythonCallable(tasks.wordcloud,
                        File(env.path.data / (artist_name_ + '.png')),
                        dag,
                        params={
                            'db_location': db_location,
                            'language': language
                        })

    (t + t2 + t3) >> top >> wc

    return dag
def test_ignores_non_file_products(tmp_directory, monkeypatch):
    """Remote metadata is fetched only for File products, not SQL relations."""
    mock_remote = Mock()
    monkeypatch.setattr(file._RemoteFile, '_fetch_remote_metadata',
                        mock_remote)

    client = SQLAlchemyClient('sqlite:///my.db')

    dag = DAG()
    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    SQLScript('CREATE TABLE {{product}} AS SELECT * FROM data',
              SQLiteRelation(['data2', 'table']),
              dag=dag,
              name='task')

    dag.render()
    client.close()

    # a dag whose only product is a SQL relation must not touch remote storage
    mock_remote.assert_not_called()
def test_sqlscript_load(tmp_directory, sample_data, client):
    """SQLScript.load() returns the product's table as a data frame."""
    dag = DAG()
    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    SQLScript('CREATE TABLE {{product}} AS SELECT * FROM numbers',
              SQLiteRelation((None, 'another', 'table')),
              dag=dag,
              name='task')

    # keep clients open so load() can query the db, then close manually
    dag.build(close_clients=False)
    df = dag['task'].load()
    dag.close_clients()

    expected = {
        'a': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        'b': [100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
    }
    assert df.to_dict(orient='list') == expected
def test_can_request_params(sqlite_client_and_tmp_dir):
    """An on_finish hook can request the task, product and client params."""
    # reset attributes the hook sets, in case a previous test ran it
    on_finish_sql.task = None
    on_finish_sql.product = None
    on_finish_sql.client = None

    client, _ = sqlite_client_and_tmp_dir

    dag = DAG()
    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    task = SQLScript('CREATE TABLE {{product}} AS SELECT * FROM data',
                     SQLiteRelation(('new_table', 'table')),
                     dag,
                     name='t')
    task.on_finish = on_finish_sql

    dag.build()

    # the hook must have received the exact objects, not copies
    assert on_finish_sql.task is task
    assert on_finish_sql.product is task.product
    assert on_finish_sql.client is client
def test_warns_if_sql_script_does_not_create_relation(
        sqlite_client_and_tmp_dir):
    """Task-level render warns when the script has no CREATE statement."""
    client, _ = sqlite_client_and_tmp_dir

    dag = DAG()
    dag.clients[SQLiteRelation] = client

    # the task client only needs split_source for rendering
    mock_client = Mock()
    mock_client.split_source = ';'

    task = SQLScript('SELECT * FROM {{product}}',
                     SQLiteRelation((None, 'my_table', 'table')),
                     dag=dag,
                     client=mock_client,
                     name='sql')

    with pytest.warns(UserWarning) as record:
        task.render()

    expected = ('It appears that your script will not create any tables/views but '
                "the product parameter is "
                "SQLiteRelation(('my_table', 'table'))")

    assert len(record) == 1
    assert record[0].message.args[0] == expected
name='red', params={'filename': 'winequality-red.csv'}) white_task = PythonCallable(get_data, product=File(tmp_dir / 'white.parquet'), dag=dag, name='white', params={'filename': 'winequality-white.csv'}) concat_task = PythonCallable(concat_data, product=File(tmp_dir / 'all.parquet'), dag=dag, name='all') upload_task = SQLUpload(tmp_dir / 'all.parquet', product=SQLiteRelation((None, 'data', 'table')), dag=dag, name='upload') ############################################################################### # you can use jinja2 to parametrize SQL, {{upstream}} and {{product}} # are available for your script. this way you could switch products without # changing your source code (e.g. each Data Scientist in your team writes # to his/her own db schema to have isolated runs) sql = """ CREATE TABLE {{product}} AS SELECT *, pH > AVG(pH) AS high_pH FROM {{upstream['upload']}} """
    # accessing .client with no client registered anywhere must raise
    with pytest.raises(MissingClientError):
        prod.client


@pytest.mark.parametrize(
    'task_class, task_arg, product',
    [[SQLDump, 'SELECT * FROM my_table', File('data.csv')],
     [
         SQLScript, 'CREATE TABLE {{product}} AS SELECT * FROM my_table',
         SQLRelation(['schema', 'name', 'table'])
     ],
     [
         SQLTransfer, 'SELECT * FROM my_table',
         SQLiteRelation(['schema', 'name', 'table'])
     ],
     [
         SQLUpload, 'SELECT * FROM my_table',
         SQLiteRelation(['schema', 'name', 'table'])
     ],
     [
         PostgresCopyFrom, 'SELECT * FROM my_table',
         PostgresRelation(['schema', 'name', 'table'])
     ]])
def test_exception_if_missing_task_client(task_class, task_arg, product):
    # a task with no client at the task or dag level must raise
    # MissingClientError when .client is accessed
    task = task_class(task_arg, product, dag=DAG(), name='task')

    with pytest.raises(MissingClientError):
        task.client
def make_task_upload(env, dag):
    """Create the upload task: load the raw file into the ``raw`` table."""
    product = SQLiteRelation((None, 'raw', 'table'))
    return SQLUpload('{{upstream["raw"]}}',
                     product=product,
                     dag=dag,
                     name='upload',
                     to_sql_kwargs={'if_exists': 'replace'})
def make(tmp):
    """Make the dag.

    Incremental-load pipeline: dump only rows newer than the last id
    already present in the target db, transform them, then append them.
    """
    tmp = Path(tmp)
    dag = DAG()

    # db with the source data
    client_source = SQLAlchemyClient('sqlite:///' + str(tmp / 'source.db'))
    # db where we'll insert processed data (can be the same as the
    # source db)
    client_target = SQLAlchemyClient('sqlite:///' + str(tmp / 'target.db'))

    dag.clients[SQLDump] = client_source
    dag.clients[SQLUpload] = client_target
    dag.clients[SQLiteRelation] = client_target

    # find the highest id already loaded (None on the very first run)
    cur = client_target.connection.execute("""
    SELECT name FROM sqlite_master WHERE type='table' AND name='plus_one'""")

    if cur.fetchone():
        cur = client_target.connection.execute('SELECT MAX(id) FROM plus_one')
        last_id = cur.fetchone()[0]
    else:
        last_id = None

    # we dump new observations to this file
    dumped_data = File(tmp / 'x.csv')
    # we add a hook that allows us to save info on the latest seen value
    dumped_data.prepare_metadata = add_last_value

    # the actual task that dumps data; the WHERE clause only appears on
    # incremental runs (when last_id is not None)
    dump = SQLDump("""
    SELECT * FROM data
    {% if last_id %}
    WHERE id > {{last_id}}
    {% endif %}
    """,
                   dumped_data,
                   dag=dag,
                   name='dump',
                   chunksize=None,
                   params=dict(last_id=last_id))

    # on finish hook, will stop DAG execution if there aren't new observations
    dump.on_finish = dump_on_finish

    # a dummy task to modify the data
    plus_one = PythonCallable(_plus_one,
                              File(tmp / 'plus_one.csv'),
                              dag=dag,
                              name='plus_one')

    # upload the data to the target database
    upload = SQLUpload(
        '{{upstream["plus_one"]}}',
        product=SQLiteRelation((None, 'plus_one', 'table')),
        dag=dag,
        name='upload',
        # append observations if the table already exists
        to_sql_kwargs={
            'if_exists': 'append',
            'index': False
        })

    dump >> plus_one >> upload

    return dag
SELECT * FROM {{upstream["transfer"]}} WHERE x > 1 """)) ############################################################################### # DAG declaration dag = DAG(executor=Serial(build_in_subprocess=False)) dag.clients[SQLTransfer] = client dag.clients[SQLiteRelation] = client dag.clients[SQLScript] = client source_loader = SourceLoader(tmp_dir) transfer = SQLTransfer(source_loader['data_select.sql'], product=SQLiteRelation((None, 'data2', 'table')), dag=dag, name='transfer') subset = SQLScript(source_loader['subset_create.sql'], product=SQLiteRelation((None, 'subset', 'table')), dag=dag, name='subset') transfer >> subset dag.render() ############################################################################### # Our macro is correctly rendered: