def make():
    """Build a five-task DAG where t2 references an undefined placeholder
    ({{unknown}}) so its render fails; t2 >> t3 >> t4 are chained.
    """
    # one mocked client shared by every task
    client = Mock()
    dag = DAG()

    SQLDump('SELECT * FROM my_table', File('ok.txt'), dag, name='t1',
            client=client)
    t2 = SQLDump('SELECT * FROM my_table', File('{{unknown}}'), dag,
                 name='t2', client=client)
    t3 = SQLDump('SELECT * FROM another', File('another_file.txt'), dag,
                 name='t3', client=client)
    t4 = SQLDump('SELECT * FROM something', File('yet_another'), dag,
                 name='t4', client=client)
    SQLDump('SELECT * FROM my_table_2', File('ok_2'), dag, name='t5',
            client=client)

    t2 >> t3 >> t4

    return dag
def test_unsupported_extension():
    """load() raises NotImplementedError for a product extension it
    does not support (.json here).
    """
    task = SQLDump('SELECT * FROM table', File('my_file.json'), DAG(),
                   name='task', client=Mock())

    with pytest.raises(NotImplementedError):
        task.load()
def test_sqldump(product, kwargs, tmp_directory):
    """task.load(**kwargs) reads back the CSV written to disk.

    product and kwargs are parametrized fixtures.
    """
    expected = pd.DataFrame({'a': [1, 2, 3]})
    expected.to_csv('my_file.csv', index=False)

    task = SQLDump('SELECT * FROM table', product, DAG(), name='task',
                   client=Mock())

    assert expected.equals(task.load(**kwargs))
def test_sqldump_does_not_required_product_tag(tmp_directory):
    """Templated SQL renders fine without a {{product}} tag in the query."""
    tmp = Path(tmp_directory)

    # create a db
    conn = connect(str(tmp / "database.db"))
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    # dump output path
    out = tmp / 'dump'

    # seed the db with sample data
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', conn)

    dag = DAG()

    # pass template SQL code so it's treated as a placeholder, this will
    # force the render step
    SQLDump('SELECT * FROM numbers LIMIT {{limit}}',
            File(out),
            dag,
            name='dump.csv',
            client=client,
            chunksize=None,
            io_handler=io.CSVIO,
            params={'limit': 10})

    dag.render()
def test_can_dump_sqlite_to_parquet(tmp_directory):
    """SQLDump with ParquetIO (chunked) writes data identical to the
    source table.

    Fix: removed a dead cursor (`cur = conn.cursor(); cur.execute(...)`)
    that fetched the whole table without ever using the result.
    """
    tmp = Path(tmp_directory)

    # create a db
    conn = connect(str(tmp / "database.db"))
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    # dump output path
    out = tmp / 'dump'

    # make some data and save it in the db
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', conn)

    # create the task and run it
    dag = DAG()
    SQLDump('SELECT * FROM numbers',
            File(out),
            dag,
            name='dump',
            client=client,
            chunksize=10,
            io_handler=io.ParquetIO)
    dag.build()

    # load dumped data and data from the db
    dump = pd.read_parquet(out)
    db = pd.read_sql_query('SELECT * FROM numbers', conn)
    conn.close()

    # make sure they are the same
    assert dump.equals(db)
def test_sqldump_with_dbapiclient(tmp_directory):
    """SQLDump also works through a DBAPIClient (no SQLAlchemy engine)."""
    client = DBAPIClient(connect, dict(database='my_db.db'))

    # seed the db through a raw connection
    con_raw = connect(database='my_db.db')
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', con_raw)

    # create the task and run it
    dag = DAG()
    SQLDump('SELECT * FROM numbers',
            File('dump.csv'),
            dag,
            name='dump',
            client=client,
            chunksize=None,
            io_handler=io.CSVIO)
    dag.build()

    # compare the dumped file against the db contents
    dump = pd.read_csv('dump.csv')
    db = pd.read_sql_query('SELECT * FROM numbers', con_raw)

    client.close()
    con_raw.close()

    assert dump.equals(db)
def test_can_dump_postgres(tmp_directory, pg_client):
    """SQLDump against a Postgres client dumps the full table to parquet."""
    tmp = Path(tmp_directory)

    # dump output path
    out = tmp / 'dump'

    # make some data and save it in the db
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', pg_client.engine, if_exists='replace')

    # create the task and run it
    dag = DAG()
    SQLDump('SELECT * FROM numbers',
            File(out),
            dag,
            name='dump',
            client=pg_client,
            chunksize=10,
            io_handler=io.ParquetIO)
    dag.build()

    # load dumped data and data from the db
    dump = pd.read_parquet(out)
    db = pd.read_sql_query('SELECT * FROM numbers', pg_client.engine)

    # make sure they are the same
    assert dump.equals(db)
def test_warn_on_sql_missing_docstrings():
    """diagnose() warns when the SQL source has no docstring comment.

    NOTE(review): a later test with this same name exists in SOURCE; if
    both live in the same module, only the last definition is collected —
    confirm they belong to different files.
    """
    dag = DAG()
    sql = 'SELECT * FROM table'
    SQLDump(sql, File('file1.txt'), dag, client=Mock(), name='sql')

    with pytest.warns(UserWarning):
        dag.diagnose()
def test_tracebacks_are_shown_for_all_on_render_failing_tasks():
    """DAGRenderError aggregates the traceback of every failing task."""
    dag = DAG()
    client = Mock()
    SQLDump('SELECT * FROM {{one_table}}', File('one_table'), dag,
            name='t1', client=client)
    SQLDump('SELECT * FROM {{another_table}}', File('another_table'), dag,
            name='t2', client=client)

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    message = str(excinfo.value)
    assert "SQLDump: t2 -> File('another_table')" in message
    assert "SQLDump: t1 -> File('one_table')" in message
def test_dump_io_handler(product_arg, expected_io_handler):
    """The io_handler attribute matches the parametrized expectation for
    the given product argument.
    """
    dag = DAG()
    task = SQLDump('SELECT * FROM some_table', File(product_arg), dag,
                   name='dump', client=Mock())

    assert task.io_handler is expected_io_handler
def test_does_not_warn_on_sql_docstrings():
    """diagnose() emits no warning when the SQL carries a docstring comment.

    Fix: ``pytest.warns(None)`` was deprecated in pytest 7 and removed in
    pytest 8; record warnings explicitly with the stdlib instead.
    """
    import warnings

    dag = DAG()
    sql = '/* get data from table */\nSELECT * FROM table'
    SQLDump(sql, File('file1.txt'), dag, client=Mock(), name='sql')

    with warnings.catch_warnings(record=True) as captured:
        # make every warning observable so none slips through filters
        warnings.simplefilter("always")
        dag.diagnose()

    assert not captured
def test_warn_on_sql_missing_docstrings():
    """DAGQualityChecker warns when the SQL source has no docstring.

    NOTE(review): an earlier test in SOURCE shares this name; if both are
    in one module only this definition is collected — confirm they come
    from different files.
    """
    dag = DAG()
    sql = 'SELECT * FROM table'
    SQLDump(sql, File('file1.txt'), dag, client=Mock(), name='sql')

    checker = DAGQualityChecker()

    with pytest.warns(UserWarning):
        checker(dag)
def make_task(date_start, date_end, path, dag):
    """Task factory: return a SQLDump task that dumps rows whose date
    falls in [date_start, date_end).
    """
    sql = """
    SELECT * FROM data
    WHERE DATE('{{date_start}}') <= date
    AND date < DATE('{{date_end}}')
    """
    name = f'{date_start}-to-{date_end}.csv'

    return SQLDump(sql,
                   product=File(Path(path / name)),
                   dag=dag,
                   name=f'dump_{name}',
                   params={'date_start': date_start,
                           'date_end': date_end},
                   chunksize=None)
def test_sql_dump_shows_executed_code_if_fails(tmp_directory):
    """The build error message contains the SQL that was executed."""
    tmp = Path(tmp_directory)
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    dag = DAG()
    SQLDump('SOME INVALID SQL', File('data.parquet'), dag, name='data',
            client=client)

    with pytest.raises(DAGBuildError) as excinfo:
        dag.build()

    message = str(excinfo.value)
    assert 'SOME INVALID SQL' in message
    assert 'near "SOME": syntax error' in message
def make(tmp):
    """Build an incremental-load DAG: dump new rows from a source db,
    transform them, and append to a target db.
    """
    tmp = Path(tmp)
    dag = DAG()

    # db with the source data
    client_source = SQLAlchemyClient('sqlite:///' + str(tmp / 'source.db'))
    # db where we'll insert processed data (can be the same as the
    # source db)
    client_target = SQLAlchemyClient('sqlite:///' + str(tmp / 'target.db'))

    dag.clients[SQLDump] = client_source
    dag.clients[SQLUpload] = client_target
    dag.clients[SQLiteRelation] = client_target

    # find the highest id already processed (None on the first run,
    # when the plus_one table does not exist yet)
    cur = client_target.connection.execute("""
    SELECT name FROM sqlite_master WHERE type='table' AND name='plus_one'""")

    if cur.fetchone():
        cur = client_target.connection.execute('SELECT MAX(id) FROM plus_one')
        last_id = cur.fetchone()[0]
    else:
        last_id = None

    # we dump new observations to this file
    dumped_data = File(tmp / 'x.csv')
    # we add a hook that allows us to save info on the latest seen value
    dumped_data.prepare_metadata = add_last_value

    # the actual task that dumps data; only rows newer than last_id when
    # we have one
    dump = SQLDump("""
    SELECT * FROM data
    {% if last_id %}
    WHERE id > {{last_id}}
    {% endif %}
    """,
                   dumped_data,
                   dag=dag,
                   name='dump',
                   chunksize=None,
                   params=dict(last_id=last_id))

    # on finish hook, will stop DAG execution if there aren't new
    # observations
    dump.on_finish = dump_on_finish

    # a dummy task to modify the data
    plus_one = PythonCallable(_plus_one,
                              File(tmp / 'plus_one.csv'),
                              dag=dag,
                              name='plus_one')

    # upload the data to the target database
    upload = SQLUpload(
        '{{upstream["plus_one"]}}',
        product=SQLiteRelation((None, 'plus_one', 'table')),
        dag=dag,
        name='upload',
        # append observations if the table already exists
        to_sql_kwargs={
            'if_exists': 'append',
            'index': False
        })

    dump >> plus_one >> upload

    return dag
def test_task_raises_error_if_name_cannot_be_infered():
    """Omitting the name argument raises AttributeError when it cannot
    be inferred from the product.
    """
    dag = DAG()

    with pytest.raises(AttributeError):
        SQLDump('SELECT * FROM my_table', File('/path/to/data'), dag)
def test_sql_runtime_params(tmp_directory):
    """Runtime params ([[...]]) resolve against upstream products when the
    task builds, filtering rows selected by a middle Python task.
    """
    tmp = Path(tmp_directory)

    # create a db
    conn = connect(str(tmp / "database.db"))
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    # make some data and save it in the db
    df = pd.DataFrame({
        'number': range(10),
        'char': list(string.ascii_letters[:10])
    })
    df.to_sql('data', conn)

    # serial executor so everything runs in-process
    dag = DAG(executor=Serial(build_in_subprocess=False))

    t1 = SQLDump('SELECT * FROM data',
                 File('data.parquet'),
                 dag,
                 name='data',
                 client=client,
                 chunksize=None,
                 io_handler=io.ParquetIO)

    def select(product, upstream):
        # pick even numbers and even-indexed chars from the dump, then
        # persist them as JSON for the downstream SQL tasks to read
        numbers = list(pd.read_parquet(upstream['data']).number)
        numbers_selected = [n for n in numbers if n % 2 == 0]
        chars = list(pd.read_parquet(upstream['data']).char)
        chars_selected = [repr(c) for i, c in enumerate(chars) if i % 2 == 0]
        Path(product).write_text(
            json.dumps(dict(numbers=numbers_selected, chars=chars_selected)))

    t2 = PythonCallable(select, File('selected.json'), dag, name='selected')

    t3 = SQLDump(
        """
    SELECT * FROM data
    WHERE number NOT IN ([[get_key(upstream["selected"], "numbers") | join(", ") ]])
""",
        File('numbers.parquet'),
        dag,
        name='numbers',
        client=client,
        chunksize=None,
        io_handler=io.ParquetIO)

    t4 = SQLDump(
        """
    SELECT * FROM data
    WHERE char NOT IN ([[get_key(upstream["selected"], "chars") | join(", ") ]])
""",
        File('chars.parquet'),
        dag,
        name='chars',
        client=client,
        chunksize=None,
        io_handler=io.ParquetIO)

    t1 >> t2 >> t3
    t2 >> t4

    dag.build()

    assert list(pd.read_parquet('numbers.parquet').number) == [1, 3, 5, 7, 9]
    assert list(
        pd.read_parquet('chars.parquet').char) == ['b', 'd', 'f', 'h', 'j']
df.to_sql('example', engine) ############################################################################### # There are three core concepts in ``ploomber``: :class:`Tasks <ploomber.tasks>`, # :class:`Products <ploomber.products>` and :class:`DAGs <ploomber.DAG>`. Tasks # are units of work that generate Products (which are persistent changes on # disk). Tasks are organized into a DAG which keeps track of declared # dependencies among them. dag = DAG() # the first task dumps data from the db to the local filesystem task_dump = SQLDump('SELECT * FROM example', File(tmp_dir / 'example.csv'), dag, name='dump', client=SQLAlchemyClient(uri), chunksize=None) # since this task will have an upstream dependency, it has to accept the # upstream parameter, all tasks must accept a product parameter def _add_one(upstream, product): """Add one to column a """ df = pd.read_csv(str(upstream['dump'])) df['a'] = df['a'] + 1 df.to_csv(str(product), index=False) # we convert the Python function into a Task task_add_one = PythonCallable(_add_one,