def test_custom_io_handler(tmp_directory):
    """SQLUpload reads the source file through a user-supplied io_handler."""
    dag = DAG()
    client = SQLAlchemyClient('sqlite:///database.db')
    dag.clients[SQLUpload] = client
    dag.clients[SQLiteRelation] = client

    expected = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]})
    expected.to_csv('some-file.tsv', sep='\t', index=False)

    def read_tsv(path):
        # custom reader: the file is tab-separated, not comma-separated
        return pd.read_csv(path, sep='\t')

    SQLUpload('some-file.tsv',
              SQLiteRelation(('my-table', 'table')),
              dag=dag,
              name='task',
              io_handler=read_tsv,
              to_sql_kwargs=dict(index=False))

    dag.build()

    loaded = pd.read_sql('SELECT * FROM "my-table"', con=client)
    client.close()

    # the round-trip through the custom reader must preserve the data
    assert loaded.equals(expected)
def test_append_rows(tmp_directory, pg_client_and_schema):
    """Rows from the CSV are appended to a pre-existing table."""
    pg_client, schema = pg_client_and_schema

    data = pd.DataFrame({'a': [1, 2, 3]})
    data.to_csv('data.csv', index=False)

    dag = DAG()
    dag.clients[SQLUpload] = pg_client
    dag.clients[PostgresRelation] = pg_client

    # create table with the initial three rows
    data.to_sql('test_append',
                pg_client.engine,
                schema=schema,
                if_exists='replace',
                index=False)

    SQLUpload('data.csv',
              product=PostgresRelation((schema, 'test_append', 'table')),
              dag=dag,
              name='upload',
              to_sql_kwargs={'if_exists': 'append', 'index': False})

    dag.build()

    result = pd.read_sql(f'SELECT * FROM {schema}.test_append',
                         pg_client.engine)

    # 3 pre-existing rows + 3 appended rows
    assert result.shape[0] == 6
def test_can_upload_file_from_upstream_dependency(tmp_directory,
                                                  pg_client_and_schema):
    """The SQLUpload source can be a placeholder resolved from upstream."""
    pg_client, schema = pg_client_and_schema

    dag = DAG()
    dag.clients[SQLUpload] = pg_client
    dag.clients[PostgresRelation] = pg_client

    task_make = PythonCallable(make_data,
                               product=File('data.parquet'),
                               dag=dag,
                               name='make')

    table = 'test_can_upload_file_from_upstream_dependency'
    task_upload = SQLUpload('{{upstream["make"]}}',
                            product=PostgresRelation(
                                (schema, table, 'table')),
                            dag=dag,
                            name='upload',
                            to_sql_kwargs={'if_exists': 'replace'})

    task_make >> task_upload

    dag.build()
def test_can_upload_a_file_using_a_path(tmp_directory, pg_client_and_schema):
    """The source argument may be a pathlib.Path, not only a str."""
    pg_client, schema = pg_client_and_schema

    pd.DataFrame({'a': [1, 2, 3]}).to_parquet('data.parquet')

    dag = DAG()
    dag.clients[PostgresRelation] = pg_client
    dag.clients[SQLUpload] = pg_client

    SQLUpload(Path('data.parquet'),
              product=PostgresRelation(
                  (schema, 'test_can_upload_a_file', 'table')),
              dag=dag,
              name='upload')

    dag.build()
def test_can_upload_a_file(serializer, task_arg, tmp_directory,
                           pg_client_and_schema):
    """Files written with any supported serializer can be uploaded."""
    pg_client, schema = pg_client_and_schema

    frame = pd.DataFrame({'a': [1, 2, 3]})
    # e.g. frame.to_csv(task_arg) / frame.to_parquet(task_arg),
    # depending on the parametrized serializer
    getattr(frame, serializer)(task_arg)

    dag = DAG()
    dag.clients[SQLUpload] = pg_client
    dag.clients[PostgresRelation] = pg_client

    SQLUpload(task_arg,
              product=PostgresRelation(
                  (schema, 'test_can_upload_a_file', 'table')),
              dag=dag,
              name='upload',
              to_sql_kwargs={'if_exists': 'replace'})

    dag.build()
def test_upload_a_file_with_generic_relation(serializer, task_arg,
                                             sqlite_client_and_tmp_dir,
                                             pg_client_and_schema):
    """GenericSQLRelation products work as SQLUpload targets."""
    generic_client, _ = sqlite_client_and_tmp_dir
    pg_client, schema = pg_client_and_schema

    frame = pd.DataFrame({'a': [1, 2, 3]})
    getattr(frame, serializer)(task_arg)

    dag = DAG()
    dag.clients[SQLUpload] = pg_client
    # NOTE(review): the product uses the sqlite client while the task uses
    # the pg client — presumably intentional for this test; confirm
    dag.clients[GenericSQLRelation] = generic_client

    SQLUpload(task_arg,
              product=GenericSQLRelation(
                  (schema, 'test_can_upload_a_file', 'table')),
              dag=dag,
              name='upload',
              to_sql_kwargs={'if_exists': 'replace'})

    dag.build()
name='red', params={'filename': 'winequality-red.csv'}) white_task = PythonCallable(get_data, product=File(tmp_dir / 'white.parquet'), dag=dag, name='white', params={'filename': 'winequality-white.csv'}) concat_task = PythonCallable(concat_data, product=File(tmp_dir / 'all.parquet'), dag=dag, name='all') upload_task = SQLUpload(tmp_dir / 'all.parquet', product=SQLiteRelation((None, 'data', 'table')), dag=dag, name='upload') ############################################################################### # you can use jinja2 to parametrize SQL, {{upstream}} and {{product}} # are available for your script. this way you could switch products without # changing your source code (e.g. each Data Scientist in your team writes # to his/her own db schema to have isolated runs) sql = """ CREATE TABLE {{product}} AS SELECT *, pH > AVG(pH) AS high_pH FROM {{upstream['upload']}} """
def make_task_upload(env, dag):
    """Create the task that loads the upstream raw file into sqlite."""
    target = SQLiteRelation((None, 'raw', 'table'))
    return SQLUpload('{{upstream["raw"]}}',
                     product=target,
                     dag=dag,
                     name='upload',
                     to_sql_kwargs={'if_exists': 'replace'})
def make(tmp):
    """Build the incremental-load DAG.

    Dumps only rows not yet seen in the source db, transforms them and
    appends the result to the target db.
    """
    tmp = Path(tmp)

    dag = DAG()

    # db with the source data
    source_client = SQLAlchemyClient('sqlite:///' + str(tmp / 'source.db'))

    # db where we'll insert processed data (can be the same as the
    # source db)
    target_client = SQLAlchemyClient('sqlite:///' + str(tmp / 'target.db'))

    dag.clients[SQLDump] = source_client
    dag.clients[SQLUpload] = target_client
    dag.clients[SQLiteRelation] = target_client

    # find the highest id we already processed (None on the first run,
    # when the plus_one table does not exist yet)
    exists = target_client.connection.execute("""
    SELECT name FROM sqlite_master
    WHERE type='table'
    AND name='plus_one'""").fetchone()

    if exists:
        last_id = target_client.connection.execute(
            'SELECT MAX(id) FROM plus_one').fetchone()[0]
    else:
        last_id = None

    # we dump new observations to this file; the hook lets us save info
    # on the latest seen value
    dumped_data = File(tmp / 'x.csv')
    dumped_data.prepare_metadata = add_last_value

    # the actual task that dumps data — only rows newer than last_id
    dump = SQLDump("""
    SELECT * FROM data
    {% if last_id %}
    WHERE id > {{last_id}}
    {% endif %}
    """,
                   dumped_data,
                   dag=dag,
                   name='dump',
                   chunksize=None,
                   params=dict(last_id=last_id))

    # on finish hook, will stop DAG execution if there aren't new
    # observations
    dump.on_finish = dump_on_finish

    # a dummy task to modify the data
    plus_one = PythonCallable(_plus_one,
                              File(tmp / 'plus_one.csv'),
                              dag=dag,
                              name='plus_one')

    # upload the data to the target database, appending observations if
    # the table already exists
    upload = SQLUpload('{{upstream["plus_one"]}}',
                       product=SQLiteRelation((None, 'plus_one', 'table')),
                       dag=dag,
                       name='upload',
                       to_sql_kwargs={'if_exists': 'append', 'index': False})

    dump >> plus_one >> upload

    return dag
df['label'] = wine.target df.to_parquet(str(product)) dag = DAG() dag.clients[SQLUpload] = client dag.clients[SQLiteRelation] = client dag.clients[SQLScript] = client get_data = PythonCallable(_get_data, product=File(tmp_dir / 'wine.parquet'), dag=dag, name='get') upload = SQLUpload('{{upstream["get"]}}', product=SQLiteRelation((None, 'wine', 'table')), dag=dag, name='upload') ############################################################################### # In a real project, your SQL scripts should be separate files, we include # this here to make this example standalone. SQL is a language that people # from a lot of backgrounds understand, you could easily communicate your # analysis with business analysts to make your your data assumptions are # correct _clean = """ /* Cleaning dataset, we decided to ignore rows where magnesium is over 100 since we believe the data is corrupted */ CREATE TABLE {{product}} AS
def make_task_upload(dag, env):
    """Return the task that loads the upstream raw file into sqlite."""
    product = SQLiteRelation((None, 'raw', 'table'))
    return SQLUpload('{{upstream["raw"]}}',
                     product=product,
                     dag=dag,
                     name='upload')