Example #1
def make():
    mock_client = Mock()

    dag = DAG()
    SQLDump('SELECT * FROM my_table',
            File('ok.txt'),
            dag,
            name='t1',
            client=mock_client)
    t2 = SQLDump('SELECT * FROM my_table',
                 File('{{unknown}}'),
                 dag,
                 name='t2',
                 client=mock_client)
    t3 = SQLDump('SELECT * FROM another',
                 File('another_file.txt'),
                 dag,
                 name='t3',
                 client=mock_client)
    t4 = SQLDump('SELECT * FROM something',
                 File('yet_another'),
                 dag,
                 name='t4',
                 client=mock_client)
    SQLDump('SELECT * FROM my_table_2',
            File('ok_2'),
            dag,
            name='t5',
            client=mock_client)
    t2 >> t3 >> t4
    return dag
Example #2
def test_unsupported_extension():
    task = SQLDump('SELECT * FROM table',
                   File('my_file.json'),
                   DAG(),
                   name='task',
                   client=Mock())

    with pytest.raises(NotImplementedError):
        task.load()
Example #3
def test_sqldump(product, kwargs, tmp_directory):
    df = pd.DataFrame({'a': [1, 2, 3]})
    df.to_csv('my_file.csv', index=False)
    task = SQLDump('SELECT * FROM table',
                   product,
                   DAG(),
                   name='task',
                   client=Mock())

    loaded = task.load(**kwargs)
    assert df.equals(loaded)
Example #4
def test_sqldump_does_not_required_product_tag(tmp_directory):
    tmp = Path(tmp_directory)

    # create a db
    conn = connect(str(tmp / "database.db"))
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))
    # dump output path
    out = tmp / 'dump'

    # make some data and save it in the db
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', conn)

    # create the task and run it
    dag = DAG()

    # pass templated SQL so it's treated as a placeholder; this forces
    # the render step
    SQLDump('SELECT * FROM numbers LIMIT {{limit}}',
            File(out),
            dag,
            name='dump.csv',
            client=client,
            chunksize=None,
            io_handler=io.CSVIO,
            params={'limit': 10})

    dag.render()
Example #5
def test_can_dump_sqlite_to_parquet(tmp_directory):
    tmp = Path(tmp_directory)

    # create a db
    conn = connect(str(tmp / "database.db"))
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))
    # dump output path
    out = tmp / 'dump'

    # make some data and save it in the db
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', conn)

    cur = conn.cursor()
    cur.execute('select * from numbers')

    # create the task and run it
    dag = DAG()
    SQLDump('SELECT * FROM numbers',
            File(out),
            dag,
            name='dump',
            client=client,
            chunksize=10,
            io_handler=io.ParquetIO)
    dag.build()

    # load dumped data and data from the db
    dump = pd.read_parquet(out)
    db = pd.read_sql_query('SELECT * FROM numbers', conn)

    conn.close()

    # make sure they are the same
    assert dump.equals(db)
Example #6
def test_sqldump_with_dbapiclient(tmp_directory):
    client = DBAPIClient(connect, dict(database='my_db.db'))

    # make some data and save it in the db
    con_raw = connect(database='my_db.db')
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', con_raw)

    # create the task and run it
    dag = DAG()
    SQLDump('SELECT * FROM numbers',
            File('dump.csv'),
            dag,
            name='dump',
            client=client,
            chunksize=None,
            io_handler=io.CSVIO)

    dag.build()

    # load dumped data and data from the db
    dump = pd.read_csv('dump.csv')
    db = pd.read_sql_query('SELECT * FROM numbers', con_raw)

    client.close()
    con_raw.close()

    assert dump.equals(db)
Example #7
def test_can_dump_postgres(tmp_directory, pg_client):
    tmp = Path(tmp_directory)

    # dump output path
    out = tmp / 'dump'

    # make some data and save it in the db
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', pg_client.engine, if_exists='replace')

    # create the task and run it
    dag = DAG()
    SQLDump('SELECT * FROM numbers',
            File(out),
            dag,
            name='dump',
            client=pg_client,
            chunksize=10,
            io_handler=io.ParquetIO)
    dag.build()

    # load dumped data and data from the db
    dump = pd.read_parquet(out)
    db = pd.read_sql_query('SELECT * FROM numbers', pg_client.engine)

    # make sure they are the same
    assert dump.equals(db)
Example #8
def test_warn_on_sql_missing_docstrings():
    dag = DAG()

    sql = 'SELECT * FROM table'
    SQLDump(sql, File('file1.txt'), dag, client=Mock(), name='sql')

    with pytest.warns(UserWarning):
        dag.diagnose()
Example #9
def test_tracebacks_are_shown_for_all_on_render_failing_tasks():
    dag = DAG()
    mock_client = Mock()
    SQLDump('SELECT * FROM {{one_table}}',
            File('one_table'),
            dag,
            name='t1',
            client=mock_client)
    SQLDump('SELECT * FROM {{another_table}}',
            File('another_table'),
            dag,
            name='t2',
            client=mock_client)

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    assert "SQLDump: t2 -> File('another_table')" in str(excinfo.value)
    assert "SQLDump: t1 -> File('one_table')" in str(excinfo.value)
Example #10
def test_dump_io_handler(product_arg, expected_io_handler):

    dag = DAG()
    t = SQLDump('SELECT * FROM some_table',
                File(product_arg),
                dag,
                name='dump',
                client=Mock())

    assert expected_io_handler is t.io_handler
Example #11
def test_does_not_warn_on_sql_docstrings():
    dag = DAG()

    sql = '/* get data from table */\nSELECT * FROM table'
    SQLDump(sql, File('file1.txt'), dag, client=Mock(), name='sql')

    with pytest.warns(None) as warn:
        dag.diagnose()

    assert not warn
Example #12
def test_warn_on_sql_missing_docstrings():
    dag = DAG()

    sql = 'SELECT * FROM table'
    SQLDump(sql, File('file1.txt'), dag, client=Mock(), name='sql')

    qc = DAGQualityChecker()

    with pytest.warns(UserWarning):
        qc(dag)
Example #13
def make_task(date_start, date_end, path, dag):
    """Task factory: returns a task that dumps certain year
    """
    sql = """
    SELECT * FROM data
    WHERE DATE('{{date_start}}') <= date AND date < DATE('{{date_end}}')
    """
    name = f'{date_start}-to-{date_end}.csv'
    return SQLDump(sql,
                   product=File(Path(path / name)),
                   dag=dag,
                   name=f'dump_{name}',
                   params={
                       'date_start': date_start,
                       'date_end': date_end
                   },
                   chunksize=None)
Example #14
def test_sql_dump_shows_executed_code_if_fails(tmp_directory):
    tmp = Path(tmp_directory)

    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    dag = DAG()

    SQLDump('SOME INVALID SQL',
            File('data.parquet'),
            dag,
            name='data',
            client=client)

    with pytest.raises(DAGBuildError) as excinfo:
        dag.build()

    assert 'SOME INVALID SQL' in str(excinfo.value)
    assert 'near "SOME": syntax error' in str(excinfo.value)
Example #15
def make(tmp):
    """Make the dag
    """
    tmp = Path(tmp)
    dag = DAG()

    # db with the source data
    client_source = SQLAlchemyClient('sqlite:///' + str(tmp / 'source.db'))
    # db where we'll insert processed data (can be the same as the
    # source db)
    client_target = SQLAlchemyClient('sqlite:///' + str(tmp / 'target.db'))

    dag.clients[SQLDump] = client_source
    dag.clients[SQLUpload] = client_target
    dag.clients[SQLiteRelation] = client_target

    cur = client_target.connection.execute("""
    SELECT name FROM sqlite_master WHERE type='table' AND name='plus_one'""")

    if cur.fetchone():
        cur = client_target.connection.execute('SELECT MAX(id) FROM plus_one')
        last_id = cur.fetchone()[0]
    else:
        last_id = None

    # we dump new observations to this file
    dumped_data = File(tmp / 'x.csv')
    # we add a hook that allows us to save info on the latest seen value
    dumped_data.prepare_metadata = add_last_value

    # the actual task that dumps data
    dump = SQLDump("""
        SELECT * FROM data
        {% if last_id %}
        WHERE id > {{last_id}}
        {% endif %}
    """,
                   dumped_data,
                   dag=dag,
                   name='dump',
                   chunksize=None,
                   params=dict(last_id=last_id))

    # on_finish hook: stops DAG execution if there are no new observations
    dump.on_finish = dump_on_finish

    # a dummy task to modify the data
    plus_one = PythonCallable(_plus_one,
                              File(tmp / 'plus_one.csv'),
                              dag=dag,
                              name='plus_one')

    # upload the data to the target database
    upload = SQLUpload(
        '{{upstream["plus_one"]}}',
        product=SQLiteRelation((None, 'plus_one', 'table')),
        dag=dag,
        name='upload',
        # append observations if the table already exists
        to_sql_kwargs={
            'if_exists': 'append',
            'index': False
        })

    dump >> plus_one >> upload

    return dag
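
The make() function above references two helpers that are not shown in the snippet: add_last_value (attached as prepare_metadata on the dumped File) and dump_on_finish (attached as the task's on_finish hook). Below is a minimal sketch of what they might look like, based only on the comments in the code; the hook bodies, the metadata key, and the use of DAGBuildEarlyStop are assumptions for illustration, not the original implementation.

import pandas as pd
from ploomber.exceptions import DAGBuildEarlyStop


def add_last_value(metadata, product):
    # hypothetical metadata hook: record the highest id seen in the CSV dump
    # so the next run can resume from it
    df = pd.read_csv(str(product))
    metadata['last_value'] = int(df['id'].max()) if len(df) else None
    return metadata


def dump_on_finish(product):
    # hypothetical on_finish hook: stop the DAG early if the dump produced
    # no new observations
    df = pd.read_csv(str(product))
    if df.empty:
        raise DAGBuildEarlyStop('No new observations')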
Example #16
def test_task_raises_error_if_name_cannot_be_infered():
    dag = DAG()

    with pytest.raises(AttributeError):
        SQLDump('SELECT * FROM my_table', File('/path/to/data'), dag)
Example #17
def test_sql_runtime_params(tmp_directory):
    tmp = Path(tmp_directory)

    # create a db
    conn = connect(str(tmp / "database.db"))
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    # make some data and save it in the db
    df = pd.DataFrame({
        'number': range(10),
        'char': list(string.ascii_letters[:10])
    })
    df.to_sql('data', conn)

    # create the task and run it
    dag = DAG(executor=Serial(build_in_subprocess=False))

    t1 = SQLDump('SELECT * FROM data',
                 File('data.parquet'),
                 dag,
                 name='data',
                 client=client,
                 chunksize=None,
                 io_handler=io.ParquetIO)

    def select(product, upstream):
        numbers = list(pd.read_parquet(upstream['data']).number)
        numbers_selected = [n for n in numbers if n % 2 == 0]

        chars = list(pd.read_parquet(upstream['data']).char)
        chars_selected = [repr(c) for i, c in enumerate(chars) if i % 2 == 0]

        Path(product).write_text(
            json.dumps(dict(numbers=numbers_selected, chars=chars_selected)))

    t2 = PythonCallable(select, File('selected.json'), dag, name='selected')

    t3 = SQLDump("""
    SELECT * FROM data WHERE number
    NOT IN ([[get_key(upstream["selected"], "numbers") | join(", ") ]])
""",
                 File('numbers.parquet'),
                 dag,
                 name='numbers',
                 client=client,
                 chunksize=None,
                 io_handler=io.ParquetIO)

    t4 = SQLDump("""
    SELECT * FROM data WHERE char
    NOT IN ([[get_key(upstream["selected"], "chars") | join(", ") ]])
""",
                 File('chars.parquet'),
                 dag,
                 name='chars',
                 client=client,
                 chunksize=None,
                 io_handler=io.ParquetIO)

    t1 >> t2 >> t3
    t2 >> t4

    dag.build()

    assert list(pd.read_parquet('numbers.parquet').number) == [1, 3, 5, 7, 9]
    assert list(
        pd.read_parquet('chars.parquet').char) == ['b', 'd', 'f', 'h', 'j']
Example #18
df.to_sql('example', engine)


###############################################################################
# There are three core concepts in ``ploomber``: :class:`Tasks <ploomber.tasks>`,
# :class:`Products <ploomber.products>` and :class:`DAGs <ploomber.DAG>`. Tasks
# are units of work that generate Products (which are persistent changes on
# disk). Tasks are organized into a DAG which keeps track of declared
# dependencies among them.

dag = DAG()

# the first task dumps data from the db to the local filesystem
task_dump = SQLDump('SELECT * FROM example',
                    File(tmp_dir / 'example.csv'),
                    dag,
                    name='dump',
                    client=SQLAlchemyClient(uri),
                    chunksize=None)


# since this task will have an upstream dependency, it has to accept the
# upstream parameter; all tasks must accept a product parameter
def _add_one(upstream, product):
    """Add one to column a
    """
    df = pd.read_csv(str(upstream['dump']))
    df['a'] = df['a'] + 1
    df.to_csv(str(product), index=False)

# we convert the Python function into a Task
task_add_one = PythonCallable(_add_one,