def test_example():
    env = Environment(metadata_storage="sqlite://")
    g = Graph(env)
    env.add_module(core)
    df = pd.DataFrame({"a": range(10), "b": range(10)})
    g.create_node(key="n1", pipe="extract_dataframe", config={"dataframe": df})
    output = env.produce("n1", g)
    assert_almost_equal(output.as_dataframe(), df)
def test_simple_import():
    dburl = get_tmp_sqlite_db_url()
    env = Environment(metadata_storage=dburl)
    g = Graph(env)
    env.add_module(core)
    df = pd.DataFrame({"a": range(10), "b": range(10)})
    g.create_node(key="n1", function="import_dataframe", params={"dataframe": df})
    blocks = env.produce("n1", g)
    assert_almost_equal(blocks[0].as_dataframe(), df, check_dtype=False)
def create_data_block_from_sql(
    env: Environment,
    sql: str,
    sess: Session,
    db_api: DatabaseStorageApi,
    nominal_schema: Schema = None,
    inferred_schema: Schema = None,
    created_by_node_key: str = None,
) -> Tuple[DataBlockMetadata, StoredDataBlockMetadata]:
    # TODO: we are special-casing sql right now, but could create another DataFormat
    # (SqlQueryFormat, non-storable). It's not clear how well that fits the paradigm,
    # though: sql is a fundamentally non-python operation, the only one for now -- if we
    # had an R runtime or any other shell command, they would also be in this bucket.
    # Fine here for now, but there is a generalization that might make the sql pipe less
    # awkward (returning sdb).
    logger.debug("CREATING DATA BLOCK from sql")
    tmp_name = f"_tmp_{rand_str(10)}".lower()
    sql = db_api.clean_sub_sql(sql)
    create_sql = f"""
    create table {tmp_name} as
    select
    *
    from (
    {sql}
    ) as __sub
    """
    db_api.execute_sql(create_sql)
    cnt = db_api.count(tmp_name)
    if not nominal_schema:
        nominal_schema = env.get_schema("Any", sess)
    if not inferred_schema:
        inferred_schema = infer_schema_from_db_table(db_api, tmp_name)
        env.add_new_generated_schema(inferred_schema, sess)
    realized_schema = cast_to_realized_schema(env, sess, inferred_schema, nominal_schema)
    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key=inferred_schema.key if inferred_schema else None,
        nominal_schema_key=nominal_schema.key,
        realized_schema_key=realized_schema.key,
        record_count=cnt,
        created_by_node_key=created_by_node_key,
    )
    storage_url = db_api.url
    sdb = StoredDataBlockMetadata(
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=storage_url,
        data_format=DatabaseTableFormat,
    )
    sess.add(block)
    sess.add(sdb)
    # sess.flush([block, sdb])
    db_api.rename_table(tmp_name, sdb.get_name())
    return block, sdb
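# Hedged usage sketch (not from the source): how a caller might invoke
# create_data_block_from_sql above. The `env`, `sess`, and `db_api` objects are
# placeholders here -- their construction is assumed, not shown in this section.
def _example_sql_block(env, sess, db_api):
    # Materialize the query result as a new DataBlock on the database storage;
    # the schema is inferred from the created temp table when none is provided.
    block, sdb = create_data_block_from_sql(
        env,
        sql="select 1 as a, 2 as b",
        sess=sess,
        db_api=db_api,
    )
    return block, sdb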
def make_test_env(**kwargs) -> Environment:
    if "metadata_storage" not in kwargs:
        url = "sqlite://"
        metadata_storage = Storage.from_url(url)
        kwargs["metadata_storage"] = metadata_storage
    env = Environment(**kwargs)
    test_module = SnapflowModule(
        "_test",
        schemas=[TestSchema1, TestSchema2, TestSchema3, TestSchema4],
    )
    env.add_module(test_module)
    return env
def test_default_module():
    DEFAULT_LOCAL_MODULE.library.snaps = {}

    @Snap
    def s1():
        pass

    assert len(DEFAULT_LOCAL_MODULE.library.snaps) == 1
    assert DEFAULT_LOCAL_MODULE.get_snap("s1") is s1
    env = Environment()
    env.add_snap(s1)
    assert env.get_snap("s1") is s1
def test_default_module():
    DEFAULT_LOCAL_MODULE.library.functions = {}

    @datafunction
    def s1():
        pass

    assert len(DEFAULT_LOCAL_MODULE.library.functions) == 1
    assert DEFAULT_LOCAL_MODULE.get_function("s1") is s1
    env = Environment()
    env.add_function(s1)
    assert env.get_function("s1") is s1
def make_test_env(**kwargs) -> Environment:
    if "metadata_storage" not in kwargs:
        url = get_tmp_sqlite_db_url()
        metadata_storage = Storage.from_url(url)
        kwargs["metadata_storage"] = metadata_storage
    env = Environment(settings=SnapflowSettings(abort_on_snap_error=True), **kwargs)
    test_module = SnapflowModule(
        "_test",
        schemas=[TestSchema1, TestSchema2, TestSchema3, TestSchema4],
    )
    env.add_module(test_module)
    return env
def get_env(key="_test", db_url=None):
    if db_url is None:
        db_url = get_tmp_sqlite_db_url()
    env = Environment(key=key, metadata_storage=db_url)
    env.add_module(core)
    env.add_schema(Customer)
    env.add_schema(Metric)
    return env
def create_data_block_from_records(
    env: Environment,
    sess: Session,
    local_storage: Storage,
    records: Any,
    nominal_schema: Schema = None,
    inferred_schema: Schema = None,
    created_by_node_key: str = None,
) -> Tuple[DataBlockMetadata, StoredDataBlockMetadata]:
    from snapflow.storage.storage import LocalPythonStorageEngine

    logger.debug("CREATING DATA BLOCK")
    if isinstance(records, MemoryDataRecords):
        dro = records
        # Important: override nominal schema with DRO entry if it exists
        if dro.nominal_schema is not None:
            nominal_schema = env.get_schema(dro.nominal_schema, sess)
    else:
        dro = as_records(records, schema=nominal_schema)
    if not nominal_schema:
        nominal_schema = env.get_schema("Any", sess)
    if not inferred_schema:
        inferred_schema = dro.data_format.infer_schema_from_records(dro.records_object)
        env.add_new_generated_schema(inferred_schema, sess)
    realized_schema = cast_to_realized_schema(env, sess, inferred_schema, nominal_schema)
    dro = dro.conform_to_schema(realized_schema)
    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key=inferred_schema.key if inferred_schema else None,
        nominal_schema_key=nominal_schema.key,
        realized_schema_key=realized_schema.key,
        record_count=dro.record_count,
        created_by_node_key=created_by_node_key,
    )
    sdb = StoredDataBlockMetadata(  # type: ignore
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=local_storage.url,
        data_format=dro.data_format,
    )
    sess.add(block)
    sess.add(sdb)
    # sess.flush([block, sdb])
    local_storage.get_api().put(sdb.get_name(), dro)
    return block, sdb
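# Hedged usage sketch (not from the source): calling create_data_block_from_records
# above with plain python records. `env`, `sess`, and `local_storage` are placeholders;
# their construction is assumed, not shown in this section.
def _example_records_block(env, sess, local_storage):
    records = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
    # Records are wrapped via as_records(), a schema is inferred, and the conformed
    # records are written to local python storage under the stored block's name.
    block, sdb = create_data_block_from_records(
        env,
        sess=sess,
        local_storage=local_storage,
        records=records,
    )
    return block, sdb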
def logs(env: Environment):
    """Show log of Snaps on DataBlocks"""
    with env.get_metadata_api().begin():
        query = env.md_api.execute(select(SnapLog).order_by(SnapLog.updated_at.desc()))
        drls = []
        for dfl in query:
            if dfl.data_block_logs:
                for drl in dfl.data_block_logs:
                    r = [
                        dfl.started_at.strftime("%F %T"),
                        dfl.node_key,
                        drl.direction.display,
                        drl.data_block_id,
                    ]
                    drls.append(r)
            else:
                drls.append(
                    [
                        dfl.started_at.strftime("%F %T"),
                        f"{dfl.node_key} nothing to do",
                        "-",
                        "-",
                    ]
                )
        headers = [
            "Started",
            "Snap",
            "Direction",
            "DataBlock",
        ]
        echo_table(headers, drls)
def list_data_blocks(env: Environment):
    with env.get_metadata_api().begin():
        query = env.md_api.execute(
            select(DataBlockMetadata)
            .filter(~DataBlockMetadata.deleted)
            .order_by(DataBlockMetadata.created_at)
        )
        headers = [
            "ID",
            "Nominal schema",
            "Created by node",
            "# Records",
            "Stored",
        ]
        rows = [
            [
                r.id,
                r.nominal_schema_key,
                r.created_by_node_key,
                r.record_count,
                r.stored_data_blocks.count(),
            ]
            for r in query
        ]
        echo_table(headers, rows)
def logs(env: Environment):
    """Show log of Pipes on DataBlocks"""
    with env.session_scope() as sess:
        query = sess.query(PipeLog).order_by(PipeLog.updated_at.desc())
        drls = []
        for dfl in query:
            if dfl.data_block_logs:
                for drl in dfl.data_block_logs:
                    r = [
                        dfl.started_at.strftime("%F %T"),
                        dfl.node_key,
                        drl.direction.display,
                        drl.data_block_id,
                    ]
                    drls.append(r)
            else:
                drls.append(
                    [
                        dfl.started_at.strftime("%F %T"),
                        f"{dfl.node_key} nothing to do",
                        "-",
                        "-",
                    ]
                )
        headers = [
            "Started",
            "Pipe",
            "Direction",
            "DataBlock",
        ]
        echo_table(headers, drls)
def instantiate_node(
    env: Environment,
    graph: Graph,
    declared_node: DeclaredNode,
):
    if isinstance(declared_node.pipe, str):
        pipe = env.get_pipe(declared_node.pipe)
    else:
        pipe = make_pipe(declared_node.pipe)
    interface = pipe.get_interface()
    schema_translation = interface.assign_translations(declared_node.schema_translation)
    declared_inputs: Dict[str, DeclaredStreamInput] = {}
    if declared_node.upstream is not None:
        for name, stream_like in interface.assign_inputs(declared_node.upstream).items():
            declared_inputs[name] = DeclaredStreamInput(
                stream=ensure_stream(stream_like),
                declared_schema_translation=(schema_translation or {}).get(name),
            )
    n = Node(
        env=env,
        graph=graph,
        key=declared_node.key,
        pipe=pipe,
        config=declared_node.config,
        interface=interface,
        declared_inputs=declared_inputs,
        declared_schema_translation=schema_translation,
        output_alias=declared_node.output_alias,
    )
    return n
def reset_metadata(env: Environment):
    """Reset metadata, all or selectively"""
    # TODO
    raise NotImplementedError
    with env.session_scope() as sess:
        sess.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}pipe_log cascade;"
        )
        sess.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}pipe_log_id_seq cascade;"
        )
        sess.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_resource_log cascade;"
        )
        sess.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_resource_log_id_seq cascade;"
        )
        sess.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_resource_metadata cascade;"
        )
        sess.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_set_metadata cascade;"
        )
        sess.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}stored_data_resource_metadata cascade;"
        )
def instantiate_node(
    env: Environment,
    graph: Graph,
    declared_node: DeclaredNode,
):
    if isinstance(declared_node.snap, str):
        snap = env.get_snap(declared_node.snap)
    else:
        snap = make_snap(declared_node.snap)
    interface = snap.get_interface()
    schema_translations = interface.assign_translations(
        declared_node.schema_translations
    )
    declared_inputs: Dict[str, DeclaredStreamInput] = {}
    if declared_node.inputs is not None:
        for name, stream_like in interface.assign_inputs(declared_node.inputs).items():
            declared_inputs[name] = DeclaredStreamInput(
                stream=ensure_stream(stream_like),
                declared_schema_translation=(schema_translations or {}).get(name),
            )
    n = Node(
        graph=graph,
        key=declared_node.key,
        snap=snap,
        params=declared_node.params,
        interface=interface,
        declared_inputs=declared_inputs,
        declared_schema_translation=schema_translations,
        output_alias=declared_node.output_alias,
    )
    return n
def test_env_init():
    env = Environment(
        f"_test_{rand_str()}",
        metadata_storage="sqlite://",
        settings=SnapflowSettings(add_core_module=False),
    )
    env_init(env)
def reset_metadata(env: Environment):
    """Reset metadata, all or selectively"""
    # TODO
    raise NotImplementedError
    with env.get_metadata_api().begin():
        env.md_api.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}snap_log cascade;"
        )
        env.md_api.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}snap_log_id_seq cascade;"
        )
        env.md_api.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_resource_log cascade;"
        )
        env.md_api.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_resource_log_id_seq cascade;"
        )
        env.md_api.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_resource_metadata cascade;"
        )
        env.md_api.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_set_metadata cascade;"
        )
        env.md_api.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}stored_data_resource_metadata cascade;"
        )
def test_env_config():
    cfg = EnvironmentConfiguration(
        f"_test_{rand_str()}",
        metadata_storage_url="sqlite://",
        settings=SnapflowSettings(add_core_module=False),
    )
    env = Environment.from_config(cfg)
    env_init(env)
def env_init(env: Environment):
    from . import _test_module

    # Test module / components
    with env.md_api.begin():
        assert len(env.get_module_order()) == 1
        env.add_module(_test_module)
        assert env.get_module_order() == [
            env.get_local_module().name,
            _test_module.name,
        ]
        assert env.get_schema("TestSchema") is _test_module.schemas.TestSchema
        assert env.get_snap("test_sql") is _test_module.snaps.test_sql
        # Test runtime / storage
        env.add_storage("postgresql://test")
        assert len(env.storages) == 2  # added plus default local memory
        assert len(env.runtimes) == 2  # added plus default local python
def app(ctx, debug: bool = False, metadata: Optional[str] = None):
    """Modern Data Pipelines"""
    logger.warning("The snapflow CLI is experimental and not officially supported yet")
    if debug:
        logger.add(sys.stderr, level="DEBUG")
    else:
        logger.add(sys.stderr, level="INFO")
    env = current_env()
    if env is None:
        env = Environment(metadata_storage=metadata)
    logger.info(f"Using environment '{env.metadata_storage.url}'")
    ctx.obj = env
def from_config(cfg: ExecutionConfiguration) -> ExecutionContext:
    env = Environment.from_config(cfg.env_config)
    return ExecutionContext(
        env=env,
        local_storage=ensure_storage(cfg.local_storage_url),
        target_storage=ensure_storage(cfg.target_storage_url),
        target_format=None,  # TODO: from config
        storages=[ensure_storage(s) for s in cfg.storage_urls],
        # logger=ExecutionLogger(),  # TODO: from config
        execution_timelimit_seconds=cfg.execution_timelimit_seconds,
        abort_on_snap_error=env.settings.abort_on_snap_error,
        execution_config=cfg,
    )
def test_multi_env():
    db_url = get_tmp_sqlite_db_url()
    cfg = EnvironmentConfiguration(
        key=f"_test_{rand_str()}",
        metadata_storage_url=db_url,
        settings=SnapflowSettings(add_core_module=False),
    )
    env1 = Environment.from_config(cfg)
    with env1.md_api.begin():
        env1.md_api.add(DataBlockMetadata(realized_schema_key="Any"))
        env1.md_api.flush()
        assert env1.md_api.count(select(DataBlockMetadata)) == 1
    cfg = EnvironmentConfiguration(
        key=f"_test_{rand_str()}",
        metadata_storage_url=db_url,
        settings=SnapflowSettings(add_core_module=False),
    )
    env2 = Environment.from_config(cfg)
    with env2.md_api.begin():
        assert env2.md_api.count(select(DataBlockMetadata)) == 0
        env2.md_api.add(DataBlockMetadata(realized_schema_key="Any"))
        env2.md_api.flush()
        assert env2.md_api.count(select(DataBlockMetadata)) == 1
def run(env: Environment, node: str = None, deps: bool = False):
    """Run snapflow pipeline"""
    if node:
        if deps:
            env.produce(node)
        else:
            env.run_node(node)
    else:
        raise NotImplementedError
        env.run_graph()
def list_nodes(env: Environment):
    with env.session_scope() as sess:
        query = (
            sess.query(
                PipeLog.node_key,
                func.count(PipeLog.id),
                func.max(PipeLog.started_at),
                func.count(DataBlockLog.id),
            )
            .join(PipeLog.data_block_logs)
            .group_by(PipeLog.node_key)
            .all()
        )
        headers = [
            "Node key",
            "Run count",
            "Last run at",
            "Block count",
        ]
        rows = [(k, c, m.strftime("%F %T"), b) for k, c, m, b in query]
        echo_table(headers, rows)
def resolve_nominal_output_schema(self, env: Environment) -> Optional[Schema]:
    if not self.output:
        return None
    if not self.output.is_generic:
        return env.get_schema(self.output.schema_like)
    output_generic = self.output.schema_like
    for input in self.inputs:
        if not input.declared_input.is_generic:
            continue
        if input.declared_input.schema_like == output_generic:
            schema = input.get_bound_nominal_schema()
            # We check for None -- there may be more than one input with the same
            # generic; we'll take any that is resolvable
            if schema is not None:
                return schema
    raise Exception(f"Unable to resolve generic '{output_generic}'")
def apply_schema_translation_as_sql(
    env: Environment, name: str, translation: SchemaTranslation
) -> str:
    if not translation.from_schema_key:
        raise NotImplementedError(
            f"Schema translation must provide `from_schema` when translating a database table {translation}"
        )
    sql = column_map(
        name,
        env.get_schema(translation.from_schema_key).field_names(),
        translation.as_dict(),
    )
    table_stmt = f"""
        (
            {sql}
        ) as __translated
        """
    return table_stmt
def list_data_blocks(env: Environment):
    with env.session_scope() as sess:
        query = (
            sess.query(DataBlockMetadata)
            .filter(~DataBlockMetadata.deleted)
            .order_by(DataBlockMetadata.created_at)
        )
        headers = [
            "ID",
            "Nominal schema",
            "Created by node",
            "# Records",
            "Stored",
        ]
        rows = [
            [
                r.id,
                r.nominal_schema_key,
                r.created_by_node_key,
                r.record_count,
                r.stored_data_blocks.count(),
            ]
            for r in query
        ]
        echo_table(headers, rows)
def list_nodes(env: Environment):
    with env.get_metadata_api().begin():
        query = env.md_api.execute(
            select(
                SnapLog.node_key,
                func.count(SnapLog.id),
                func.max(SnapLog.started_at),
                func.count(DataBlockLog.id),
            )
            .join(SnapLog.data_block_logs)
            .group_by(SnapLog.node_key)
        ).all()
        headers = [
            "Node key",
            "Run count",
            "Last run at",
            "Block count",
        ]
        rows = [(k, c, m.strftime("%F %T"), b) for k, c, m, b in query]
        echo_table(headers, rows)
def realized_schema(self, env: Environment) -> Schema:
    return env.get_schema(self.realized_schema_key)

def nominal_schema(self, env: Environment) -> Optional[Schema]:
    return env.get_schema(self.nominal_schema_key)