def create_data_block_from_records(
    env: Environment,
    sess: Session,
    local_storage: Storage,
    records: Any,
    nominal_schema: Schema = None,
    inferred_schema: Schema = None,
    created_by_node_key: str = None,
) -> Tuple[DataBlockMetadata, StoredDataBlockMetadata]:
    """Create and register a new DataBlock from in-memory records.

    Infers and realizes a schema for the records, conforms the records to the
    realized schema, adds the metadata rows to ``sess`` (added but not
    flushed), and writes the conformed records into ``local_storage``.

    Args:
        env: Environment used for schema lookup and generated-schema registration.
        sess: Active ORM session; ``block`` and ``sdb`` are added to it.
        local_storage: Storage that receives the actual records payload.
        records: Either an existing MemoryDataRecords (used as-is) or any
            records object convertible via ``as_records``.
        nominal_schema: Declared schema; overridden by the DRO's own nominal
            schema when present, and falls back to the "Any" schema.
        inferred_schema: Pre-computed inferred schema; when omitted it is
            inferred from the records and registered as a generated schema.
        created_by_node_key: Optional provenance marker for the block.

    Returns:
        Tuple of (DataBlockMetadata, StoredDataBlockMetadata).
    """
    # NOTE(review): LocalPythonStorageEngine is not referenced below --
    # presumably imported for its import-time registration side effects; confirm.
    from snapflow.storage.storage import LocalPythonStorageEngine

    logger.debug("CREATING DATA BLOCK")
    if isinstance(records, MemoryDataRecords):
        dro = records
        # Important: override nominal schema with DRO entry if it exists
        if dro.nominal_schema is not None:
            nominal_schema = env.get_schema(dro.nominal_schema, sess)
    else:
        # Wrap an arbitrary records object in a DRO, tagging the declared schema
        dro = as_records(records, schema=nominal_schema)
    if not nominal_schema:
        # Fall back to the catch-all "Any" schema
        nominal_schema = env.get_schema("Any", sess)
    if not inferred_schema:
        inferred_schema = dro.data_format.infer_schema_from_records(dro.records_object)
        # Newly inferred schemas must be registered so their key resolves later
        env.add_new_generated_schema(inferred_schema, sess)
    # Reconcile inferred vs nominal into the schema the block will actually use
    realized_schema = cast_to_realized_schema(env, sess, inferred_schema, nominal_schema)
    dro = dro.conform_to_schema(realized_schema)
    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key=inferred_schema.key if inferred_schema else None,
        nominal_schema_key=nominal_schema.key,
        realized_schema_key=realized_schema.key,
        record_count=dro.record_count,
        created_by_node_key=created_by_node_key,
    )
    sdb = StoredDataBlockMetadata(  # type: ignore
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=local_storage.url,
        data_format=dro.data_format,
    )
    sess.add(block)
    sess.add(sdb)
    # sess.flush([block, sdb])
    # Persist the conformed records under the SDB's canonical name
    local_storage.get_api().put(sdb.get_name(), dro)
    return block, sdb
def create_data_block_from_sql(
    env: Environment,
    sql: str,
    sess: Session,
    db_api: DatabaseStorageApi,
    nominal_schema: Schema = None,
    inferred_schema: Schema = None,
    created_by_node_key: str = None,
) -> Tuple[DataBlockMetadata, StoredDataBlockMetadata]:
    """Materialize a SQL query as a database table and register it as a DataBlock.

    Runs ``sql`` as ``create table ... as select ...`` into a temporary
    table, infers/realizes the schema from that table, records block
    metadata on ``sess`` (added but not flushed), then renames the
    temporary table to the stored block's canonical name.

    Args:
        env: Environment used for schema lookup and generated-schema registration.
        sess: Active ORM session; ``block`` and ``sdb`` are added to it.
        sql: Query to materialize (cleaned via ``db_api.clean_sub_sql`` first).
        db_api: Database storage API the table is created on.
        nominal_schema: Declared schema; defaults to the "Any" schema.
        inferred_schema: Pre-computed inferred schema; when omitted it is
            inferred from the new table and registered as a generated schema.
        created_by_node_key: Optional provenance marker for the block.

    Returns:
        Tuple of (DataBlockMetadata, StoredDataBlockMetadata).
    """
    # TODO: we are special casing sql right now, but could create another DataFormat (SqlQueryFormat, non-storable).
    # but, not sure how well it fits paradigm (it's a fundamentally non-python operation, the only one for now --
    # if we had an R runtime or any other shell command, they would also be in this bucket)
    # fine here for now, but there is a generalization that might make the sql pipe less awkward (returning sdb)
    logger.debug("CREATING DATA BLOCK from sql")
    tmp_name = f"_tmp_{rand_str(10)}".lower()
    sql = db_api.clean_sub_sql(sql)
    create_sql = f"""
    create table {tmp_name} as
    select
    *
    from (
    {sql}
    ) as __sub
    """
    db_api.execute_sql(create_sql)
    cnt = db_api.count(tmp_name)
    if not nominal_schema:
        # Fall back to the catch-all "Any" schema
        nominal_schema = env.get_schema("Any", sess)
    if not inferred_schema:
        inferred_schema = infer_schema_from_db_table(db_api, tmp_name)
        # Newly inferred schemas must be registered so their key resolves later
        env.add_new_generated_schema(inferred_schema, sess)
    realized_schema = cast_to_realized_schema(env, sess, inferred_schema, nominal_schema)
    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key=inferred_schema.key if inferred_schema else None,
        nominal_schema_key=nominal_schema.key,
        realized_schema_key=realized_schema.key,
        record_count=cnt,
        created_by_node_key=created_by_node_key,
    )
    storage_url = db_api.url
    sdb = StoredDataBlockMetadata(
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=storage_url,
        data_format=DatabaseTableFormat,
    )
    sess.add(block)
    sess.add(sdb)
    # sess.flush([block, sdb])
    # The temp table becomes the stored block by taking the SDB's name
    db_api.rename_table(tmp_name, sdb.get_name())
    return block, sdb
def resolve_nominal_output_schema(self, env: Environment) -> Optional[Schema]: if not self.output: return None if not self.output.is_generic: return env.get_schema(self.output.schema_like) output_generic = self.output.schema_like for input in self.inputs: if not input.declared_input.is_generic: continue if input.declared_input.schema_like == output_generic: schema = input.get_bound_nominal_schema() # We check if None -- there may be more than one input with same generic, we'll take any that are resolvable if schema is not None: return schema raise Exception(f"Unable to resolve generic '{output_generic}'")
def apply_schema_translation_as_sql(
    env: Environment, name: str, translation: SchemaTranslation
) -> str:
    """Render a SchemaTranslation as a SQL sub-select over table ``name``.

    Builds a column-mapping select via ``column_map`` and wraps it in a
    parenthesized sub-query aliased ``__translated``.

    Raises:
        NotImplementedError: If the translation has no source schema key
            (required to enumerate the columns of a database table).
    """
    # A source schema is mandatory here: without it we cannot enumerate columns.
    if not translation.from_schema_key:
        raise NotImplementedError(
            f"Schema translation must provide `from_schema` when translating a database table {translation}"
        )
    source_fields = env.get_schema(translation.from_schema_key).field_names()
    select_sql = column_map(name, source_fields, translation.as_dict())
    return f"""
    (
    {select_sql}
    ) as __translated
    """
def env_init(env: Environment):
    """Exercise module registration and storage/runtime defaults on an env."""
    from . import _test_module

    # Test module / components
    with env.md_api.begin():
        assert len(env.get_module_order()) == 1
        env.add_module(_test_module)
        expected_order = [env.get_local_module().name, _test_module.name]
        assert env.get_module_order() == expected_order
        assert env.get_schema("TestSchema") is _test_module.schemas.TestSchema
        assert env.get_snap("test_sql") is _test_module.snaps.test_sql
        # Test runtime / storage
        env.add_storage("postgresql://test")
        assert len(env.storages) == 2  # added plus default local memory
        assert len(env.runtimes) == 2  # added plus default local python
def test_env_init():
    """Exercise module registration and storage/runtime defaults on a fresh env."""
    from . import _test_module

    # Test module / components
    env = Environment("_test", metadata_storage="sqlite://", initial_modules=[])
    with env.session_scope() as sess:
        assert len(env.get_module_order()) == 1
        env.add_module(_test_module)
        expected_order = [env.get_local_module().name, _test_module.name]
        assert env.get_module_order() == expected_order
        assert env.get_schema("TestSchema", sess) is _test_module.schemas.TestSchema
        assert env.get_pipe("test_sql") is _test_module.pipes.test_sql
        # Test runtime / storage
        env.add_storage("postgresql://test")
        assert len(env.storages) == 2  # added plus default local memory
        assert len(env.runtimes) == 3  # added plus default local python  # TODO
def inferred_schema(self, env: Environment) -> Optional[Schema]: return env.get_schema(self.inferred_schema_key)
def realized_schema(self, env: Environment) -> Schema: if self.data_block.realized_schema_key is None: return None return env.get_schema(self.data_block.realized_schema_key)
def nominal_schema(self, env: Environment) -> Optional[Schema]: if self.data_block.nominal_schema_key is None: return None return env.get_schema(self.data_block.nominal_schema_key)
def realized_schema(self, env: Environment, sess: Session) -> Schema: return env.get_schema(self.realized_schema_key, sess)
def nominal_schema(self, env: Environment, sess: Session) -> Optional[Schema]: return env.get_schema(self.nominal_schema_key, sess)
def inferred_schema(self, env: Environment, sess: Session) -> Optional[Schema]: return env.get_schema(self.inferred_schema_key, sess)
def schema(self, env: Environment, sess: Session) -> Schema: if self.is_generic: raise GenericSchemaException("Generic Schema has no name") return env.get_schema(self.schema_like, sess)
def inferred_schema(self, env: Environment, sess: Session) -> Optional[Schema]: if self.data_block.inferred_schema_key is None: return None return env.get_schema(self.data_block.inferred_schema_key, sess)
def get_schemas(self, env: Environment): return [env.get_schema(d) for d in self._filters.schema_keys]
def nominal_schema(self, env: Environment) -> Optional[Schema]: return env.get_schema(self.nominal_schema_key)
def realized_schema(self, env: Environment) -> Schema: return env.get_schema(self.realized_schema_key)
def get_schemas(self, env: Environment, sess: Session):
    """Resolve this object's declared schema(s) to Schema instances.

    ``self.schemas`` may be a single schema-like or a list; ``ensure_list``
    normalizes it before lookup.
    """
    return [
        env.get_schema(schema_like, sess) for schema_like in ensure_list(self.schemas)
    ]