Exemplo n.º 1
0
def create_data_block_from_records(
    env: Environment,
    sess: Session,
    local_storage: Storage,
    records: Any,
    nominal_schema: Schema = None,
    inferred_schema: Schema = None,
    created_by_node_key: str = None,
) -> Tuple[DataBlockMetadata, StoredDataBlockMetadata]:
    """Build data block metadata for ``records`` and write them to storage.

    ``records`` is used directly when it is already a ``MemoryDataRecords``;
    otherwise it is wrapped with ``as_records``. A nominal schema carried by
    a ``MemoryDataRecords`` takes precedence over the ``nominal_schema``
    argument. When no nominal schema is available the "Any" schema is looked
    up, and when no inferred schema is given one is inferred from the
    records and registered through ``env.add_new_generated_schema``.

    Both metadata objects are added to ``sess`` (no flush is issued here),
    the conformed records are put into ``local_storage`` under the stored
    block's name, and the ``(block, stored_block)`` pair is returned.
    """
    # NOTE(review): not referenced below; presumably kept for its import
    # side effects -- confirm before removing.
    from snapflow.storage.storage import LocalPythonStorageEngine

    logger.debug("CREATING DATA BLOCK")
    if not isinstance(records, MemoryDataRecords):
        records_obj = as_records(records, schema=nominal_schema)
    else:
        records_obj = records
        # Important: a nominal schema set on the records object wins over
        # the one passed in by the caller.
        if records_obj.nominal_schema is not None:
            nominal_schema = env.get_schema(records_obj.nominal_schema, sess)
    nominal_schema = nominal_schema or env.get_schema("Any", sess)
    if not inferred_schema:
        inferred_schema = records_obj.data_format.infer_schema_from_records(
            records_obj.records_object
        )
        env.add_new_generated_schema(inferred_schema, sess)
    realized_schema = cast_to_realized_schema(
        env, sess, inferred_schema, nominal_schema
    )
    records_obj = records_obj.conform_to_schema(realized_schema)

    block_fields = dict(
        id=get_datablock_id(),
        inferred_schema_key=inferred_schema.key if inferred_schema else None,
        nominal_schema_key=nominal_schema.key,
        realized_schema_key=realized_schema.key,
        record_count=records_obj.record_count,
        created_by_node_key=created_by_node_key,
    )
    block = DataBlockMetadata(**block_fields)
    stored_fields = dict(
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=local_storage.url,
        data_format=records_obj.data_format,
    )
    sdb = StoredDataBlockMetadata(**stored_fields)  # type: ignore

    for metadata_row in (block, sdb):
        sess.add(metadata_row)
    storage_api = local_storage.get_api()
    storage_api.put(sdb.get_name(), records_obj)
    return block, sdb
Exemplo n.º 2
0
def create_data_block_from_sql(
    env: Environment,
    sql: str,
    sess: Session,
    db_api: DatabaseStorageApi,
    nominal_schema: Schema = None,
    inferred_schema: Schema = None,
    created_by_node_key: str = None,
) -> Tuple[DataBlockMetadata, StoredDataBlockMetadata]:
    """Materialize ``sql`` as a table and build data block metadata for it.

    The cleaned query is wrapped in a ``create table ... as select`` against
    a randomly named temporary table, whose row count becomes the block's
    ``record_count``. When no nominal schema is given the "Any" schema is
    looked up; when no inferred schema is given one is inferred from the
    temporary table and registered through ``env.add_new_generated_schema``.

    Both metadata objects are added to ``sess`` (no flush is issued here),
    the temporary table is renamed to the stored block's name, and the
    ``(block, stored_block)`` pair is returned.
    """
    # TODO: sql is special-cased for now. Another DataFormat (a non-storable
    #       SqlQueryFormat) could be created instead, but it is unclear how
    #       well that fits the paradigm: this is a fundamentally non-python
    #       operation, the only one for now (an R runtime or any other shell
    #       command would land in the same bucket). Fine here for now, though
    #       a generalization might make the sql pipe (returning sdb) less
    #       awkward.
    logger.debug("CREATING DATA BLOCK from sql")
    staging_table = f"_tmp_{rand_str(10)}".lower()
    cleaned_sql = db_api.clean_sub_sql(sql)
    ctas_stmt = f"""
    create table {staging_table} as
    select
    *
    from (
    {cleaned_sql}
    ) as __sub
    """
    db_api.execute_sql(ctas_stmt)
    row_count = db_api.count(staging_table)

    nominal_schema = nominal_schema or env.get_schema("Any", sess)
    if not inferred_schema:
        inferred_schema = infer_schema_from_db_table(db_api, staging_table)
        env.add_new_generated_schema(inferred_schema, sess)
    realized_schema = cast_to_realized_schema(
        env, sess, inferred_schema, nominal_schema
    )

    block_fields = dict(
        id=get_datablock_id(),
        inferred_schema_key=inferred_schema.key if inferred_schema else None,
        nominal_schema_key=nominal_schema.key,
        realized_schema_key=realized_schema.key,
        record_count=row_count,
        created_by_node_key=created_by_node_key,
    )
    block = DataBlockMetadata(**block_fields)
    sdb = StoredDataBlockMetadata(
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=db_api.url,
        data_format=DatabaseTableFormat,
    )

    for metadata_row in (block, sdb):
        sess.add(metadata_row)
    # The final table name comes from the stored block, so the rename can
    # only happen once ``sdb`` exists.
    db_api.rename_table(staging_table, sdb.get_name())
    return block, sdb
Exemplo n.º 3
0
 def resolve_nominal_output_schema(self, env: Environment) -> Optional[Schema]:
     """Resolve the nominal schema of this object's output.

     Returns ``None`` when there is no output. A non-generic output is
     looked up directly with ``env.get_schema``. For a generic output, the
     inputs are scanned for a generic declared input with the same
     ``schema_like`` and the first non-``None`` bound nominal schema found
     is returned.

     Raises:
         Exception: if no input yields a schema for the output's generic.
     """
     output = self.output
     if not output:
         return None
     if not output.is_generic:
         return env.get_schema(output.schema_like)

     generic_name = output.schema_like
     for node_input in self.inputs:
         declared = node_input.declared_input
         if declared.is_generic and declared.schema_like == generic_name:
             # Several inputs may share the same generic; keep scanning until
             # one of them actually resolves to a schema.
             bound_schema = node_input.get_bound_nominal_schema()
             if bound_schema is not None:
                 return bound_schema
     raise Exception(f"Unable to resolve generic '{generic_name}'")
Exemplo n.º 4
0
def apply_schema_translation_as_sql(
    env: Environment, name: str, translation: SchemaTranslation
) -> str:
    """Return a parenthesized sub-select that applies ``translation`` to ``name``.

    The select is produced by ``column_map`` from the field names of the
    translation's source schema and the translation's dict form, then
    wrapped and aliased as ``__translated``.

    Raises:
        NotImplementedError: if ``translation.from_schema_key`` is falsy.
    """
    source_schema_key = translation.from_schema_key
    if not source_schema_key:
        raise NotImplementedError(
            f"Schema translation must provide `from_schema` when translating a database table {translation}"
        )
    source_fields = env.get_schema(source_schema_key).field_names()
    field_mapping = translation.as_dict()
    select_sql = column_map(name, source_fields, field_mapping)
    return f"""
        (
            {select_sql}
        ) as __translated
        """
Exemplo n.º 5
0
def env_init(env: Environment):
    """Assert module, schema, snap and storage registration behave on ``env``.

    Everything runs inside ``env.md_api.begin()``. The checks expect ``env``
    to start with exactly one module in its module order.
    """
    from . import _test_module

    with env.md_api.begin():
        # --- modules / components ---
        assert len(env.get_module_order()) == 1
        env.add_module(_test_module)
        module_order = env.get_module_order()
        assert module_order == [env.get_local_module().name, _test_module.name]

        schema = env.get_schema("TestSchema")
        assert schema is _test_module.schemas.TestSchema
        snap = env.get_snap("test_sql")
        assert snap is _test_module.snaps.test_sql

        # --- runtimes / storages ---
        env.add_storage("postgresql://test")
        storage_count = len(env.storages)
        runtime_count = len(env.runtimes)
        assert storage_count == 2  # the added one plus default local memory
        assert runtime_count == 2  # the added one plus default local python
Exemplo n.º 6
0
def test_env_init():
    """Check module, schema, pipe and storage registration on a fresh env.

    Builds an ``Environment`` named "_test" on in-memory sqlite metadata
    storage with no initial modules, then runs the checks inside a session
    scope.
    """
    from . import _test_module

    env_kwargs = dict(metadata_storage="sqlite://", initial_modules=[])
    env = Environment("_test", **env_kwargs)
    with env.session_scope() as sess:
        # --- modules / components ---
        assert len(env.get_module_order()) == 1
        env.add_module(_test_module)
        module_order = env.get_module_order()
        assert module_order == [env.get_local_module().name, _test_module.name]

        schema = env.get_schema("TestSchema", sess)
        assert schema is _test_module.schemas.TestSchema
        pipe = env.get_pipe("test_sql")
        assert pipe is _test_module.pipes.test_sql

        # --- runtimes / storages ---
        env.add_storage("postgresql://test")
        storage_count = len(env.storages)
        runtime_count = len(env.runtimes)
        assert storage_count == 2  # the added one plus default local memory
        # TODO: the upstream comment read "added plus default local python",
        # which would suggest 2 rather than the 3 expected here.
        assert runtime_count == 3
Exemplo n.º 7
0
 def inferred_schema(self, env: Environment) -> Optional[Schema]:
     """Return what ``env.get_schema`` yields for ``self.inferred_schema_key``."""
     schema_key = self.inferred_schema_key
     return env.get_schema(schema_key)
Exemplo n.º 8
0
 def realized_schema(self, env: Environment) -> Optional[Schema]:
     """Return the realized schema of the underlying data block, if any.

     Args:
         env: Object whose ``get_schema`` resolves a schema key.

     Returns:
         ``None`` when ``self.data_block.realized_schema_key`` is ``None``;
         otherwise the result of ``env.get_schema`` for that key. The return
         annotation is ``Optional`` because of the ``None`` branch.
     """
     if self.data_block.realized_schema_key is None:
         return None
     return env.get_schema(self.data_block.realized_schema_key)
Exemplo n.º 9
0
 def nominal_schema(self, env: Environment) -> Optional[Schema]:
     """Return the data block's nominal schema, or ``None`` if its key is unset.

     The key is read from ``self.data_block.nominal_schema_key`` and resolved
     through ``env.get_schema``.
     """
     schema_key = self.data_block.nominal_schema_key
     return None if schema_key is None else env.get_schema(schema_key)
Exemplo n.º 10
0
 def realized_schema(self, env: Environment, sess: Session) -> Schema:
     """Return what ``env.get_schema`` yields for ``self.realized_schema_key``.

     ``sess`` is forwarded unchanged as the second argument of the lookup.
     """
     schema_key = self.realized_schema_key
     return env.get_schema(schema_key, sess)
Exemplo n.º 11
0
 def nominal_schema(self, env: Environment, sess: Session) -> Optional[Schema]:
     """Return what ``env.get_schema`` yields for ``self.nominal_schema_key``.

     ``sess`` is forwarded unchanged as the second argument of the lookup.
     """
     schema_key = self.nominal_schema_key
     return env.get_schema(schema_key, sess)
Exemplo n.º 12
0
 def inferred_schema(self, env: Environment, sess: Session) -> Optional[Schema]:
     """Return what ``env.get_schema`` yields for ``self.inferred_schema_key``.

     ``sess`` is forwarded unchanged as the second argument of the lookup.
     """
     schema_key = self.inferred_schema_key
     return env.get_schema(schema_key, sess)
Exemplo n.º 13
0
 def schema(self, env: Environment, sess: Session) -> Schema:
     """Resolve ``self.schema_like`` through ``env.get_schema``.

     Raises:
         GenericSchemaException: if ``self.is_generic`` is truthy.
     """
     if not self.is_generic:
         return env.get_schema(self.schema_like, sess)
     raise GenericSchemaException("Generic Schema has no name")
Exemplo n.º 14
0
 def inferred_schema(self, env: Environment, sess: Session) -> Optional[Schema]:
     """Return the data block's inferred schema, or ``None`` if its key is unset.

     The key is read from ``self.data_block.inferred_schema_key`` and
     resolved through ``env.get_schema`` together with ``sess``.
     """
     schema_key = self.data_block.inferred_schema_key
     return None if schema_key is None else env.get_schema(schema_key, sess)
Exemplo n.º 15
0
 def get_schemas(self, env: Environment):
     """Return a list with one ``env.get_schema`` result per filter schema key.

     Keys are taken, in order, from ``self._filters.schema_keys``.
     """
     resolved = []
     for schema_key in self._filters.schema_keys:
         resolved.append(env.get_schema(schema_key))
     return resolved
Exemplo n.º 16
0
 def nominal_schema(self, env: Environment) -> Optional[Schema]:
     """Return what ``env.get_schema`` yields for ``self.nominal_schema_key``."""
     schema_key = self.nominal_schema_key
     return env.get_schema(schema_key)
Exemplo n.º 17
0
 def realized_schema(self, env: Environment) -> Schema:
     """Return what ``env.get_schema`` yields for ``self.realized_schema_key``."""
     schema_key = self.realized_schema_key
     return env.get_schema(schema_key)
Exemplo n.º 18
0
 def get_schemas(self, env: Environment, sess: Session):
     """Return a list with one ``env.get_schema`` result per entry of ``self.schemas``.

     ``self.schemas`` is first passed through ``ensure_list``; each resulting
     entry is looked up together with ``sess``.
     """
     resolved = []
     for schema_like in ensure_list(self.schemas):
         resolved.append(env.get_schema(schema_like, sess))
     return resolved