def create_data_block_from_sql(
    env: Environment,
    sql: str,
    sess: Session,
    db_api: DatabaseStorageApi,
    nominal_schema: Schema = None,
    inferred_schema: Schema = None,
    created_by_node_key: str = None,
) -> Tuple[DataBlockMetadata, StoredDataBlockMetadata]:
    """Materialize a SQL query as a database table and register it as a DataBlock.

    The query is executed into a randomly-named temporary table; schemas are
    inferred/cast as needed, metadata rows are added to the session, and the
    temp table is finally renamed to the stored block's canonical name.

    Returns the (DataBlockMetadata, StoredDataBlockMetadata) pair.
    """
    # TODO: we are special casing sql right now, but could create another DataFormat
    # (SqlQueryFormat, non-storable). But not sure how well it fits the paradigm (it's
    # a fundamentally non-python operation, the only one for now -- if we had an R
    # runtime or any other shell command, they would also be in this bucket). Fine
    # here for now, but there is a generalization that might make the sql pipe less
    # awkward (returning sdb).
    logger.debug("CREATING DATA BLOCK from sql")
    temp_table = f"_tmp_{rand_str(10)}".lower()
    sql = db_api.clean_sub_sql(sql)
    create_sql = f"""
    create table {temp_table} as
    select
    *
    from (
    {sql}
    ) as __sub
    """
    db_api.execute_sql(create_sql)
    row_count = db_api.count(temp_table)

    # Fall back to the catch-all schema when the caller supplied nothing.
    if not nominal_schema:
        nominal_schema = env.get_schema("Any", sess)
    # Only infer (and register the generated schema) when not provided by caller.
    if not inferred_schema:
        inferred_schema = infer_schema_from_db_table(db_api, temp_table)
        env.add_new_generated_schema(inferred_schema, sess)
    realized_schema = cast_to_realized_schema(env, sess, inferred_schema, nominal_schema)

    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key=inferred_schema.key if inferred_schema else None,
        nominal_schema_key=nominal_schema.key,
        realized_schema_key=realized_schema.key,
        record_count=row_count,
        created_by_node_key=created_by_node_key,
    )
    sdb = StoredDataBlockMetadata(
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=db_api.url,
        data_format=DatabaseTableFormat,
    )
    sess.add(block)
    sess.add(sdb)
    # Give the stored block its permanent name on the database storage.
    db_api.rename_table(temp_table, sdb.get_name())
    return block, sdb
def create_data_block_from_records(
    env: Environment,
    sess: Session,
    local_storage: Storage,
    records: Any,
    nominal_schema: Schema = None,
    inferred_schema: Schema = None,
    created_by_node_key: str = None,
) -> Tuple[DataBlockMetadata, StoredDataBlockMetadata]:
    """Wrap an in-memory records object as a new DataBlock on local storage.

    Normalizes ``records`` into a MemoryDataRecords object, resolves nominal /
    inferred / realized schemas, conforms the data to the realized schema,
    registers metadata rows in the session, and writes the data to
    ``local_storage``.

    Returns the (DataBlockMetadata, StoredDataBlockMetadata) pair.
    """
    from snapflow.storage.storage import LocalPythonStorageEngine

    logger.debug("CREATING DATA BLOCK")
    if isinstance(records, MemoryDataRecords):
        records_obj = records
        # Important: override nominal schema with DRO entry if it exists
        if records_obj.nominal_schema is not None:
            nominal_schema = env.get_schema(records_obj.nominal_schema, sess)
    else:
        records_obj = as_records(records, schema=nominal_schema)

    # Fall back to the catch-all schema when nothing was supplied or carried over.
    if not nominal_schema:
        nominal_schema = env.get_schema("Any", sess)
    # Only infer (and register the generated schema) when not provided by caller.
    if not inferred_schema:
        inferred_schema = records_obj.data_format.infer_schema_from_records(
            records_obj.records_object
        )
        env.add_new_generated_schema(inferred_schema, sess)
    realized_schema = cast_to_realized_schema(env, sess, inferred_schema, nominal_schema)
    records_obj = records_obj.conform_to_schema(realized_schema)

    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key=inferred_schema.key if inferred_schema else None,
        nominal_schema_key=nominal_schema.key,
        realized_schema_key=realized_schema.key,
        record_count=records_obj.record_count,
        created_by_node_key=created_by_node_key,
    )
    sdb = StoredDataBlockMetadata(  # type: ignore
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=local_storage.url,
        data_format=records_obj.data_format,
    )
    sess.add(block)
    sess.add(sdb)
    # Persist the conformed records under the stored block's canonical name.
    local_storage.get_api().put(sdb.get_name(), records_obj)
    return block, sdb