def make_copy_request(
    from_url: str, to_url: str, fmt: Optional[str] = None, schema: Optional[str] = None
) -> CopyRequest:
    # Split each URL into a storage URL plus a trailing object name
    from_split = from_url.split("/")
    to_split = to_url.split("/")
    from_name = from_split[-1]
    to_name = to_split[-1]
    from_storage_url = "/".join(from_split[:-1])
    to_storage_url = "/".join(to_split[:-1])
    to_storage = Storage(to_storage_url)
    if fmt:
        to_fmt = get_format_for_nickname(fmt)
    else:
        to_fmt = to_storage.storage_engine.get_natural_format()
    if not from_storage_url:
        # No storage url, so default to a local file in the current working directory
        pth = os.getcwd()
        from_storage_url = f"file://{pth}"
    return CopyRequest(
        from_name=from_name,
        from_storage=Storage(from_storage_url),
        to_name=to_name,
        to_storage=to_storage,
        to_format=to_fmt,
    )
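# Usage sketch, mirroring test_make_copy_request below (hypothetical URLs). With no
# scheme on the source path, the request defaults to a file:// storage rooted at the
# current working directory and to the target engine's natural format:
#
#     req = make_copy_request("orders.csv", "mysql://localhost:3306/mydb/orders")
#     assert req.from_storage == Storage(f"file://{os.getcwd()}")
#     assert req.to_format == DatabaseTableFormat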
def append_records_to_stored_datablock(
    self, name: str, storage: Storage, sdb: StoredDataBlockMetadata
):
    self.resolve_new_object_with_data_block(sdb, name, storage)
    if sdb.data_format is None:
        fmt = infer_format_for_name(name, storage)
        # if sdb.data_format and sdb.data_format != fmt:
        #     raise Exception(f"Format mismatch {fmt} - {sdb.data_format}")
        if fmt is None:
            raise Exception(f"Could not infer format {name} on {storage}")
        sdb.data_format = fmt
    # TODO: to_format
    # TODO: make sure this handles no-ops (empty object, same storage)
    # TODO: copy or alias? sometimes we are just moving a temp obj to a new name, don't need a copy
    result = dcp.copy(
        from_name=name,
        from_storage=storage,
        to_name=sdb.get_name_for_storage(),
        to_storage=sdb.storage,
        to_format=sdb.data_format,
        available_storages=self.execution_context.storages,
        if_exists="append",
    )
    logger.debug(f"Copied {result}")
    logger.debug(f"REMOVING NAME {name}")
    storage.get_api().remove(name)
def import_storage_csv(
    ctx: DataFunctionContext, name: str, storage_url: str, schema: Optional[str] = None
):
    imported = ctx.get_state_value("imported")
    if imported:
        # Static resource: if already emitted, return
        return
    fs_api = Storage(storage_url).get_api()
    f = fs_api.open_name(name)
    ctx.emit_state_value("imported", True)
    ctx.emit(f, data_format=CsvFileFormat, schema=schema)
def import_storage_csv(ctx: SnapContext):
    imported = ctx.get_state_value("imported")
    if imported:
        # Static resource: if already emitted, return
        return
    name = ctx.get_param("name")
    storage_url = ctx.get_param("storage_url")
    fs_api = Storage(storage_url).get_api()
    f = fs_api.open_name(name)
    ctx.emit_state_value("imported", True)
    schema = ctx.get_param("schema")
    ctx.emit(f, data_format=CsvFileObjectFormat, schema=schema)
def test_database_handler():
    dburl = get_tmp_sqlite_db_url()
    s = Storage(dburl)
    name = "_test"
    handler = get_handler(DatabaseTableFormat, s.storage_engine)
    handler().create_empty(name, s, test_records_schema)
    s.get_api().bulk_insert_records(name, test_records)
    assert list(handler().infer_field_names(name, s)) == list(test_records[0].keys())
    assert handler().infer_field_type(name, s, "f1") == Text()
    assert handler().infer_field_type(name, s, "f2") == Integer()
    assert handler().infer_field_type(name, s, "f3") == DEFAULT_FIELD_TYPE
    assert handler().infer_field_type(name, s, "f4") == Date()
    assert handler().infer_field_type(name, s, "f5") == DEFAULT_FIELD_TYPE
def handle_emit(
    self,
    records_obj: Any = None,
    name: Optional[str] = None,
    storage: Optional[Storage] = None,
    output: str = DEFAULT_OUTPUT_NAME,
    data_format: Optional[DataFormat] = None,
    schema: Optional[SchemaLike] = None,
):
    logger.debug(
        f"HANDLING EMITTED OBJECT (of type '{type(records_obj).__name__}')"
    )
    # TODO: can we return an existing DataBlock? Or do we need to create a "clone"?
    # Answer: ok to return as-is (just mark it as 'output' in DBL)
    if isinstance(records_obj, StoredDataBlockMetadata):
        # TODO: is it in local storage though? We skip conversion below...
        # This is just a special case right now to support the SQL snap.
        # Will need a better solution for explicitly creating DBs/SDBs inside snaps.
        return records_obj
    elif isinstance(records_obj, DataBlockMetadata):
        raise NotImplementedError
    elif isinstance(records_obj, ManagedDataBlock):
        raise NotImplementedError
    nominal_output_schema = schema
    if nominal_output_schema is None:
        nominal_output_schema = self.bound_interface.resolve_nominal_output_schema(
            self.env
        )
    # TODO: could check output to see if it is LocalRecords with a schema too?
    if nominal_output_schema is not None:
        nominal_output_schema = self.env.get_schema(nominal_output_schema)
    sdb = self.get_stored_datablock_for_output(output)
    sdb.data_format = data_format
    db = sdb.data_block
    if nominal_output_schema is not None:
        # Only reconcile schemas when we actually resolved one
        if db.nominal_schema_key and db.nominal_schema_key != nominal_output_schema.key:
            raise Exception(
                f"Mismatched nominal schemas {db.nominal_schema_key} - {nominal_output_schema.key}"
            )
        db.nominal_schema_key = nominal_output_schema.key
    if records_obj is not None:
        name = "_tmp_obj_" + rand_str(10)
        storage = self.execution_context.local_storage
        storage.get_api().put(name, records_obj)
        if nominal_output_schema is not None:
            # TODO: still unclear on when and why to do this cast
            handler = get_handler_for_name(name, storage)
            handler().cast_to_schema(name, storage, nominal_output_schema)
    sdb.storage_url = storage.url
    assert name is not None
    assert storage is not None
    self.append_records_to_stored_datablock(name, storage, sdb)
    return sdb
def test_make_copy_request():
    name = "orders.csv"
    to_name = "orders"
    to_storage = "mysql://localhost:3306/mydb"
    to_url = f"{to_storage}/{to_name}"
    req = make_copy_request(name, to_url)
    pth = os.getcwd()
    assert req == CopyRequest(
        from_name=name,
        from_storage=Storage(f"file://{pth}"),
        to_name=to_name,
        to_storage=Storage(to_storage),
        to_format=DatabaseTableFormat,
    )
def test_memory_handlers(fmt: DataFormat, obj: Any):
    s = Storage("python://test")
    name = "_test"
    s.get_api().put(name, obj())
    handler = get_handler(fmt, s.storage_engine)
    assert list(handler().infer_field_names(name, s)) == list(test_records[0].keys())
    assert handler().infer_field_type(name, s, "f1") == Text()
    assert handler().infer_field_type(name, s, "f2") == Integer()
    assert handler().infer_field_type(name, s, "f3") == DEFAULT_FIELD_TYPE
    assert handler().infer_field_type(name, s, "f4") == Date()
    assert handler().infer_field_type(name, s, "f5") == DEFAULT_FIELD_TYPE
    # Cast f4 away and back, then check the object round-trips unchanged
    handler().cast_to_field_type(name, s, "f4", Text())
    handler().cast_to_field_type(name, s, "f4", Date())
    round_trip_object = s.get_api().get(name)
    assert_objects_equal(round_trip_object, obj())
def test_db_to_mem(url):
    s: Storage = Storage.from_url(url)
    api_cls: Type[DatabaseApi] = s.storage_engine.get_api_cls()
    mem_s = new_local_python_storage()
    mem_api: PythonStorageApi = mem_s.get_api()
    if not s.get_api().dialect_is_supported():
        # Skip when the client library for this dialect isn't installed
        return
    with api_cls.temp_local_database() as db_url:
        name = "_test"
        db_s = Storage.from_url(db_url)
        db_api: DatabaseStorageApi = db_s.get_api()
        db_api.execute_sql(f"create table {name} as select 1 a, 2 b")
        req = CopyRequest(name, db_s, name, mem_s, RecordsFormat, test_records_schema)
        DatabaseTableToRecords().copy(req)
        assert mem_api.get(name) == [{"a": 1, "b": 2}]
def provide_test_storages(
    function: DataFunction, target_storage: Storage
) -> Iterator[Optional[Storage]]:
    if target_storage:
        yield target_storage  # TODO
    elif function.required_storage_engines:
        # TODO: multiple engines -- is it AND or OR? Entries are ANDed;
        # comma-delimited values within an entry are ORed
        eng = get_engine_for_scheme(function.required_storage_engines[0])
        api_cls = eng.get_api_cls()
        if issubclass(api_cls, DatabaseApi):
            if not api_cls.dialect_is_supported():
                raise TestFeatureNotImplementedError(eng)
            with api_cls.temp_local_database() as url:
                yield Storage(url)
    elif "database" in function.required_storage_classes:
        yield Storage(get_tmp_sqlite_db_url())
    else:
        yield None
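# Usage sketch (names outside this snippet are hypothetical). provide_test_storages
# is a generator so that any temp database it creates is torn down once the
# iterator is exhausted:
#
#     for storage in provide_test_storages(my_function, target_storage=None):
#         if storage is not None:
#             run_function_test(my_function, storage)  # hypothetical test runner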
def append_records_to_stored_datablock(
    self, name: str, storage: Storage, sdb: StoredDataBlockMetadata
):
    self.resolve_new_object_with_data_block(sdb, name, storage)
    if sdb.data_format is None:
        sdb.data_format = (
            self.execution_context.target_format
            or sdb.storage.storage_engine.get_natural_format()
        )
        # fmt = infer_format_for_name(name, storage)
        # # if sdb.data_format and sdb.data_format != fmt:
        # #     raise Exception(f"Format mismatch {fmt} - {sdb.data_format}")
        # if fmt is None:
        #     raise Exception(f"Could not infer format {name} on {storage}")
        # sdb.data_format = fmt
    # TODO: make sure this handles no-ops (empty object, same storage)
    # TODO: copy or alias? sometimes we are just moving a temp obj to a new name, don't need a copy
    # to_name = sdb.get_name_for_storage()
    # if storage == sdb.storage:
    #     # Same storage
    #     if name == to_name:
    #         # Nothing to do
    #         logger.debug("Output already on storage with same name, nothing to do")
    #         return
    #     else:
    #         # Same storage, just a new name
    #         # TODO: should be "rename" ideally (as it is, if tmp gets deleted we lose it)
    #         logger.debug("Output already on storage, creating alias")
    #         storage.get_api().create_alias(name, to_name)
    #         return
    logger.debug(
        f"Copying output from {name} {storage} to "
        f"{sdb.get_name_for_storage()} {sdb.storage} ({sdb.data_format})"
    )
    result = dcp.copy(
        from_name=name,
        from_storage=storage,
        to_name=sdb.get_name_for_storage(),
        to_storage=sdb.storage,
        to_format=sdb.data_format,
        available_storages=self.execution_context.storages,
        if_exists="append",
    )
    logger.debug(f"Copied {result}")
    logger.debug(f"REMOVING NAME {name}")
    storage.get_api().remove(name)
def make_test_run_context(**kwargs) -> ExecutionContext:
    s = Storage.from_url(url=f"python://_test_default_{rand_str(6)}")
    env = make_test_env()
    args = dict(
        env=env,
        local_storage=s,
        target_storage=s,
        storages=[s],
    )
    args.update(**kwargs)
    return ExecutionContext(**args)
def infer_format_for_name(name: str, storage: Storage) -> DataFormat:
    format_handlers = get_handlers_for_storage(storage)
    for handler in format_handlers:
        fmt = handler().infer_data_format(name, storage)
        if fmt is not None:
            return fmt
    msg = f"Could not infer format of object '{name}' on storage {storage}"
    if storage.storage_engine is LocalPythonStorageEngine:
        # For local python objects, include the object itself for easier debugging
        obj = storage.get_api().get(name)
        msg = f"Could not infer format of object '{name}' `{obj}`"
    raise NotImplementedError(msg)
def make_test_env(**kwargs) -> Environment:
    if "metadata_storage" not in kwargs:
        url = get_tmp_sqlite_db_url()
        metadata_storage = Storage.from_url(url)
        kwargs["metadata_storage"] = metadata_storage
    env = Environment(settings=SnapflowSettings(abort_on_function_error=True), **kwargs)
    test_module = SnapflowModule("_test")
    for schema in [TestSchema1, TestSchema2, TestSchema3, TestSchema4]:
        env.add_schema(schema)
    env.add_module(test_module)
    return env
def test_records_to_db(url):
    s: Storage = Storage.from_url(url)
    api_cls: Type[DatabaseApi] = s.storage_engine.get_api_cls()
    if not s.get_api().dialect_is_supported():
        warnings.warn(
            f"Skipping tests for database engine {s.storage_engine.__name__} (client library not installed)"
        )
        return
    mem_s = new_local_python_storage()
    mem_api: PythonStorageApi = mem_s.get_api()
    with api_cls.temp_local_database() as db_url:
        name = "_test"
        db_s = Storage.from_url(db_url)
        db_api: DatabaseStorageApi = db_s.get_api()
        # Records
        mem_api.put(name, deepcopy(conformed_test_records))
        req = CopyRequest(
            name, mem_s, name, db_s, DatabaseTableFormat, test_records_schema
        )
        RecordsToDatabaseTable().copy(req)
        with db_api.execute_sql_result(f"select * from {name}") as res:
            if url.startswith("sqlite"):
                # SQLite has no native date types, so values come back unconformed
                assert [dict(r) for r in res] == test_records
            else:
                assert [dict(r) for r in res] == conformed_test_records
def test_records_to_file():
    dr = tempfile.gettempdir()
    s: Storage = Storage.from_url(f"file://{dr}")
    fs_api: FileSystemStorageApi = s.get_api()
    mem_s = new_local_python_storage()
    mem_api: PythonStorageApi = mem_s.get_api()
    name = f"_test_{rand_str()}"
    obj = [{"f1": "hi", "f2": 2}]
    mem_api.put(name, obj)
    req = CopyRequest(name, mem_s, name, s, CsvFileFormat)
    RecordsToCsvFile().copy(req)
    # Read the CSV back and cast to the test schema to compare with the original object
    with fs_api.open(name, newline="") as f:
        recs = list(read_csv(f))
    handler = get_handler(RecordsFormat, mem_s.storage_engine)
    mem_api.put("output", recs)
    handler().cast_to_schema("output", mem_s, schema=test_records_schema)
    recs = mem_api.get("output")
    assert recs == obj
def get_record_count(self, name: str, storage: Storage) -> Optional[int]:
    # Usually comes directly from the storage engine; only python memory is implemented here
    if storage.storage_engine == LocalPythonStorageEngine:
        obj = storage.get_api().get(name)
        return len(obj)
    raise NotImplementedError
def storage(self) -> Storage:
    return Storage.from_url(self.storage_url)
def as_storage(self) -> Storage:
    return Storage(url=self.url)