def test_records_to_db(url):
    """Copy local-memory records into a temp database table and read them back."""
    storage: Storage = Storage.from_url(url)
    api_cls: Type[DatabaseApi] = storage.storage_engine.get_api_cls()
    # Skip gracefully when the client library for this dialect isn't installed
    if not storage.get_api().dialect_is_supported():
        warnings.warn(
            f"Skipping tests for database engine {storage.storage_engine.__name__} (client library not installed)"
        )
        return
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    with api_cls.temp_local_database() as db_url:
        name = "_test"
        db_api: DatabaseStorageApi = Storage.from_url(db_url).get_api()
        # Records -> database table
        mem_api.put(name, as_records(records))
        conversion = Conversion(
            StorageFormat(LocalPythonStorageEngine, RecordsFormat),
            StorageFormat(storage.storage_engine, DatabaseTableFormat),
        )
        copy_records_to_db.copy(
            name, name, conversion, mem_api, db_api, schema=TestSchema4
        )
        with db_api.execute_sql_result(f"select * from {name}") as res:
            assert [dict(r) for r in res] == records
def test_storage():
    """Storage.from_url maps each URL scheme to its storage engine class."""
    expected_engines = {
        "sqlite://": SqliteStorageEngine,
        "postgres://localhost": PostgresStorageEngine,
        "mysql://localhost": MysqlStorageEngine,
        "file:///": LocalFileSystemStorageEngine,
        "python://": LocalPythonStorageEngine,
    }
    for url, engine in expected_engines.items():
        assert Storage.from_url(url).storage_engine is engine
def test_storage_api():
    """get_api() returns the engine-appropriate storage API instance for each scheme."""
    cases = [
        ("sqlite://", DatabaseStorageApi),
        ("postgres://localhost", PostgresDatabaseStorageApi),
        ("mysql://localhost", MysqlDatabaseStorageApi),
        ("file:///", FileSystemStorageApi),
        ("python://", PythonStorageApi),
    ]
    for url, api_cls in cases:
        assert isinstance(Storage.from_url(url).get_api(), api_cls)
def create_data_block_from_records(
    env: Environment,
    sess: Session,
    local_storage: Storage,
    records: Any,
    nominal_schema: Schema = None,
    inferred_schema: Schema = None,
    created_by_node_key: str = None,
) -> Tuple[DataBlockMetadata, StoredDataBlockMetadata]:
    """Create DataBlock/StoredDataBlock metadata for `records` and store the data locally.

    Resolves nominal, inferred, and realized schemas, conforms the records to the
    realized schema, adds the metadata rows to `sess`, and puts the conformed data
    onto `local_storage` under the stored block's name.
    """
    from snapflow.storage.storage import LocalPythonStorageEngine

    logger.debug("CREATING DATA BLOCK")
    if isinstance(records, MemoryDataRecords):
        mdr = records
        # Important: a schema carried by the records object overrides the nominal arg
        if mdr.nominal_schema is not None:
            nominal_schema = env.get_schema(mdr.nominal_schema, sess)
    else:
        mdr = as_records(records, schema=nominal_schema)
    if not nominal_schema:
        nominal_schema = env.get_schema("Any", sess)
    if not inferred_schema:
        inferred_schema = mdr.data_format.infer_schema_from_records(mdr.records_object)
        env.add_new_generated_schema(inferred_schema, sess)
    realized_schema = cast_to_realized_schema(env, sess, inferred_schema, nominal_schema)
    mdr = mdr.conform_to_schema(realized_schema)
    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key=inferred_schema.key if inferred_schema else None,
        nominal_schema_key=nominal_schema.key,
        realized_schema_key=realized_schema.key,
        record_count=mdr.record_count,
        created_by_node_key=created_by_node_key,
    )
    sdb = StoredDataBlockMetadata(  # type: ignore
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=local_storage.url,
        data_format=mdr.data_format,
    )
    sess.add(block)
    sess.add(sdb)
    local_storage.get_api().put(sdb.get_name(), mdr)
    return block, sdb
def test_database_api_core_operations(url):
    """Exercise exists/record_count/create_alias/copy against a temp database."""
    storage: Storage = Storage.from_url(url)
    api_cls: Type[DatabaseApi] = storage.storage_engine.get_api_cls()
    # Skip when the dialect's client library isn't installed
    if not storage.get_api().dialect_is_supported():
        return
    with api_cls.temp_local_database() as db_url:
        name = "_test"
        api: DatabaseApi = Storage.from_url(db_url).get_api()
        api.execute_sql(f"create table {name} as select 1 a, 2 b")
        assert api.exists(name)
        assert not api.exists(name + "doesntexist")
        assert api.record_count(name) == 1
        api.create_alias(name, name + "alias")
        assert api.record_count(name + "alias") == 1
        api.copy(name, name + "copy")
        assert api.record_count(name + "copy") == 1
def test_filesystem_api_core_operations(url):
    """Exercise put/exists/record_count/create_alias/copy on python memory storage.

    NOTE(review): despite the name, this block uses a PythonStorageApi, not a
    filesystem API, and the name collides with another test of the same name in
    this source — likely copied from a sibling test; confirm intent.
    """
    api: PythonStorageApi = Storage.from_url(url).get_api()
    name = "_test"
    api.put(name, as_records([{"a": 1}, {"b": 2}]))
    assert api.exists(name)
    assert not api.exists(name + "doesntexist")
    assert api.record_count(name) == 2
    # Both derived names should preserve the record count
    for suffix, op in (("alias", api.create_alias), ("copy", api.copy)):
        op(name, name + suffix)
        assert api.record_count(name + suffix) == 2
def make_test_env(**kwargs) -> Environment:
    """Build an Environment on in-memory sqlite with the test schemas registered."""
    if "metadata_storage" not in kwargs:
        kwargs["metadata_storage"] = Storage.from_url("sqlite://")
    env = Environment(**kwargs)
    env.add_module(
        SnapflowModule(
            "_test",
            schemas=[TestSchema1, TestSchema2, TestSchema3, TestSchema4],
        )
    )
    return env
def test_filesystem_api_core_operations(url):
    """Exercise exists/record_count/create_alias/copy on a filesystem storage."""
    api: FileSystemStorageApi = Storage.from_url(url).get_api()
    name = "_test"
    # url[7:] strips the leading "file://" scheme to recover the directory path
    path = os.path.join(url[7:], name)
    with open(path, "w") as f:
        f.writelines(["f1,f2\n", "1,2\n"])
    assert api.exists(name)
    assert not api.exists(name + "doesntexist")
    assert api.record_count(name) == 2
    # Both derived names should preserve the record count
    for suffix, op in (("alias", api.create_alias), ("copy", api.copy)):
        op(name, name + suffix)
        assert api.record_count(name + suffix) == 2
def make_test_run_context(**kwargs) -> RunContext:
    """Build a RunContext wired to a throwaway local python storage and runtime."""
    local = Storage.from_url(url=f"python://_test_default_{rand_str(6)}")
    env = make_test_env()
    defaults = dict(
        graph=Graph(env),
        env=env,
        runtimes=[Runtime.from_storage(local)],
        storages=[local],
        local_python_storage=local,
        target_storage=local,
    )
    defaults.update(**kwargs)
    return RunContext(**defaults)
def load_environment_from_project(project: Any) -> Environment:
    """Build an Environment from a project object's declared storages and modules."""
    from snapflow.storage.storage import Storage

    env = Environment(
        metadata_storage=getattr(project, "metadata_storage", None),
        add_default_python_runtime=getattr(project, "add_default_python_runtime", True),
    )
    for storage_url in getattr(project, "storages", []):
        env.add_storage(Storage.from_url(storage_url))
    for module_name in getattr(project, "modules", []):
        # We hijack the imported module and register it as a snapflow module
        env.add_module(import_module(module_name))  # type: ignore
    return env
def test_db_to_mem(url):
    """Copy a database table into local memory as Records and as a database cursor."""
    storage: Storage = Storage.from_url(url)
    api_cls: Type[DatabaseApi] = storage.storage_engine.get_api_cls()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    # Skip when the dialect's client library isn't installed
    if not storage.get_api().dialect_is_supported():
        return
    with api_cls.temp_local_database() as db_url:
        name = "_test"
        db_api: DatabaseStorageApi = Storage.from_url(db_url).get_api()
        db_api.execute_sql(f"create table {name} as select 1 a, 2 b")
        expected = [{"a": 1, "b": 2}]
        # Records
        conversion = Conversion(
            StorageFormat(storage.storage_engine, DatabaseTableFormat),
            StorageFormat(LocalPythonStorageEngine, RecordsFormat),
        )
        copy_db_to_records.copy(name, name, conversion, db_api, mem_api)
        assert mem_api.get(name).records_object == expected
        # DatabaseCursor
        conversion = Conversion(
            StorageFormat(storage.storage_engine, DatabaseTableFormat),
            StorageFormat(LocalPythonStorageEngine, DatabaseCursorFormat),
        )
        copy_db_to_records.copy(name, name, conversion, db_api, mem_api)
        assert list(mem_api.get(name).records_object) == expected
def __init__(
    self,
    name: str = None,
    metadata_storage: Union["Storage", str] = None,
    add_default_python_runtime: bool = True,
    initial_modules: List[SnapflowModule] = None,  # Defaults to `core` module
):
    """Set up metadata storage, the component library, default modules, and local python storage."""
    from snapflow.core.runtime import Runtime, LocalPythonRuntimeEngine
    from snapflow.storage.storage import Storage, new_local_python_storage
    from snapflow.modules import core

    self.name = name
    if metadata_storage is None:
        metadata_storage = DEFAULT_METADATA_STORAGE_URL
        logger.warning(
            f"No metadata storage specified, using default sqlite db `{DEFAULT_METADATA_STORAGE_URL}`"
        )
    # Accept either a Storage instance or a URL string
    if isinstance(metadata_storage, str):
        metadata_storage = Storage.from_url(metadata_storage)
    if metadata_storage is None:
        # Defensive guard; unreachable given the default assignment above
        raise Exception("Must specify metadata_storage or allow default")
    self.metadata_storage = metadata_storage
    self.initialize_metadata_database()
    self._local_module = DEFAULT_LOCAL_MODULE
    self.library = ComponentLibrary()
    self.storages = []
    self.runtimes = []
    self._metadata_sessions: List[Session] = []
    if initial_modules is None:
        initial_modules = [core]
    for m in initial_modules:
        self.add_module(m)
    # Every environment gets a local python storage plus its matching runtime
    self._local_python_storage = new_local_python_storage()
    self.add_storage(self._local_python_storage)
    self.runtimes.append(Runtime.from_storage(self._local_python_storage))
def test_file_to_mem():
    """Copy a delimited file from filesystem storage into memory as Records."""
    tmp = tempfile.gettempdir()
    storage: Storage = Storage.from_url(f"file://{tmp}")
    fs_api: FileSystemStorageApi = storage.get_api()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    name = "_test"
    fs_api.write_lines_to_file(name, ["f1,f2", "hi,2"])
    # Records
    expected = [{"f1": "hi", "f2": 2}]
    conversion = Conversion(
        StorageFormat(storage.storage_engine, DelimitedFileFormat),
        StorageFormat(LocalPythonStorageEngine, RecordsFormat),
    )
    copy_delim_file_to_records.copy(
        name, name, conversion, fs_api, mem_api, schema=TestSchema4
    )
    assert mem_api.get(name).records_object == expected
def test_obj_to_file():
    """Copy an in-memory delimited file object out to a file on disk."""
    tmp = tempfile.gettempdir()
    storage: Storage = Storage.from_url(f"file://{tmp}")
    fs_api: FileSystemStorageApi = storage.get_api()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    name = "_test"
    fmt = DelimitedFileObjectFormat

    def make_obj():
        # Fresh StringIO per call so a read doesn't exhaust a shared buffer
        return StringIO("f1,f2\nhi,2")

    mem_api.put(name, as_records(make_obj(), data_format=fmt))
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, fmt),
        StorageFormat(storage.storage_engine, DelimitedFileFormat),
    )
    copy_file_object_to_delim_file.copy(
        name, name, conversion, mem_api, fs_api, schema=TestSchema4
    )
    with fs_api.open(name) as f:
        assert f.read() == make_obj().read()
def test_records_to_file():
    """Copy in-memory Records out to a delimited file and read them back."""
    tmp = tempfile.gettempdir()
    storage: Storage = Storage.from_url(f"file://{tmp}")
    fs_api: FileSystemStorageApi = storage.get_api()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    name = "_test"
    expected = [{"f1": "hi", "f2": 2}]
    mem_api.put(name, as_records(expected, data_format=RecordsFormat))
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, RecordsFormat),
        StorageFormat(storage.storage_engine, DelimitedFileFormat),
    )
    copy_records_to_delim_file.copy(
        name, name, conversion, mem_api, fs_api, schema=TestSchema4
    )
    with fs_api.open(name) as f:
        # Conform parsed csv strings back to the schema's types before comparing
        recs = RecordsFormat.conform_records_to_schema(list(read_csv(f)), TestSchema4)
    assert recs == expected
def add_storage(
    self, storage_like: Union[Storage, str], add_runtime: bool = True
) -> Storage:
    """Register a storage with this environment, idempotently by URL.

    Args:
        storage_like: A Storage instance or a storage URL string.
        add_runtime: When True, also register a matching Runtime if the
            storage engine supports one.

    Returns:
        The registered Storage (the existing instance if the URL was
        already registered).

    Raises:
        TypeError: If `storage_like` is neither a Storage nor a str.
    """
    from snapflow.storage.storage import Storage

    if isinstance(storage_like, str):
        sr = Storage.from_url(storage_like)
    elif isinstance(storage_like, Storage):
        sr = storage_like
    else:
        # Informative message instead of a bare TypeError for easier debugging
        raise TypeError(
            f"Expected Storage instance or url str, got {type(storage_like).__name__}"
        )
    # Idempotent: an already-registered URL returns the existing instance
    for s in self.storages:
        if s.url == sr.url:
            return s
    self.storages.append(sr)
    if add_runtime:
        from snapflow.core.runtime import Runtime

        try:
            rt = Runtime.from_storage(sr)
            self.runtimes.append(rt)
        except ValueError:
            # Storage engine has no associated runtime; best-effort only
            pass
    return sr
def storage(self) -> Storage:
    """Resolve this record's `storage_url` into a Storage instance."""
    from snapflow.storage.storage import Storage

    return Storage.from_url(self.storage_url)
def as_storage(self) -> Storage:
    """View this runtime as a Storage on its engine's natural storage engine."""
    natural_engine = self.runtime_engine.natural_storage_engine
    return Storage(url=self.url, storage_engine=natural_engine)