def copy_delim_file_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    """Parse a delimited file on filesystem storage into in-memory Records.

    Reads every line of `from_name`, runs it through `read_csv`, conforms the
    parsed rows to `schema`, and stores the result under `to_name` in python
    (in-memory) storage.
    """
    assert isinstance(from_storage_api, FileSystemStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    with from_storage_api.open(from_name) as f:
        # Materialize rows inside the `with` so the file can be closed safely.
        parsed_rows = list(read_csv(f.readlines()))
    records_mdr = as_records(parsed_rows, data_format=RecordsFormat, schema=schema)
    to_storage_api.put(to_name, records_mdr.conform_to_schema())
def copy_file_object_iterator_to_records_iterator(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    """Lazily convert an iterator of delimited-file chunks into a Records iterator.

    Each chunk is re-headed via `with_header` and parsed with `read_csv`; the
    resulting generator is wrapped as a RecordsIterator and stored under
    `to_name` without materializing the data.
    """
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    source_mdr = from_storage_api.get(from_name)
    parsed_chunks = (
        read_csv(chunk) for chunk in with_header(source_mdr.records_object)
    )
    result_mdr = as_records(
        parsed_chunks, data_format=RecordsIteratorFormat, schema=schema
    )
    to_storage_api.put(to_name, result_mdr.conform_to_schema())
def copy_df_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    """Convert a stored DataFrame into Records in python (in-memory) storage.

    Delegates the row conversion to `dataframe_to_records`, conforms the result
    to `schema`, and stores it under `to_name`.
    """
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    source_mdr = from_storage_api.get(from_name)
    converted = dataframe_to_records(source_mdr.records_object, schema)
    result_mdr = as_records(converted, data_format=RecordsFormat, schema=schema)
    to_storage_api.put(to_name, result_mdr.conform_to_schema())
def copy_file_object_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
):
    """Parse a stored delimited file object into Records.

    Runs the stored file object through `read_csv`, conforms the parsed rows to
    `schema`, and stores them under `to_name` in python (in-memory) storage.
    """
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    source_mdr = from_storage_api.get(from_name)
    parsed = read_csv(source_mdr.records_object)
    result_mdr = as_records(parsed, data_format=RecordsFormat, schema=schema)
    to_storage_api.put(to_name, result_mdr.conform_to_schema())
def copy_df_iterator_to_records_iterator(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    """Lazily convert a DataFrame iterator into a Records iterator.

    Each DataFrame is passed through `dataframe_to_records`; the resulting
    generator is stored under `to_name` without materializing the frames.
    """
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    source_mdr = from_storage_api.get(from_name)
    converted_frames = (
        dataframe_to_records(frame, schema) for frame in source_mdr.records_object
    )
    result_mdr = as_records(
        converted_frames, data_format=RecordsIteratorFormat, schema=schema
    )
    to_storage_api.put(to_name, result_mdr.conform_to_schema())
def copy_delim_file_to_file_object(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    """Wrap a delimited file from filesystem storage as a file-object MDR.

    Unlike the parsing copiers, this stores the raw open file handle itself
    (DelimitedFileObjectFormat) rather than parsed records.
    """
    assert isinstance(from_storage_api, FileSystemStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    with from_storage_api.open(from_name) as f:
        # NOTE(review): the open handle `f` is stored in the MDR; once this
        # `with` block exits the handle is closed, so consumers of `to_name`
        # presumably must read it before this function returns — or the put
        # is intended to happen inside the `with`. TODO confirm intended
        # handle lifetime against callers.
        mdr = as_records(f, data_format=DelimitedFileObjectFormat, schema=schema)
        mdr = mdr.conform_to_schema()
        to_storage_api.put(to_name, mdr)
def copy_db_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    """Select all rows of a database table into in-memory Records.

    Executes `select * from {from_name}`, converts the result proxy to plain
    records, conforms them to `schema`, and stores them under `to_name`.
    """
    assert isinstance(from_storage_api, DatabaseStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    select_sql = f"select * from {from_name}"
    with from_storage_api.execute_sql_result(select_sql) as result:
        # Materialize rows before the result context closes.
        fetched = result_proxy_to_records(result)
    records_mdr = as_records(fetched, data_format=RecordsFormat, schema=schema)
    to_storage_api.put(to_name, records_mdr.conform_to_schema())
def copy_records_iterator_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    """Materialize a Records iterator into a single flat Records list.

    Consumes every chunk of records from the stored iterator, flattens them
    into one list, conforms to `schema`, and stores under `to_name`.
    """
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    # Flattening comprehension replaces the manual append/extend loop.
    all_records = [record for chunk in mdr.records_object for record in chunk]
    to_mdr = as_records(all_records, data_format=RecordsFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
def copy_db_to_cursor(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    """Store a live database cursor over `select * from {from_name}`.

    The cursor (DatabaseCursorFormat) is stored unconsumed so downstream code
    can stream rows; consequently the underlying connection is deliberately
    left open (see inline comment/TODO below).
    """
    assert isinstance(from_storage_api, DatabaseStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    select_sql = f"select * from {from_name}"
    conn = (
        from_storage_api.get_engine().connect()
    )  # Gonna leave this connection hanging...
    # TODO: add "closeable" to the MDR and handle?
    # NOTE(review): known resource leak — the connection is never closed
    # because the cursor must stay usable after this function returns.
    r = conn.execute(select_sql)
    mdr = as_records(r, data_format=DatabaseCursorFormat, schema=schema)
    mdr = mdr.conform_to_schema()
    to_storage_api.put(to_name, mdr)
def copy_file_object_to_records_iterator(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
    chunk_size: int = 1000,
):
    """Lazily parse a stored delimited file object into a Records iterator.

    The file object is read in `chunk_size`-line chunks; each chunk is
    re-headed (delimited chunks must each carry the header to parse) and run
    through `read_csv`. The resulting generator is stored under `to_name`.

    `chunk_size` defaults to 1000, preserving the previous hard-coded value
    (the old TODO asked for it to be configurable).
    """
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    # Note: must keep header on each chunk when iterating delimited file object!
    itr = (
        read_csv(chunk)
        for chunk in with_header(iterate_chunks(mdr.records_object, chunk_size))
    )
    to_mdr = as_records(itr, data_format=RecordsIteratorFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
def copy_dataframe_iterator_to_dataframe(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    """Materialize a DataFrame iterator into one concatenated DataFrame.

    Consumes the stored iterator of DataFrames, concatenates them with
    `pd.concat`, conforms the result to `schema`, and stores it under
    `to_name`.
    """
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    # pd.concat accepts any iterable of frames — no need for a manual
    # append loop to build an intermediate list first.
    combined = pd.concat(mdr.records_object)
    to_mdr = as_records(combined, data_format=DataFrameFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)