def test_new_metadata():
    """Metadata passed to the merge is available on the merged datapackage."""
    merging_dir = fixture_dir / "merging"
    first_package = load_datapackage(ZipFS(str(merging_dir / "merging_first.zip")))
    second_package = load_datapackage(ZipFS(str(merging_dir / "merging_second.zip")))

    alternating_mask = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=bool)
    custom_metadata = {
        "name": "something something",
        "id_": "danger zone",
        "combinatorial": True,
        "sequential": False,
        "seed": 2000,
        "foo bar baz": True,
    }

    merged = merge_datapackages_with_mask(
        first_dp=first_package,
        first_resource_group_label="sa-data-vector",
        second_dp=second_package,
        second_resource_group_label="sa-data-array",
        mask_array=alternating_mask,
        metadata=custom_metadata,
    )

    observed = merged.metadata
    # The name is expected back with an underscore, and ``id_`` under ``id``.
    assert observed["name"] == "something_something"
    assert observed["id"] == "danger zone"
    assert observed["combinatorial"]
    assert not observed["sequential"]
    assert observed["seed"] == 2000
    assert observed["foo bar baz"]
def test_basic_merging_functionality():
    """Merging with an alternating mask keeps five rows from each source group."""
    merging_dir = fixture_dir / "merging"
    first_package = load_datapackage(ZipFS(str(merging_dir / "merging_first.zip")))
    second_package = load_datapackage(ZipFS(str(merging_dir / "merging_second.zip")))
    alternating_mask = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=bool)

    merged = merge_datapackages_with_mask(
        first_dp=first_package,
        first_resource_group_label="sa-data-vector",
        second_dp=second_package,
        second_resource_group_label="sa-data-array",
        mask_array=alternating_mask,
    )

    # Without an explicit output filesystem the result lives in memory.
    assert isinstance(merged, DatapackageBase)
    assert isinstance(merged.fs, MemoryFS)
    assert len(merged.resources) == 5

    vector_data, vector_meta = merged.get_resource("sa-data-vector.data")
    assert vector_meta["name"] == "sa-data-vector.data"
    assert vector_meta["path"] == "sa-data-vector.data.npy"
    assert vector_meta["group"] == "sa-data-vector"
    assert vector_meta["nrows"] == 5
    assert np.allclose(vector_data, np.array([0, 2, 4, 6, 8]))

    array_data, array_meta = merged.get_resource("sa-data-array.data")
    assert array_meta["name"] == "sa-data-array.data"
    assert array_meta["path"] == "sa-data-array.data.npy"
    assert array_meta["group"] == "sa-data-array"
    assert array_meta["nrows"] == 5
    assert array_data.shape == (5, 10)
    assert np.allclose(array_data[:, 0], np.array([1, 3, 5, 7, 9]) + 10)
def extract_file_from_zip(cache_fs, cache_path, url, fn_pattern=None):
    """
    Return a ``DelayedOpen`` for one member of a cached zip archive.

    If no pattern is supplied and ``url`` contains a ``#``, the text after the
    first ``#`` is used as the pattern. Without any pattern the first file
    found while walking the archive is selected; with a pattern, the first
    file whose path matches it (``re.search``) is selected, skipping paths
    that contain ``_MACOSX``.

    :param cache_fs: filesystem object that holds the cached archive.
    :param cache_path: path of the archive inside ``cache_fs``.
    :param url: source URL; its fragment may carry the file pattern.
    :param fn_pattern: optional regular expression selecting the member.
    :return: a ``DelayedOpen`` wrapping the selected archive member.
    :raises ConfigurationError: if the archive has no files, or no file
        matches the pattern.
    """
    from fs.zipfs import ZipOpenError

    # FIXME Not sure what is going on here, but in multiprocessing mode,
    # the 'try' version of opening the file can fail with an error about the
    # file being missing or corrupt, but the second succeeds. However, the
    # second will fail in test environments that have a memory cache.
    try:
        fs = ZipFS(cache_fs.open(cache_path, 'rb'))
    except ZipOpenError:
        fs = ZipFS(cache_fs.getsyspath(cache_path))

    def walk_all(archive):
        # Flatten the walk into a list of full file paths.
        return [join(e[0], x) for e in archive.walk() for x in e[1]]

    if not fn_pattern and '#' in url:
        # Split on the first '#' only, so a URL holding several '#'
        # characters no longer fails tuple unpacking with a ValueError.
        _, _, fn_pattern = url.partition('#')

    members = walk_all(fs)
    fstor = None

    if not fn_pattern:
        if not members:
            # Previously an empty archive surfaced as a bare IndexError.
            raise ConfigurationError(
                "Failed to get file from archive {}: archive contains no files".format(fs))
        fstor = DelayedOpen(fs, members[0], 'rU', container=(cache_fs, cache_path))
    else:
        for file_name in members:
            if '_MACOSX' in file_name:
                continue
            if re.search(fn_pattern, file_name):
                fstor = DelayedOpen(fs, file_name, 'rb', container=(cache_fs, cache_path))
                break

    if not fstor:
        raise ConfigurationError(
            "Failed to get file for pattern '{}' from archive {}".format(fn_pattern, fs))

    return fstor
def test_shape_mismatch_mask():
    """An eight-element mask is rejected with ``LengthMismatch``."""
    merging_dir = fixture_dir / "merging"
    first_package = load_datapackage(ZipFS(str(merging_dir / "merging_first.zip")))
    second_package = load_datapackage(ZipFS(str(merging_dir / "merging_second.zip")))

    merge_kwargs = dict(
        first_dp=first_package,
        first_resource_group_label="sa-data-vector",
        second_dp=second_package,
        second_resource_group_label="sa-data-array",
    )
    with pytest.raises(LengthMismatch):
        merge_datapackages_with_mask(
            mask_array=np.array([1, 0, 1, 0, 1, 0, 1, 0], dtype=bool),
            **merge_kwargs
        )
def test_write_new_datapackage():
    """Merging into an ``OSFS`` target gives a datapackage that reloads from disk."""
    merging_dir = fixture_dir / "merging"
    first_package = load_datapackage(ZipFS(str(merging_dir / "merging_first.zip")))
    second_package = load_datapackage(ZipFS(str(merging_dir / "merging_second.zip")))

    with tempfile.TemporaryDirectory() as scratch_dir:
        merge_datapackages_with_mask(
            first_dp=first_package,
            first_resource_group_label="sa-data-vector",
            second_dp=second_package,
            second_resource_group_label="sa-data-array",
            mask_array=np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=bool),
            output_fs=OSFS(scratch_dir),
        )

        # Only the copy read back from the directory is inspected.
        reloaded = load_datapackage(OSFS(scratch_dir))

        assert isinstance(reloaded, DatapackageBase)
        assert not isinstance(reloaded.fs, MemoryFS)
        assert len(reloaded.resources) == 5

        for suffix in {"indices", "data", "distributions", "flip"}:
            vector_label = f"sa-data-vector.{suffix}"
            try:
                values, meta = reloaded.get_resource(vector_label)
            except KeyError:
                # A missing vector resource skips this suffix entirely.
                continue

            assert meta["name"] == vector_label
            assert meta["path"] == f"sa-data-vector.{suffix}.npy"
            assert meta["group"] == "sa-data-vector"
            assert meta["nrows"] == 5
            if suffix == "data":
                assert np.allclose(values, np.array([0, 2, 4, 6, 8]))

            array_label = f"sa-data-array.{suffix}"
            try:
                values, meta = reloaded.get_resource(array_label)
            except KeyError:
                continue

            assert meta["name"] == array_label
            assert meta["path"] == f"sa-data-array.{suffix}.npy"
            assert meta["group"] == "sa-data-array"
            assert meta["nrows"] == 5
            if suffix == "data":
                assert values.shape == (5, 10)
                assert np.allclose(values[:, 0], np.array([1, 3, 5, 7, 9]) + 10)
def generate_local_sa_biosphere_datapackage(cutoff=1e-4, const_factor=10):
    """Write the ``local-sa-biosphere.zip`` datapackage under ``DATA_DIR``.

    The uncertain biosphere exchanges selected with ``cutoff`` provide the
    amounts and the (input id, output id) indices; ``const_factor`` scales
    the diagonal matrix that the amounts are multiplied with.
    """
    lca = setup_bw_project_archetypes()
    exchanges = filter_uncertain_biosphere_exchanges(lca, cutoff)

    archive_path = DATA_DIR / "local-sa-biosphere.zip"
    dp = bwp.create_datapackage(
        fs=ZipFS(str(archive_path), write=True),
        name="local sa biosphere",
    )

    amounts = np.array([exc.amount for exc in exchanges])
    num_samples = len(amounts)

    # NOTE(review): np.tile(amounts, num_samples) is 1-D with num_samples**2
    # entries, which only broadcasts against the (num_samples, num_samples)
    # diagonal when num_samples == 1 -- confirm the intended shape.
    data_array = np.tile(amounts, num_samples) * (
        np.diag(np.ones(num_samples) * const_factor)
    )

    id_pairs = [(exc.input.id, exc.output.id) for exc in exchanges]
    indices_array = np.array(id_pairs, dtype=bwp.INDICES_DTYPE)

    # All inputs -> all True
    flip_array = np.ones(len(indices_array), dtype=bool)

    dp.add_persistent_array(
        matrix="biosphere_matrix",
        data_array=data_array,
        name="local sa biosphere",
        indices_array=indices_array,
        flip_array=flip_array,
    )
    dp.finalize_serialization()
def test_processed_array():
    """A single production exchange appears in the processed technosphere arrays."""
    production_exchange = {
        "input": ("a database", "2"),
        "amount": 42,
        "uncertainty_type": 7,
        "type": "production",
    }
    database = DatabaseChooser("a database")
    database.write(
        {("a database", "2"): {"type": "process", "exchanges": [production_exchange]}}
    )

    package = load_datapackage(ZipFS(database.filepath_processed()))
    print(package.resources)

    amounts, _ = package.get_resource("a_database_technosphere_matrix.data")
    assert amounts.shape == (1,)
    assert amounts[0] == 42

    distributions, _ = package.get_resource(
        "a_database_technosphere_matrix.distributions"
    )
    assert distributions.shape == (1,)
    assert distributions[0]["uncertainty_type"] == 7
def empty_biosphere():
    """Write the ``empty_biosphere.zip`` fixture into ``fixture_dir``.

    The package holds one technosphere vector (three rows) and one
    characterization vector (one row).
    """
    # Flow 1: The flow
    # Activity 1: The activity
    archive_path = fixture_dir / "empty_biosphere.zip"
    package = create_datapackage(fs=ZipFS(str(archive_path), write=True))

    technosphere_indices = np.array([(2, 1), (1, 1), (2, 2)], dtype=INDICES_DTYPE)
    package.add_persistent_vector(
        matrix="technosphere_matrix",
        data_array=np.array([1, 2, 3]),
        name="eb-technosphere",
        indices_array=technosphere_indices,
        nrows=3,
        flip_array=np.array([1, 0, 0], dtype=bool),
    )

    characterization_indices = np.array([(1, 0)], dtype=INDICES_DTYPE)
    package.add_persistent_vector(
        matrix="characterization_matrix",
        data_array=np.array([1]),
        name="eb-characterization",
        indices_array=characterization_indices,
        global_index=0,
        nrows=1,
    )

    package.finalize_serialization()
def process(self, **extra_metadata):
    """
    Serialize this object's data into a processed datapackage zip archive.

    The data returned by ``self.load()`` is converted row by row with
    ``self.process_row`` and stored as one persistent vector for
    ``self.matrix``. The archive is written to ``self.filepath_processed()``.

    Any ``extra_metadata`` keyword arguments are forwarded unchanged to
    ``add_persistent_vector_from_iterator``.

    Doesn't return anything, but writes a file to disk.
    """
    rows = self.load()

    target_fs = ZipFS(str(self.filepath_processed()), write=True)
    datapackage = create_datapackage(
        fs=target_fs,
        name=self.filename_processed(),
        sum_intra_duplicates=True,
        sum_inter_duplicates=False,
    )

    datapackage.add_persistent_vector_from_iterator(
        matrix=self.matrix,
        name=clean_datapackage_name(str(self.name) + " matrix data"),
        dict_iterator=(self.process_row(entry) for entry in rows),
        nrows=len(rows),
        **extra_metadata
    )
    datapackage.finalize_serialization()
def get_fs(cls, registry, fs_name, fs_name_params, fs_path, writeable, create_dir):
    """Open the zip archive named by ``fs_path`` and return it as a ``ZipFS``.

    :param registry: opener registry; ``registry.parse(fs_path)`` must return
        the containing filesystem and the archive path inside it.
    :param fs_name: opener name; a name ending in ``64`` enables zip64.
    :param fs_name_params: unused here.
    :param fs_path: path (possibly with credentials) identifying the archive.
    :param writeable: open the archive for appending instead of reading.
    :param create_dir: unused here.
    :return: ``(zipfs, None)``.
    :raises OpenerError: if ``fs_path`` does not resolve to a file path.
    """
    zip_fs, zip_path = registry.parse(fs_path)
    if zip_path is None:
        raise OpenerError('File required for zip opener')

    if zip_fs.exists(zip_path):
        open_mode = 'r+b' if writeable else 'rb'
    else:
        # Zip archives are binary data: create the new file in binary mode
        # (this used to be the text mode 'w+').
        open_mode = 'w+b'

    # Prefer a real system path; otherwise hand ZipFS an open file object.
    if zip_fs.hassyspath(zip_path):
        zip_file = zip_fs.getsyspath(zip_path)
    else:
        zip_file = zip_fs.open(zip_path, mode=open_mode)

    _username, _password, fs_path = _parse_credentials(fs_path)

    from fs.zipfs import ZipFS

    if zip_file is None:
        zip_file = fs_path

    mode = 'a' if writeable else 'r'
    allow_zip_64 = fs_name.endswith('64')

    zipfs = ZipFS(zip_file, mode=mode, allow_zip_64=allow_zip_64)
    return zipfs, None
def test_ordering():
    """Group labels of the mapped matrix come out in the expected order."""
    archive_names = ("b-second.zip", "a-first.zip")
    packages = [load_datapackage(ZipFS(dirpath / name)) for name in archive_names]

    for package in packages:
        package.rehydrate_interface("w-fourth", Interface())
        print(list(package.groups))

    mapped = MappedMatrix(packages=packages, matrix="matrix-a")
    observed_labels = [grp.label for grp in mapped.groups]

    assert observed_labels == [
        "y-second",
        "w-fourth",
        "y-second",
        "w-fourth",
    ]
def copy_dataset_to_mem_fs(mem_fs, dataset_zip_file_path):
    """Copy the contents of a zipped dataset into ``mem_fs``.

    The zip file is opened through the filesystem of its parent location and
    its root directory is copied to the root of ``mem_fs``.
    """
    tf.logging.info('Copying dataset to in-memory filesystem.')
    parent_location, zip_name = os.path.split(dataset_zip_file_path)

    # The parent location could be local or GCS.
    with fs.open_fs(parent_location) as host_fs, \
            host_fs.open(zip_name, 'rb') as zip_handle, \
            ZipFS(zip_handle) as archive_fs:
        fs.copy.copy_dir(archive_fs, '.', mem_fs, '.')
def set(self, key, value):
    """Pickle ``value`` into the cache entry ``<key>.dat`` under ``self.cache_dir``.

    ``key`` is first formatted with the current account DB id. When the
    entry's parent directory name ends in ``.zip`` the entry is stored inside
    that zip archive, and an entry that already exists there is left alone.
    Errors are logged and not raised; an empty key is a no-op.
    """
    store = None
    try:
        if not key:
            return

        resolved_key = key.format(accountDBID=utils.getAccountDBID())
        file_path = os.path.join(self.cache_dir, '{0}.dat'.format(resolved_key))
        container_path, entry_name = os.path.split(file_path)
        container_name = os.path.basename(container_path)

        if container_name.lower().endswith('.zip'):
            store = ZipFS(container_path, mode='a', compression='stored')
            if store.exists(entry_name):
                log('[WARNING] archive "{}" already contains file "{}". Do not save the new data.'
                    .format(container_name, entry_name))
                return
        else:
            store = OSFS(container_path, create=True)

        store.setcontents(entry_name, cPickle.dumps(value))
    except Exception:
        err(traceback.format_exc())
    finally:
        # Runs for the early returns as well, so the store is always closed.
        if store is not None:
            store.close()
def create_ordering_datapackages():
    """Write the ``a-first.zip`` and ``b-second.zip`` fixtures into ``dirpath``."""
    fixture_specs = (
        ("a-first.zip", "test-fixture-a", "fixture-a"),
        ("b-second.zip", "test-fixture-b", "fixture-b"),
    )
    for archive_name, package_name, package_id in fixture_specs:
        package = create_datapackage(
            fs=ZipFS(str(dirpath / archive_name), write=True),
            name=package_name,
            id_=package_id,
        )
        add_data(package)
        package.finalize_serialization()
def get(self, key, default):
    """Return the unpickled cache entry ``<key>.dat``, or ``default``.

    ``default`` is returned when the entry's parent directory or the entry
    itself does not exist, and also when anything goes wrong (the error is
    logged). A broken entry stored outside a zip archive is removed.
    """
    # NOTE(review): unlike ``set``, the key is not formatted with the account
    # DB id here -- confirm callers pass an already-resolved key.
    store = None
    try:
        file_path = os.path.join(self.cache_dir, '{0}.dat'.format(key))
        container_path, entry_name = os.path.split(file_path)
        is_archive = os.path.basename(container_path).lower().endswith('.zip')

        if not os.path.exists(container_path):
            return default

        if is_archive:
            store = ZipFS(container_path, mode='r', compression='stored')
        else:
            store = OSFS(container_path, create=True)

        if not store.exists(entry_name):
            return default

        try:
            # NOTE(review): unpickling trusts whatever is in the cache.
            return cPickle.loads(store.getcontents(entry_name))
        except Exception:
            if is_archive:
                log('[WARNING] Broken file: %s' % file_path)
            else:
                log('[WARNING] Remove broken file: %s' % file_path)
                store.remove(entry_name)
            # Re-raised so the outer handler logs it and returns the default.
            raise
    except Exception:
        err(traceback.format_exc())
        return default
    finally:
        if store is not None:
            store.close()
def test_url_on_sys_path(self):
    """Modules in a zip archive import via a ``zip://`` entry on ``sys.path``."""
    scratch = TempFS()
    archive_path = scratch.getsyspath("modules.zip")

    # Write the modules into a fresh archive.
    writer = ZipFS(archive_path, "w")
    self._init_modules(writer)
    writer.close()

    # Re-open read-only to confirm the archive content.
    reader = ZipFS(archive_path, "r")
    assert reader.isfile("fsih_hello.py")
    reader.close()

    sys.path.append("zip://" + archive_path)
    FSImportHook.install()
    try:
        self._check_imports_are_working()
    finally:
        sys.path_hooks.remove(FSImportHook)
        sys.path.pop()
        scratch.close()
def test_process_without_exchanges_still_in_processed_array():
    """A dataset without exchanges still yields one technosphere entry of 1."""
    database = DatabaseChooser("a database")
    database.write({("a database", "foo"): {}})

    package = load_datapackage(ZipFS(database.filepath_processed()))
    amounts, _ = package.get_resource("a_database_technosphere_matrix.data")

    assert amounts[0] == 1
    assert amounts.shape == (1,)
def upload_docs(self, lib_name, lib_version):
    """Build the library's docs, zip them and upload the zip to the docs URL.

    ``lib_name`` is replaced by the long name of the library built from
    ``self.args.location``. The extracted docs are zipped into a temporary
    file, which is posted to the ``docs_url`` reported by the
    ``package.get-upload-info`` call.

    :param lib_name: library name (overridden by the built library's name).
    :param lib_version: version string sent with the upload.
    :raises CommandError: if the server does not answer with status 200, or
        does not report ``success`` in its result header.
    """
    import os

    args = self.args

    archive, lib = build.build_lib(args.location, ignore_errors=True)
    lib_name = lib.long_name

    from ..docgen.extracter import Extracter

    # mkstemp returns an open descriptor; close it right away because only
    # the file name is used, then make sure the file is removed afterwards.
    temp_fd, temp_filename = tempfile.mkstemp('moyadocs')
    os.close(temp_fd)
    try:
        extract_fs = TempFS('moyadoc-{}'.format(lib_name))
        try:
            extracter = Extracter(archive, extract_fs)
            extracter.extract_lib(lib_name)

            with ZipFS(temp_filename, 'w') as docs_zip_fs:
                fs.copy.copy_dir(extract_fs, '/', docs_zip_fs, '/')
        finally:
            # The extracted tree is only needed until it has been zipped.
            extract_fs.close()

        package_filename = "{}-{}.docs.zip".format(lib_name, lib_version)

        upload_info = self.call('package.get-upload-info')
        docs_url = upload_info['docs_url']

        self.console("uploading '{}'...".format(package_filename)).nl()

        with io.open(temp_filename, 'rb') as package_file:
            files = [('file', (package_filename, package_file, 'application/octet-stream'))]
            data = {
                "auth": self.auth_token,
                "package": lib_name,
                "version": lib_version
            }
            # NOTE(review): verify=False disables TLS certificate checks for
            # a request that carries the auth token -- confirm this is wanted.
            response = requests.post(docs_url,
                                     verify=False,
                                     files=files,
                                     data=data,
                                     hooks={})
    finally:
        try:
            os.remove(temp_filename)
        except OSError:
            # Best effort: a leftover temp file must not mask the real error.
            pass

    if response.status_code != 200:
        raise CommandError(
            "upload failed -- server returned {} response".format(
                response.status_code))

    message = decode_utf8_bytes(
        response.headers.get('moya-upload-package-message', ''))
    result = decode_utf8_bytes(
        response.headers.get('moya-upload-package-result', ''))

    # A second success check used to follow here; its failure branch could
    # never run because a non-success result has already raised.
    if result != 'success':
        raise CommandError('upload error ({})'.format(message))
    self.server_response(message, fg="green")
def test_integration_test_new_zipfile():
    """A datapackage written to a new zip file passes checks before and after reload."""
    with tempfile.TemporaryDirectory() as scratch_dir:
        archive_path = str(Path(scratch_dir) / "foo.zip")

        created = create_datapackage(
            fs=ZipFS(archive_path, write=True),
            name="test-fixture",
            id_="fixture-42",
        )
        add_data(created)
        created.finalize_serialization()

        check_metadata(created)
        check_data(created)

        reloaded = load_datapackage(ZipFS(archive_path, write=False))

        check_metadata(reloaded, False)
        check_data(reloaded)
def GetZipFile(_self, _zipfile):
    """Return the ``ZipFS`` for ``_zipfile``, opening it on first request.

    Opened archives are kept in ``_self.m_ZipFileList`` keyed by
    ``_zipfile``, so later requests for the same key reuse the same object.
    """
    projectsFS = _self.m_ZipFileList.get(_zipfile)
    # Identity check: a cached object must never be mistaken for "missing"
    # because of how it implements equality.
    if projectsFS is None:
        print("ZipFileManager::GetZipFile " + str(_zipfile))
        projectsFS = ZipFS(_zipfile)
        _self.m_ZipFileList[_zipfile] = projectsFS
    return projectsFS
def test_group_ordering_consistent():
    """Groups of the test fixture are listed in a fixed order."""
    expected_order = [
        "sa-data-vector-from-dict",
        "sa-data-vector",
        "sa-data-array",
        "sa-vector-interface",
        "sa-array-interface",
    ]
    package = load_datapackage(ZipFS(dirpath / "test-fixture.zip"))
    assert list(package.groups) == expected_order
def test_default_metadata():
    """Without explicit metadata the merged package has a name and id, and falsy flags."""
    merging_dir = fixture_dir / "merging"
    first_package = load_datapackage(ZipFS(str(merging_dir / "merging_first.zip")))
    second_package = load_datapackage(ZipFS(str(merging_dir / "merging_second.zip")))

    merged = merge_datapackages_with_mask(
        first_dp=first_package,
        first_resource_group_label="sa-data-vector",
        second_dp=second_package,
        second_resource_group_label="sa-data-array",
        mask_array=np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=bool),
    )

    observed = merged.metadata
    assert observed["name"]
    assert observed["id"]
    assert not observed["combinatorial"]
    assert not observed["sequential"]
    assert not observed["seed"]
def test_database_process_adds_correct_geo(add_biosphere):
    """The geomapping indices of the processed ``food`` database include CA and CH."""
    database = Database("food")
    database.write(food)

    package = load_datapackage(ZipFS(database.filepath_processed()))
    indices, _ = package.get_resource("food_inventory_geomapping_matrix.indices")

    mapped_columns = indices["col"].tolist()
    assert geomapping["CA"] in mapped_columns
    assert geomapping["CH"] in mapped_columns
def test_add_suffix():
    """Merging two groups with the same label warns and yields ``same_true``/``same_false``."""
    merging_dir = fixture_dir / "merging"
    first_package = load_datapackage(ZipFS(str(merging_dir / "merging_same_1.zip")))
    second_package = load_datapackage(ZipFS(str(merging_dir / "merging_same_2.zip")))
    alternating_mask = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=bool)

    with pytest.warns(UserWarning):
        merged = merge_datapackages_with_mask(
            first_dp=first_package,
            first_resource_group_label="same",
            second_dp=second_package,
            second_resource_group_label="same",
            mask_array=alternating_mask,
        )

    assert isinstance(merged, DatapackageBase)
    assert len(merged.resources) == 5

    for suffix in {"indices", "data", "distributions", "flip"}:
        true_label = f"same_true.{suffix}"
        try:
            values, meta = merged.get_resource(true_label)
        except KeyError:
            # A missing ``same_true`` resource skips this suffix entirely.
            continue

        assert meta["name"] == true_label
        assert meta["path"] == f"same_true.{suffix}.npy"
        assert meta["group"] == "same_true"
        assert meta["nrows"] == 5
        if suffix == "data":
            assert np.allclose(values, np.array([0, 2, 4, 6, 8]))

        false_label = f"same_false.{suffix}"
        try:
            values, meta = merged.get_resource(false_label)
        except KeyError:
            continue

        assert meta["name"] == false_label
        assert meta["path"] == f"same_false.{suffix}.npy"
        assert meta["group"] == "same_false"
        assert meta["nrows"] == 5
        if suffix == "data":
            assert values.shape == (5, 10)
            assert np.allclose(values[:, 0], np.array([1, 3, 5, 7, 9]) + 10)
def process_delta_database(name, tech, bio, dependents):
    """A modification of ``bw2data.backends.base.SQLiteBackend.process`` to skip
    retrieving data from the database.

    ``tech`` and ``bio`` are passed straight through as the row iterators for
    the technosphere and biosphere matrices; ``dependents`` (minus ``name``
    itself) is stored, sorted, as the database's ``depends`` metadata.
    """
    print("Tech:", tech)
    print("Bio:", bio)

    db = bd.Database(name)
    db.metadata["processed"] = datetime.datetime.now().isoformat()

    # Create geomapping array, from dataset integer ids to locations
    inv_mapping_qs = ActivityDataset.select(
        ActivityDataset.id, ActivityDataset.location
    ).where(ActivityDataset.database == name, ActivityDataset.type == "process")

    def _geomapping_row(record):
        # ``record`` is an (id, location) tuple; fall back to the global
        # location when the stored location resolves to something falsy.
        location = (
            bd.backends.utils.retupleize_geo_strings(record[1])
            or bd.config.global_location
        )
        return {"row": record[0], "col": bd.geomapping[location], "amount": 1}

    # self.filepath_processed checks if data is dirty,
    # and processes if it is. This causes an infinite loop.
    # So we construct the filepath ourselves.
    processed_path = str(db.dirpath_processed() / db.filename_processed())

    dp = bwp.create_datapackage(
        fs=ZipFS(processed_path, write=True),
        name=bwp.clean_datapackage_name(name),
        sum_intra_duplicates=True,
        sum_inter_duplicates=False,
    )

    dp.add_persistent_vector_from_iterator(
        matrix="inv_geomapping_matrix",
        name=bwp.clean_datapackage_name(name + " inventory geomapping matrix"),
        dict_iterator=(_geomapping_row(record) for record in inv_mapping_qs.tuples()),
        nrows=inv_mapping_qs.count(),
    )

    for matrix_label, suffix, rows in (
        ("biosphere_matrix", " biosphere matrix", bio),
        ("technosphere_matrix", " technosphere matrix", tech),
    ):
        dp.add_persistent_vector_from_iterator(
            matrix=matrix_label,
            name=bwp.clean_datapackage_name(name + suffix),
            dict_iterator=rows,
        )

    dp.finalize_serialization()

    db.metadata["depends"] = sorted(dependents.difference({name}))
    db.metadata["dirty"] = False
    db._metadata.flush()
def generic_zipfile_filesystem(*, dirpath: Path, filename: str, write: bool = True) -> ZipFS:
    """Return a ``ZipFS`` for ``filename`` inside the existing directory ``dirpath``.

    Raises ``ValueError`` when ``dirpath`` is not an existing directory.
    """
    assert isinstance(dirpath, Path), "`dirpath` must be a `pathlib.Path` instance"

    if dirpath.is_dir():
        archive_location = dirpath / filename
        return ZipFS(archive_location, write=write)

    raise ValueError("Destination directory `{}` doesn't exist".format(dirpath))
def test_metadata_is_the_same_object():
    """A filtered datapackage shares its metadata values and resources with its source."""
    source = load_datapackage(fs_or_obj=ZipFS(str(dirpath / "test-fixture.zip")))
    filtered = source.filter_by_attribute("matrix", "sa_matrix")

    for key, value in filtered.metadata.items():
        # The resources list is the one entry not compared here.
        if key == "resources":
            continue
        assert id(value) == id(source.metadata[key])

    for resource in filtered.resources:
        same_objects = [
            candidate for candidate in source.resources if candidate is resource
        ]
        assert any(same_objects)
def test_data_is_the_same_object_when_not_proxy():
    """Without proxy loading, source and filtered packages return the same array object."""
    source = load_datapackage(fs_or_obj=ZipFS(str(dirpath / "test-fixture.zip")))
    filtered = source.filter_by_attribute("matrix", "sa_matrix")

    resource_label = "sa-data-array.data"
    from_source, _ = source.get_resource(resource_label)
    from_filtered, _ = filtered.get_resource(resource_label)

    assert np.allclose(from_source, from_filtered)
    assert from_source is from_filtered
    assert np.shares_memory(from_source, from_filtered)
def test_integration_test_fixture_zipfile():
    """The checked-in ``test-fixture.zip`` passes the metadata and data checks."""
    fixture_path = Path(__file__).parent.resolve() / "fixtures" / "test-fixture.zip"
    package = load_datapackage(ZipFS(str(fixture_path), write=False))

    check_metadata(package, False)
    check_data(package)
def test_fdp_can_load_proxy_first():
    """With proxy loading, filtered and source packages return equal but separate arrays."""
    source = load_datapackage(
        fs_or_obj=ZipFS(str(dirpath / "test-fixture.zip")),
        proxy=True,
    )
    filtered = source.filter_by_attribute("matrix", "sa_matrix")

    resource_label = "sa-data-array.data"
    # The filtered package is asked first, then the source package.
    from_filtered, _ = filtered.get_resource(resource_label)
    from_source, _ = source.get_resource(resource_label)

    assert np.allclose(from_source, from_filtered)
    assert from_source.base is not from_filtered
    assert from_filtered.base is not from_source
    assert not np.shares_memory(from_source, from_filtered)