def access_precomputed(
    store_path: str,
    key: str,
    mode: str,
    array_type=None,
    dtype=None,
    num_channels=None,
    shape=None,
    resolution=None,
    encoding=None,
    chunks=None,
    jpeg_quality=None,
    voxel_offset=None,
    scale_index=None,
) -> TensorStoreArray:
    driver = "neuroglancer_precomputed"
    kvstore_driver, _store_path = fsspec.core.split_protocol(store_path)
    if kvstore_driver is None:
        kvstore_driver = "file"
        kvstore_path = "/"
        # remove the leading slash after making the absolute path
        _store_path = os.path.abspath(_store_path)[1:]
    else:
        kvstore_path = _store_path.split(os.path.sep)[0]

    if kvstore_driver not in KVSTORE_DRIVERS:
        raise ValueError(
            f"File system protocol {kvstore_driver} is not supported by tensorstore."
        )

    info_path = os.path.join(store_path, "info")

    if mode == "r":
        with fsspec.open(info_path) as fh:
            json_data = json.loads(fh.read())
        precomputed_metadata = parse_info(json_data)
        scale_matches = [scale.key == key for scale in precomputed_metadata.scales]
        if not any(scale_matches):
            raise ValueError(
                f"Could not find key: {key} in info file at {info_path}"
            )
        else:
            scale_index = scale_matches.index(True)
            scale_meta = precomputed_metadata.scales[scale_index]
    else:
        scale_meta = ScaleMetadata(
            size=shape,
            resolution=resolution,
            encoding=encoding,
            chunk_size=chunks,
            key=key,
            voxel_offset=voxel_offset,
            jpeg_quality=jpeg_quality,
        )
        precomputed_metadata = PrecomputedMetadata(
            type=array_type,
            data_type=dtype,
            num_channels=num_channels,
            scales=[scale_meta],
        )

    if mode == "r":
        read = True
        # tensorstore errors when these are set to False for reading,
        # so they must be left as None
        write = None
        create = None
        delete_existing = None
    elif mode == "a":
        read = True
        write = True
        create = True
        delete_existing = False
    elif mode == "rw":
        read = True
        write = True
        create = True
        delete_existing = True
    elif mode == "w":
        read = False
        write = True
        create = True
        delete_existing = True
    elif mode == "w-":
        read = False
        write = True
        create = True
        delete_existing = False
    else:
        raise ValueError('Mode must be "r", "rw", "a", "w", or "w-"')

    tsa = TensorStoreArray(
        driver=driver,
        path=_store_path,
        kvstore_path=kvstore_path,
        kvstore_driver=kvstore_driver,
        encoding=scale_meta.encoding,
        scale_index=scale_index,
        key=scale_meta.key,
        num_channels=precomputed_metadata.num_channels,
        volume_type=precomputed_metadata.type,
        dtype=precomputed_metadata.data_type,
        resolution=scale_meta.resolution,
        size=scale_meta.size,
        chunk_size=scale_meta.chunk_size,
        jpeg_quality=jpeg_quality,
    )
    return tsa.open(
        read=read, write=write, create=create, delete_existing=delete_existing
    ).result()
def input_opener(fname, **kwargs):
    logger.info(f"Opening input '{fname}'")
    with fsspec.open(fname, **kwargs) as f:
        yield f
def read_text_from_href(self, href: str, *args: Any, **kwargs: Any) -> str:
    with fsspec.open(href, "r") as f:
        return f.read()
def read_bytes(
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket path.
        path (str): External storage path to read from.
        storage_options (dict): Configuration of the external storage.
        read_options (dict): Additional options that control the behavior of the read.
        proc_num (int): Total number of processes.
        proc_index (int): Index of this process.

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    serialization_mode = read_options.pop('serialization_mode', False)
    if serialization_mode:
        parsed = urlparse(path)
        try:
            fs = fsspec.filesystem(parsed.scheme)
        except ValueError as e:
            report_status("error", str(e))
            raise
        meta_file = f"{path}_{proc_index}.meta"
        blob_file = f"{path}_{proc_index}"
        if not fs.exists(meta_file) or not fs.exists(blob_file):
            report_status(
                "error",
                f"Some serialization file cannot be found. "
                f"Expected: {meta_file} and {blob_file}",
            )
            raise FileNotFoundError('{}, {}'.format(meta_file, blob_file))

        # Used for reading bytes of a serialized graph
        meta_file = fsspec.open(meta_file, mode="rb", **storage_options)
        with meta_file as f:
            meta = f.read().decode('utf-8')
            meta = json.loads(meta)
            lengths = meta.pop("lengths")
        for k, v in meta.items():
            builder[k] = v
        stream = builder.seal(client)
        client.persist(stream)
        ret = {"type": "return", "content": repr(stream.id)}
        print(json.dumps(ret), flush=True)

        writer = stream.open_writer(client)
        of = fsspec.open(blob_file, mode="rb", **storage_options)
        with of as f:
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            assert total_size == sum(lengths), "Target file is corrupted"
            for length in lengths:
                buf = f.read(length)
                chunk = writer.next(length)
                buf_writer = pa.FixedSizeBufferWriter(chunk)
                buf_writer.write(buf)
                buf_writer.close()
        writer.finish()
    else:
        # Used when reading tables from external storage,
        # usually for loading a property graph.
        header_row = read_options.get("header_row", False)
        for k, v in read_options.items():
            if k in ("header_row", "include_all_columns"):
                builder[k] = "1" if v else "0"
            elif k == "delimiter":
                builder[k] = bytes(v, "utf-8").decode("unicode_escape")
            else:
                builder[k] = v

        offset = 0
        chunk_size = 1024 * 1024 * 4

        try:
            of = fsspec.open(path, mode="rb", **storage_options)
        except Exception as e:
            report_status("error", str(e))
            raise
        with of as f:
            header_line = read_block(f, 0, 1, b'\n')
            builder["header_line"] = header_line.decode("unicode_escape")
            if header_row:
                offset = len(header_line)
            stream = builder.seal(client)
            client.persist(stream)
            ret = {"type": "return", "content": repr(stream.id)}
            print(json.dumps(ret), flush=True)

            writer = stream.open_writer(client)
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            part_size = (total_size - offset) // proc_num
            begin = part_size * proc_index + offset
            end = min(begin + part_size, total_size)
            if proc_index == 0:
                begin -= int(header_row)

            while begin < end:
                buf = read_block(f, begin, min(chunk_size, end - begin),
                                 delimiter=b"\n")
                size = len(buf)
                if not size:
                    break
                begin += size - 1
                chunk = writer.next(size)
                buf_writer = pa.FixedSizeBufferWriter(chunk)
                buf_writer.write(buf)
                buf_writer.close()
        writer.finish()
def from_json(cls, fname: Union[str, Path], open_kwargs: dict = {}):
    with fsspec.open(str(fname), mode='rt', **open_kwargs) as fh:
        jblob = json.loads(fh.read())
    return cls(**jblob)
def save_speaker_mapping(out_path, speaker_mapping):
    """Saves speaker mapping if not yet present."""
    if out_path is not None:
        speakers_json_path = _set_file_path(out_path)
        with fsspec.open(speakers_json_path, "w") as f:
            json.dump(speaker_mapping, f, indent=4)
def _save_json(json_file_path: str, data: dict) -> None:
    with fsspec.open(json_file_path, "w") as f:
        json.dump(data, f, indent=4)
def inspect(self, dataset, columns_dict, output_file):
    """
    Parameters
    ----------
    dataset: str, list of str, or <dask.dataframe|cudf|pd>.DataFrame
        Dataset path (or list of paths), or a DataFrame. If string,
        should specify a specific file or directory path. If this is a
        directory path, the directory structure must be flat (nested
        directories are not yet supported).
    columns_dict: dict
        Dictionary indicating the type of each column.
    output_file: str
        Filename to write the output statistics to.
    """
    # Get dataset columns
    cats = columns_dict["cats"]
    conts = columns_dict["conts"]
    labels = columns_dict["labels"]

    # Create Dataset, Workflow, and get Stats
    stats = DataStats()
    features = ColumnSelector(cats + conts + labels) >> stats
    workflow = Workflow(features, client=self.client)
    workflow.fit(dataset)

    # Get statistics from the datastats op
    output = stats.output

    # Dictionary to store collected information
    data = {}
    # Store num_rows
    data["num_rows"] = dataset.num_rows
    # Store per-column statistics
    for col_type in ["conts", "cats", "labels"]:
        data[col_type] = {}
        for col in columns_dict[col_type]:
            data[col_type][col] = {}
            data[col_type][col]["dtype"] = output[col]["dtype"]
            if col_type != "conts":
                data[col_type][col]["cardinality"] = output[col]["cardinality"]
            if col_type == "cats":
                data[col_type][col]["min_entry_size"] = output[col]["min"]
                data[col_type][col]["max_entry_size"] = output[col]["max"]
                data[col_type][col]["avg_entry_size"] = output[col]["mean"]
            elif col_type == "conts":
                data[col_type][col]["min_val"] = output[col]["min"]
                data[col_type][col]["max_val"] = output[col]["max"]
                data[col_type][col]["mean"] = output[col]["mean"]
                data[col_type][col]["std"] = output[col]["std"]
            data[col_type][col]["per_nan"] = output[col]["per_nan"]

    # Write json file
    with fsspec.open(output_file, "w") as outfile:
        json.dump(data, outfile, cls=NpEncoder)
def test_automkdir_readonly(tmpdir):
    dir = os.path.join(str(tmpdir), "d")
    with pytest.raises(FileNotFoundError):
        of = fsspec.open(os.path.join(dir, "dfile"), "r")
        with of:
            pass
def open_file(url, *args, **kwargs):
    of = fsspec.open(url, *args, **kwargs)
    with of as f:
        yield f
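# Hedged usage sketch for the generator above. It yields an open file handle,
# so here it is assumed to be exposed as a context manager via
# contextlib.contextmanager (not shown in the source). The "memory://" URL is
# illustrative and uses fsspec's in-memory filesystem so the example runs anywhere.
import contextlib

import fsspec

open_file_cm = contextlib.contextmanager(open_file)

# Seed the in-memory filesystem so the read below succeeds.
with fsspec.open("memory://example.txt", "w") as f:
    f.write("hello")

with open_file_cm("memory://example.txt", "r") as f:
    assert f.read() == "hello"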
N_EXAMPLES = 2

if __name__ == "__main__":
    for i, config in enumerate(Oscar.BUILDER_CONFIGS):
        print(f"Loading config '{config.name}' ({i + 1}/{len(Oscar.BUILDER_CONFIGS)})")

        # Get data url
        checksum_filename = _BASE_CHECKSUM_FILE_NAME.format(language=config.language)
        checksum_url = config.base_data_url + checksum_filename
        checksum_file_content = requests.get(checksum_url).text.splitlines()
        data_filename = checksum_file_content[0].split("\t")[0]
        data_url = config.base_data_url + data_filename

        # Get a few examples
        with fs.open(data_url, "rt", compression="gzip") as f:
            current_examples = 0
            dummy_content = []
            for line in f:
                dummy_content.append(line)
                current_examples += len(line.strip()) == 0
                if current_examples == N_EXAMPLES:
                    break
        dummy_content = "".join(dummy_content).rstrip()

        # Write dummy files
        dummy_data_dir = Path(__file__).resolve().parent / "dummy" / config.name / str(config.version) / "dummy_data"
        dummy_data_dir.mkdir(parents=True, exist_ok=True)
        (dummy_data_dir / checksum_filename).open("w").write(data_filename + "\t insert_hash_here")
        with fs.open(str(dummy_data_dir / data_filename), "wt", compression="gzip") as f:
            f.write(dummy_content)
def to_image(
    mols: Union[List[Chem.rdchem.Mol], Chem.rdchem.Mol],
    legends: Union[List[Union[str, None]], str, None] = None,
    n_cols: int = 4,
    use_svg: bool = False,
    mol_size: Union[Tuple[int, int], int] = (200, 200),
    highlight_atom: List[List[int]] = None,
    highlight_bond: List[List[int]] = None,
    outfile: str = None,
    max_mols: int = 32,
    copy: bool = False,
    indices: bool = False,
):
    """Generate an image out of a molecule or a list of molecules.

    Args:
        mols: one molecule or a list of molecules.
        legends: a string or a list of strings used as a legend for every molecule.
        n_cols: number of molecules per row.
        use_svg: whether to output an SVG (or a PNG).
        mol_size: an int or a tuple of ints defining the size per molecule.
        highlight_atom: atoms to highlight.
        highlight_bond: bonds to highlight.
        outfile: path where to save the image (local or remote path).
        max_mols: the maximum number of molecules to display.
        copy: whether to copy the molecules or not.
        indices: whether to draw the atom indices.
    """
    if isinstance(mol_size, int):
        mol_size = (mol_size, mol_size)

    if isinstance(mols, Chem.rdchem.Mol):
        mols = [mols]

    if isinstance(legends, str):
        legends = [legends]

    if copy:
        mols = [dm.copy_mol(mol) for mol in mols]

    if max_mols is not None:
        mols = mols[:max_mols]
        if legends is not None:
            legends = legends[:max_mols]

    if indices is True:
        [dm.atom_indices_to_mol(mol) for mol in mols]

    _highlight_atom = highlight_atom
    if highlight_atom is not None and isinstance(highlight_atom[0], int):
        _highlight_atom = [highlight_atom]

    _highlight_bond = highlight_bond
    if highlight_bond is not None and isinstance(highlight_bond[0], int):
        _highlight_bond = [highlight_bond]

    # Don't make the image grid wider than it needs to be
    if len(mols) < n_cols:
        n_cols = len(mols)

    image = Draw.MolsToGridImage(
        mols,
        legends=legends,
        molsPerRow=n_cols,
        useSVG=use_svg,
        subImgSize=mol_size,
        highlightAtomLists=_highlight_atom,
        highlightBondLists=_highlight_bond,
    )

    if outfile is not None:
        with fsspec.open(outfile, "wb") as f:
            if use_svg:
                if isinstance(image, str):
                    # in a terminal process
                    f.write(image.encode())
                else:
                    # in a jupyter kernel process
                    f.write(image.data.encode())  # type: ignore
            else:
                if isinstance(image, PIL.PngImagePlugin.PngImageFile):  # type: ignore
                    # in a terminal process
                    image.save(f)
                else:
                    # in a jupyter kernel process
                    f.write(image.data)  # type: ignore

    return image
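# Hedged usage sketch for to_image. It assumes datamol is importable as `dm`
# and that dm.to_mol parses SMILES strings (datamol's public API); the SMILES
# and legends below are illustrative only.
import datamol as dm

mols = [dm.to_mol(smi) for smi in ["CCO", "c1ccccc1", "CC(=O)O"]]
img = to_image(mols, legends=["ethanol", "benzene", "acetic acid"], n_cols=3)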
def makedirs(url, exist_ok=False):
    fs, path = get_fs_and_path(url)
    fs.makedirs(path, exist_ok=exist_ok)
    if not path_exists(url):
        with fsspec.open(url, mode="wb"):
            pass
def read_file(path: str, **kwargs):
    """Support fetching files from arbitrary filesystems."""
    with fsspec.open(path, **kwargs) as f:
        content = f.read()
    return content
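# Minimal usage sketch: read_file forwards its keyword arguments to fsspec.open,
# so mode, compression, and filesystem options pass straight through. The
# "memory://" path uses fsspec's in-memory filesystem so the example runs anywhere.
import fsspec

with fsspec.open("memory://demo.txt", "wt") as f:
    f.write("hello fsspec")

assert read_file("memory://demo.txt", mode="rt") == "hello fsspec"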
def _create_checkpoint(checkpoint_stream, global_step: int, filename: str):
    with tune.checkpoint_dir(step=global_step) as checkpoint_dir:
        file_path = os.path.join(checkpoint_dir, filename)
        with fsspec.open(file_path, "wb") as f:
            f.write(checkpoint_stream)
def upload(entry, parquet_dir, url):
    of = fsspec.open(url)
    if of.fs.exists(url):
        of.fs.delete(url, recursive=True)
    entry.fs.upload(parquet_dir, url, recursive=True)
    return True
    ],
    'pressure_level': [
        '250', '500', '700', '850', '925', '1000',
    ],
    'year': date.year,
    'month': date.month,
    'day': date.day,
    'time': [
        '00:00', '12:00',
    ],
    'area': [75., 185, 15., 320.]  # N, W, S, E
}

r1 = c1.retrieve(name, request, None)
with fsspec.open(r1.location) as f1:
    iso_vars = xr.open_dataset(f1, engine='scipy')
# print(iso_vars)

print('getting single-level data')
## also need MSLP & PW
## get the data from the ECMWF API
c2 = cdsapi.Client()
name = 'reanalysis-era5-single-levels'
request = {
    'product_type': 'reanalysis',
    'format': 'netcdf',
    'variable': ['mean_sea_level_pressure', 'total_column_water_vapour'],
    'year': date.year,
    'month': date.month,
def _fsspec_safe_open(fname, **kwargs):
    # workaround for inconsistent behavior of fsspec.open
    # https://github.com/intake/filesystem_spec/issues/579
    with fsspec.open(fname, **kwargs) as fp:
        with fp as fp2:
            yield fp2
def _load_json(json_file_path: str) -> Dict:
    with fsspec.open(json_file_path, "r") as f:
        return json.load(f)
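# Hedged round-trip sketch combining this helper with the _save_json helper
# shown earlier; the in-memory path is illustrative.
params = {"lr": 1e-3, "epochs": 10}
_save_json("memory://config.json", params)
assert _load_json("memory://config.json") == params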
def _get_url_size(fname):
    with fsspec.open(fname, mode="rb") as of:
        size = of.size
    return size
# tests for pillow-8.1.2-py38ha0e1e83_1 (this is a generated file);
print('===== testing package: pillow-8.1.2-py38ha0e1e83_1 =====')
print('running run_test.py')
# --- run_test.py (begin) ---
import fsspec
from PIL import Image

# Test JPEG2k
with fsspec.open("https://www.fnordware.com/j2k/relax.jp2") as f:
    image = Image.open(f)
    image.load()
# --- run_test.py (end) ---
print('===== pillow-8.1.2-py38ha0e1e83_1 OK =====')
print("import: 'PIL'")
import PIL

print("import: 'PIL.Image'")
import PIL.Image

print("import: 'PIL.ImageCms'")
import PIL.ImageCms
def __init__(self, filename: str, hdf5group: str = None,
             hdf5file_mode: str = 'r',
             store: Union[MutableMapping, str, Path] = None,
             store_path: str = None,
             store_mode: str = 'a',
             LRU: bool = False,
             LRU_max_size: int = 2**30,
             max_chunksize=2*2**20):
    """
    Args:
        filename: str or file-like object, file name string or file-like object
            to be read by zarr
        hdf5group: str, hdf5 group in the hdf5 file to be read by zarr along with
            its children. Defaults to the root group.
        hdf5file_mode: str, subset of h5py file access modes; the file must exist.
            'r'  readonly (default)
            'r+' read and write
        store: collections.abc.MutableMapping or str, zarr store.
            If a string path is passed, a zarr.DirectoryStore is created at the
            given path; if None, a zarr.MemoryStore is used.
        store_mode: store data access mode, default 'a'.
            'r'  readonly; a compatible zarr hierarchy should already exist in the
                 passed store
            'r+' read and write; raise an error if the store does not exist
                 (for updating an existing zarr hierarchy)
            'w'  create store, removing any data that already exists
            'w-' or 'x' create store, fail if it already exists
            'a'  read and write, create if it does not exist
        store_path: str, path in the zarr store
        LRU: bool, if the store is not already a zarr.LRUStoreCache, add a
            zarr.LRUStoreCache layer on top of the currently used store
        LRU_max_size: int, maximum zarr.LRUStoreCache cache size; only used if the
            store is a zarr.LRUStoreCache, or if the LRU argument is True
        max_chunksize: maximum chunk size to use when creating the zarr hierarchy;
            useful if only a small slice of the data needs to be read
    """

    # Verify arguments
    if hdf5file_mode not in ('r', 'r+'):
        raise ValueError("hdf5file_mode must be 'r' or 'r+'")
    self.hdf5file_mode = hdf5file_mode

    # Verify arguments
    if not isinstance(LRU, bool):
        raise TypeError(f"Expected bool for LRU, received {type(LRU)}")
    self.LRU = LRU

    if not isinstance(LRU_max_size, int):
        raise TypeError(f"Expected int for LRU_max_size, received {type(LRU_max_size)}")
    self.LRU_max_size = LRU_max_size

    if not isinstance(max_chunksize, int):
        raise TypeError(f"Expected int for max_chunksize, received {type(max_chunksize)}")
    self.max_chunksize = max_chunksize

    # store, store_path, and store_mode are passed through to zarr
    self.store_path = store_path
    self.store_mode = store_mode
    if store is not None and LRU is True and not isinstance(store, zarr.LRUStoreCache):
        self.store = zarr.LRUStoreCache(store, max_size=self.LRU_max_size)
    else:
        self.store = store

    # create dictionary mapping hdf5 filter numbers to compatible zarr codecs
    self._hdf5_regfilters_subset = {}
    self._fill_regfilters()

    # dictionary to hold addresses of hdf5 objects in file
    self._address_dict = {}

    # create zarr format hierarchy for datasets and attributes compatible with the
    # hdf5 file; dataset contents are not copied, unless they contain
    # variable-length strings
    self.zgroup = zarr.open_group(self.store, mode=self.store_mode, path=self.store_path)
    if self.store is None:
        self.store = self.zgroup.store

    # FileChunkStore requires a uri
    if isinstance(filename, str):
        self.uri = filename
    else:
        try:
            self.uri = getattr(filename, 'path', None)
            if self.uri is None:
                self.uri = filename.name
        except Exception:
            self.uri = ''

    # Access hdf5 file and create zarr hierarchy
    if hdf5group is not None and not isinstance(hdf5group, str):
        raise TypeError(f"Expected str for hdf5group, received {type(hdf5group)}")
    self.hdf5group = hdf5group
    self.filename = filename
    if self.store_mode != 'r':
        self.file = h5py.File(self.filename, mode=self.hdf5file_mode)
        self.group = self.file[self.hdf5group] if self.hdf5group is not None else self.file
        self.create_zarr_hierarchy(self.group, self.zgroup)
        self.file.close()

    if isinstance(self.filename, str):
        self.chunkstore_file = fsspec.open(self.filename, mode='rb')
        self.chunk_store = FileChunkStore(self.store, chunk_source=self.chunkstore_file.open())
    else:
        self.chunk_store = FileChunkStore(self.store, chunk_source=self.filename)
    if LRU is True and not isinstance(self.chunk_store, zarr.LRUStoreCache):
        self.chunk_store = zarr.LRUStoreCache(self.chunk_store, max_size=self.LRU_max_size)

    # open zarr group
    store_mode_cons = 'r' if self.store_mode == 'r' else 'r+'
    self.zgroup = zarr.open_group(self.store, mode=store_mode_cons,
                                  path=self.store_path, chunk_store=self.chunk_store)
def open_gzip(path: PathType, storage_options: Optional[Dict[str, str]]) -> IO[Any]:
    url = str(path)
    storage_options = storage_options or {}
    openfile: IO[Any] = fsspec.open(url, compression="gzip", **storage_options)
    return openfile
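# Usage sketch: fsspec.open returns an OpenFile, so callers enter the result as
# a context manager. The gzipped in-memory file below is illustrative and makes
# the example self-contained.
import fsspec

with fsspec.open("memory://payload.gz", "wb", compression="gzip") as f:
    f.write(b"chrom\tpos\n")

with open_gzip("memory://payload.gz", storage_options=None) as f:
    assert f.read() == b"chrom\tpos\n"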
def generate_wb_timeseries(shapes, config_dict):
    """
    This is where the code processing is actually done. This code takes in a
    polygon and a config dict containing the shapefile's crs, output directory,
    id_field, time_span, and include_uncertainty, which says whether to include
    all data as well as an invalid pixel count that can be used for measuring
    uncertainty. It performs a polygon drill into the wofs_albers product. The
    resulting xarray, which contains the water classified pixels for that
    polygon over every available timestep, is used to calculate the percentage
    of the water body that is wet at each time step. The outputs are written to
    a csv file named using the polygon UID, which is a geohash of the polygon's
    centre coords.

    Inputs:
    shapes - polygon to be interrogated
    config_dict - many config settings including crs, id_field, time_span,
        shapefile

    Outputs:
    Nothing is returned from the function, but a csv file is written out to
        disk
    """
    output_dir = config_dict['output_dir']
    crs = config_dict['crs']
    id_field = config_dict['id_field']
    time_span = config_dict['time_span']
    include_uncertainty = config_dict['include_uncertainty']
    wofls = config_dict['wofls']
    assert wofls

    # Some query parameters will be different for different WOfL products.
    output_res = get_resolution(wofls)
    dataset_maturity = get_dataset_maturity(wofls)

    if include_uncertainty:
        unknown_percent_threshold = 100
    else:
        unknown_percent_threshold = 10

    with Datacube(app='Polygon drill') as dc:
        first_geometry = shapes['geometry']

        str_poly_name = shapes['properties'][id_field]
        try:
            fpath = os.path.join(output_dir,
                                 f'{str_poly_name[0:4]}/{str_poly_name}.csv')
        except TypeError:
            str_poly_name = str(int(str_poly_name)).zfill(6)
            fpath = os.path.join(output_dir,
                                 f'{str_poly_name[0:4]}/{str_poly_name}.csv')

        geom = geometry.Geometry(first_geometry, crs=crs)

        current_year = datetime.now().year

        if time_span == 'ALL':
            if shapely_geom.shape(first_geometry).envelope.area > 2000000:
                years = range(1986, current_year + 1, 5)
                time_periods = [(str(year), str(year + 4)) for year in years]
            else:
                time_periods = [('1986', str(current_year))]
        elif time_span == 'APPEND':
            start_date = get_last_date(fpath)
            if start_date is None:
                logger.debug(f'There is no csv for {str_poly_name}')
                return 1
            time_periods = [(start_date, str(current_year))]
        elif time_span == 'CUSTOM':
            time_periods = [(config_dict['start_dt'], config_dict['end_date'])]

        valid_capacity_pc = []
        valid_capacity_ct = []
        invalid_capacity_ct = []
        date_list = []

        for time in time_periods:
            wb_capacity_pc = []
            wb_capacity_ct = []
            wb_invalid_ct = []
            dry_observed = []
            invalid_observations = []

            # Set up the query, and load in all of the WOFS layers
            query = {
                'geopolygon': geom,
                'time': time,
                'output_crs': crs,
                'resolution': output_res,
                'resampling': 'nearest',
            }
            if dataset_maturity:
                query['dataset_maturity'] = dataset_maturity
            logger.debug('Query: {}'.format(
                {k: v for k, v in query.items() if k != 'geopolygon'}))
            wofl = dc.load(product=wofls, group_by='solar_day',
                           fuse_func=wofls_fuser, **query)

            if len(wofl.attrs) == 0:
                logger.debug(
                    f'There is no new data for {str_poly_name} in {time}')
                # TODO(MatthewJA): Confirm (with Ness?) that changing this
                # return to a continue doesn't break things.
                continue

            # Make a mask based on the polygon (to remove extra data
            # outside of the polygon)
            mask = rasterio.features.geometry_mask(
                [geom.to_crs(wofl.geobox.crs) for geoms in [geom]],
                out_shape=wofl.geobox.shape,
                transform=wofl.geobox.affine,
                all_touched=False,
                invert=True)

            # Mask the data to the shape of the polygon.
            # The geometry width and height must both be larger than one
            # pixel to mask.
            if (geom.boundingbox.width > 25.3 and
                    geom.boundingbox.height > 25.3):
                wofl_masked = wofl.water.where(mask)
            else:
                wofl_masked = wofl.water

            # Work out how full the waterbody is at every time step
            for ix, times in enumerate(wofl.time):
                # Grab the data for our timestep
                all_the_bit_flags = wofl_masked.isel(time=ix)

                # Find all the wet/dry pixels for that timestep
                lsa_wet = all_the_bit_flags.where(
                    all_the_bit_flags == 136).count().item()
                lsa_dry = all_the_bit_flags.where(
                    all_the_bit_flags == 8).count().item()
                sea_wet = all_the_bit_flags.where(
                    all_the_bit_flags == 132).count().item()
                sea_dry = all_the_bit_flags.where(
                    all_the_bit_flags == 4).count().item()
                sea_lsa_wet = all_the_bit_flags.where(
                    all_the_bit_flags == 140).count().item()
                sea_lsa_dry = all_the_bit_flags.where(
                    all_the_bit_flags == 12).count().item()
                wet_pixels = (all_the_bit_flags.where(
                    all_the_bit_flags == 128).count().item() +
                    lsa_wet + sea_wet + sea_lsa_wet)
                dry_pixels = (all_the_bit_flags.where(
                    all_the_bit_flags == 0).count().item() +
                    lsa_dry + sea_dry + sea_lsa_dry)

                # Count the number of masked observations
                masked_all = all_the_bit_flags.count().item()

                # Turn our counts into percentages
                try:
                    water_percent = round((wet_pixels / masked_all * 100), 1)
                    dry_percent = round((dry_pixels / masked_all * 100), 1)
                    missing_pixels = masked_all - (wet_pixels + dry_pixels)
                    unknown_percent = missing_pixels / masked_all * 100
                except ZeroDivisionError:
                    water_percent = 0.0
                    dry_percent = 0.0
                    unknown_percent = 100.0
                    missing_pixels = masked_all
                    logger.debug(f'{str_poly_name} has divide by zero error')

                # Append the percentages to a list for each timestep.
                # Filter out timesteps with < 90% valid observations and add
                # empty values for those timesteps. If you set
                # 'UNCERTAINTY = True' in your config file then you will only
                # filter out timesteps with 100% invalid pixels, and you will
                # also record the number of invalid pixels per timestep.
                if unknown_percent < unknown_percent_threshold:
                    wb_capacity_pc.append(water_percent)
                    invalid_observations.append(unknown_percent)
                    wb_invalid_ct.append(missing_pixels)
                    dry_observed.append(dry_percent)
                    wb_capacity_ct.append(wet_pixels)
                else:
                    wb_capacity_pc.append('')
                    invalid_observations.append('')
                    wb_invalid_ct.append('')
                    dry_observed.append('')
                    wb_capacity_ct.append('')

            valid_obs = wofl.time.dropna(dim='time')
            valid_obs = valid_obs.to_dataframe()
            if 'spatial_ref' in valid_obs.columns:
                valid_obs = valid_obs.drop(columns=['spatial_ref'])
            valid_capacity_pc += wb_capacity_pc
            valid_capacity_ct += wb_capacity_ct
            invalid_capacity_ct += wb_invalid_ct
            date_list += valid_obs.to_csv(
                None,
                header=False,
                index=False,
                date_format="%Y-%m-%dT%H:%M:%SZ").split('\n')
            date_list.pop()

    if date_list:
        if include_uncertainty:
            rows = zip(date_list, valid_capacity_pc, valid_capacity_ct,
                       invalid_capacity_ct)
        else:
            rows = zip(date_list, valid_capacity_pc, valid_capacity_ct)

        os.makedirs(os.path.dirname(fpath), exist_ok=True)
        if time_span == 'APPEND':
            of = fsspec.open(fpath, 'a')
            with of as f:
                writer = csv.writer(f)
                for row in rows:
                    writer.writerow(row)
        else:
            of = fsspec.open(fpath, 'w')
            with of as f:
                writer = csv.writer(f)
                headings = [
                    'Observation Date', 'Wet pixel percentage',
                    'Wet pixel count (n = {0})'.format(masked_all)
                ]
                if include_uncertainty:
                    headings.append('Invalid pixel count')
                writer.writerow(headings)
                for row in rows:
                    writer.writerow(row)
    else:
        logger.info(f'{str_poly_name} has no new good valid data')
    return True
def to_json(self, fname: Union[str, Path], open_kwargs: dict = {}) -> int:
    jblob = json.dumps(asdict(self))
    with fsspec.open(str(fname), mode='wt', **open_kwargs) as fh:
        result = fh.write(jblob)
    return result
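# Hedged round-trip sketch: ExampleConfig is a stand-in dataclass, not the class
# these methods belong to in the source project, and the to_json/from_json
# helpers shown above are attached to it only for the demo. Assumes json,
# fsspec, and dataclasses.asdict are imported as in the original module.
from dataclasses import dataclass


@dataclass
class ExampleConfig:
    name: str
    version: int


ExampleConfig.to_json = to_json
ExampleConfig.from_json = classmethod(from_json)

cfg = ExampleConfig(name="demo", version=1)
cfg.to_json("memory://example_config.json")
assert ExampleConfig.from_json("memory://example_config.json") == cfg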
def download(url, csv_path):
    of = fsspec.open(url)
    of.fs.download(url, csv_path)
    return csv_path
def save_csv(suffix: str, contents: bytes) -> str:
    fd, tmpfile = tempfile.mkstemp(prefix="csv_sniffer", suffix=suffix)
    os.close(fd)
    with fsspec.open(tmpfile, mode="wb", compression="infer") as out:
        out.write(contents)
    return tmpfile
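# Usage sketch: with a ".csv.gz" suffix, fsspec's compression="infer" writes the
# bytes gzip-compressed; reading back with the same setting restores them. The
# sample contents are illustrative.
import fsspec

tmp_path = save_csv(".csv.gz", b"a,b\n1,2\n")
with fsspec.open(tmp_path, mode="rb", compression="infer") as f:
    assert f.read() == b"a,b\n1,2\n"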
def bus_peak_frequencies(
    gtfs_path: str,
    test_date: typing.Optional[datetime.date] = None,
    am_peak: typing.Optional[typing.Tuple[int, int]] = None,
    pm_peak: typing.Optional[typing.Tuple[int, int]] = None,
) -> geopandas.GeoDataFrame:
    """
    Compute AM and PM peak frequencies for all the lines in a GTFS feed.

    Parameters
    ==========
    gtfs_path: str
        The path (or URL) to a GTFS feed.
    test_date: datetime.date
        The test date for which to compute frequencies. Defaults to
        February 18th, 2020, an unremarkable weekday in February.
    am_peak: tuple of integers
        The two hours (out of 24) demarcating the AM peak period.
    pm_peak: tuple of integers
        The two hours (out of 24) demarcating the PM peak period.
    """
    # Set default values
    test_date = test_date or TEST_DATE
    am_peak = am_peak or (6, 9)
    pm_peak = pm_peak or (15, 19)

    am_duration = am_peak[1] - am_peak[0]
    pm_duration = pm_peak[1] - pm_peak[0]

    assert am_duration > 0
    assert pm_duration > 0

    # Download and read the GTFS feed
    with fsspec.open(gtfs_path, "rb") as infile:
        data = infile.read()
    with open(GTFS_FILE, "wb") as outfile:
        outfile.write(data)
    service_by_date = partridge.read_service_ids_by_date(GTFS_FILE)
    feed = partridge.load_geo_feed(GTFS_FILE)

    # Get the service for the test date
    try:
        test_service = next(v for k, v in service_by_date.items() if k == test_date)
    except StopIteration:
        raise ValueError(f"Could not find service for {test_date}")

    test_trips = feed.trips[feed.trips.service_id.isin(test_service)]
    test_stops = feed.stop_times[feed.stop_times.trip_id.isin(test_trips.trip_id)]

    # Get the departure, arrival, and mean time for each trip
    trip_timings = test_stops.groupby(test_stops.trip_id).agg({
        "departure_time": min,
        "arrival_time": max,
    })
    trip_timings = trip_timings.assign(
        mean_time=trip_timings.departure_time +
        (trip_timings.arrival_time - trip_timings.departure_time) / 2.0)

    # Find all of the trips that fall within the AM and PM peak times.
    am_peak_trips = trip_timings[
        (trip_timings.mean_time > am_peak[0] * 60 * 60) &
        (trip_timings.mean_time < am_peak[1] * 60 * 60)]
    pm_peak_trips = trip_timings[
        (trip_timings.mean_time > pm_peak[0] * 60 * 60) &
        (trip_timings.mean_time < pm_peak[1] * 60 * 60)]
    am_peak_trips = test_trips.merge(
        am_peak_trips,
        left_on=test_trips.trip_id,
        right_index=True,
    )
    pm_peak_trips = test_trips.merge(
        pm_peak_trips,
        left_on=test_trips.trip_id,
        right_index=True,
    )

    # Compute the peak frequency
    am_peak_frequency = (am_peak_trips.groupby(
        [am_peak_trips.route_id, am_peak_trips.direction_id]
    ).size().to_frame("am_peak_trips"))
    am_peak_frequency = am_peak_frequency.assign(
        am_peak_frequency=am_duration * 60 / am_peak_frequency.am_peak_trips)
    pm_peak_frequency = (pm_peak_trips.groupby(
        [pm_peak_trips.route_id, pm_peak_trips.direction_id]
    ).size().to_frame("pm_peak_trips"))
    pm_peak_frequency = pm_peak_frequency.assign(
        pm_peak_frequency=pm_duration * 60 / pm_peak_frequency.pm_peak_trips)
    peak_frequency = pandas.concat(
        [am_peak_frequency, pm_peak_frequency], axis=1, sort=False)

    # Add the route short name for easier legibility.
    peak_frequency = peak_frequency.join(
        feed.routes[["route_id", "route_short_name"]].set_index("route_id"),
        how="left",
        on="route_id",
    )

    # Grab the most popular shape as the official one.
    route_shapes = (test_trips.groupby("route_id").agg({
        "shape_id": lambda s: s.value_counts().index[0]
    }).reset_index().merge(
        feed.shapes, how="left",
        on="shape_id").set_index("route_id").drop(columns=["shape_id"]))

    peak_frequency = peak_frequency.merge(
        route_shapes, how="left", right_index=True,
        left_index=True).assign(agency=feed.agency.agency_name.iloc[0])

    gdf = geopandas.GeoDataFrame(peak_frequency, geometry="geometry")
    gdf.crs = f"EPSG:{WGS84}"
    return gdf
def write_text_from_href(self, href: str, txt: str, *args: Any, **kwargs: Any) -> None:
    with fsspec.open(href, "w") as destination:
        return destination.write(txt)
def open_file(urlpath, mode="rb", compression=None):
    return fsspec.open(urlpath, mode=mode, compression=compression, **fsspec_kwargs)
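# Usage sketch: this helper expects a module-level `fsspec_kwargs` dict (for
# example, storage credentials); an empty dict is assumed here and the
# "memory://" path is illustrative. The returned OpenFile is entered as a
# context manager by the caller.
fsspec_kwargs = {}

with open_file("memory://table.bin", mode="wb") as f:
    f.write(b"\x00\x01")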