class MyAlgorithm(Algorithm):
    x = NodeTrait().tag(attr=True)
    y = NodeTrait().tag(attr=True)

    outputs = ["sum", "prod", "diff"]

    def algorithm(self, inputs):
        sum_ = inputs["x"] + inputs["y"]
        prod = inputs["x"] * inputs["y"]
        diff = inputs["x"] - inputs["y"]
        return np.stack([sum_, prod, diff], -1)
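# Usage sketch (not from the source): evaluating the multi-output MyAlgorithm above.
# Assumes the standard podpac API (podpac.Coordinates, podpac.clinspace) and uses
# podpac.algorithm.Arange / SinCoords as illustrative input nodes.
import podpac
from podpac.algorithm import Arange, SinCoords

coords = podpac.Coordinates(
    [podpac.clinspace(40, 50, 11), podpac.clinspace(-80, -70, 11)], dims=["lat", "lon"]
)
node = MyAlgorithm(x=Arange(), y=SinCoords())
result = node.eval(coords)        # UnitsDataArray with an extra 'output' dimension
sums = result.sel(output="sum")   # select the 'sum' output declared in `outputs`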
class UnaryAlgorithm(BaseAlgorithm):
    """
    Base class for computation nodes that take a single source and transform it.

    Attributes
    ----------
    source : Node
        The source node

    Notes
    -----
    Developers of new Algorithm nodes need to implement the `eval` method.
    """

    source = NodeTrait().tag(attr=True, required=True)

    # list of attribute names, used by __repr__ and __str__ to display minimal info about the node
    _repr_keys = ["source"]

    @tl.default("outputs")
    def _default_outputs(self):
        return self.source.outputs

    @tl.default("style")
    def _default_style(self):
        # Pass through the source style by default
        return self.source.style
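# Hedged sketch (not from the source): a minimal UnaryAlgorithm subclass. Per the docstring
# above, subclasses provide their own `eval`; this hypothetical node evaluates its source and
# doubles the result, skipping the caching/output bookkeeping a production node would add.
class Doubled(UnaryAlgorithm):
    def eval(self, coordinates, **kwargs):
        return self.source.eval(coordinates, **kwargs) * 2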
class Reproject(Interpolate):
    """
    Create an Algorithm that evaluates a Node with one set of coordinates and then interpolates it.
    This can be used to bilinearly interpolate an averaged dataset, for example.

    Attributes
    ----------
    source : Node
        The source node. This node will use its own, specified interpolation scheme.
    interpolation : str
        Type of interpolation method to use for the interpolation
    coordinates : Coordinates, Node, str, dict
        Coordinates used to evaluate the source. These can be specified as a dictionary, json-formatted string,
        PODPAC Coordinates, or a PODPAC Node, where the node MUST implement the 'coordinates' attribute.
    """

    coordinates = tl.Union(
        [NodeTrait(), tl.Dict(), tl.Unicode(), tl.Instance(Coordinates)],
        help="""Coordinates used to evaluate the source. These can be specified as a dictionary, json-formatted
               string, PODPAC Coordinates, or a PODPAC Node, where the node MUST implement the
               'coordinates' attribute""",
    ).tag(attr=True)

    @tl.validate("coordinates")
    def _validate_coordinates(self, d):
        if isinstance(d["value"], Node) and not hasattr(d["value"], "coordinates"):
            raise ValueError(
                "When specifying the coordinates as a PODPAC Node, this Node must have a 'coordinates' attribute"
            )
        return d["value"]

    @property
    def _coordinates(self):
        if isinstance(self.coordinates, Coordinates):
            return self.coordinates
        elif isinstance(self.coordinates, Node):
            return self.coordinates.coordinates
        elif isinstance(self.coordinates, dict):
            return Coordinates.from_definition(self.coordinates)
        elif isinstance(self.coordinates, string_types):
            return Coordinates.from_json(self.coordinates)
        else:
            raise TypeError("The coordinates attribute is of the wrong type.")

    def _source_eval(self, coordinates, selector, output=None):
        return self.source.eval(self._coordinates, output=output, _selector=selector)

    @property
    def base_ref(self):
        return "{}_reprojected".format(self.source.base_ref)
class Process(Node):
    """
    Source node will be evaluated in another process, and it is blocking!
    """

    source = NodeTrait().tag(attr=True)
    output_format = tl.Dict(None, allow_none=True).tag(attr=True)
    timeout = tl.Int(None, allow_none=True)
    block = tl.Bool(True)

    @property
    def outputs(self):
        return self.source.outputs

    def eval(self, coordinates, **kwargs):
        output = kwargs.get("output")
        definition = self.source.json
        coords = coordinates.json

        q = Queue()
        process = mpProcess(target=_f, args=(definition, coords, q, self.output_format))
        process.daemon = True
        _log.debug("Starting process.")
        process.start()
        _log.debug("Retrieving data from queue.")
        o = q.get(timeout=self.timeout, block=self.block)
        _log.debug("Joining.")
        process.join()  # This is blocking!
        _log.debug("Closing.")
        if (sys.version_info.major + sys.version_info.minor / 10.0) >= 3.7:
            process.close()  # New in Python 3.7

        if isinstance(o, str):
            raise Exception(o)
        if o is None:
            return
        o = o._pp_deserialize()
        if output is not None:
            output[:] = o.data[:]
        else:
            output = o
        return output
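# Usage sketch (assumption, not from the source): wrapping a node so it evaluates in a separate
# process. `Arange` is just a stand-in source node, and the import path for Process is assumed.
import podpac
from podpac.algorithm import Arange
from podpac.managers import Process  # assumed import path for the Process node above

node = Process(source=Arange(), timeout=60)
coords = podpac.Coordinates([podpac.clinspace(0, 1, 10), podpac.clinspace(0, 1, 10)], dims=["lat", "lon"])
output = node.eval(coords)  # blocks until the child process returns its result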
class MyAlgorithm(BaseAlgorithm):
    x = NodeTrait().tag(attr=True)
    y = NodeTrait().tag(attr=True)
class MyNode(Node):
    my_attr = NodeTrait().tag(attr=True)
class Mask(Algorithm):
    """
    Masks the `source` based on a boolean expression involving the `mask`
    (i.e. source[mask <bool_op> <bool_val>] = <masked_val>).
    For a normal boolean mask input, default values for `bool_op`, `bool_val` and `masked_val` can be used.

    Attributes
    ----------
    source : podpac.Node
        The source that will be masked
    mask : podpac.Node
        The data that will be used to compute the mask
    masked_val : float, optional
        Default value is np.nan. The value that will replace the masked items.
    bool_val : float, optional
        Default value is 1. The value used to compare the mask when creating the boolean expression
    bool_op : enum, optional
        Default value is '=='. One of ['==', '<', '<=', '>', '>=']
    in_place : bool, optional
        Default is False. If True, the source array will be changed in-place, which could affect the value of the
        source in other parts of the pipeline.

    Examples
    --------
    # Mask data from a boolean data node using the default behavior.
    # Create a boolean masked Node (as an example)
    b = Arithmetic(A=SinCoords(), eqn='A>0')
    # Create the source node
    a = Arange()
    masked = Mask(source=a, mask=b)

    # Create a node that makes the following substitution: "a[b > 0] = np.nan"
    a = Arange()
    b = SinCoords()
    masked = Mask(source=a, mask=b, masked_val=np.nan, bool_val=0, bool_op='>', in_place=True)
    """

    source = NodeTrait().tag(attr=True, required=True)
    mask = NodeTrait().tag(attr=True, required=True)
    masked_val = tl.Float(allow_none=True, default_value=None).tag(attr=True)
    bool_val = tl.Float(1).tag(attr=True)
    bool_op = tl.Enum(["==", "<", "<=", ">", ">="], default_value="==").tag(attr=True)
    in_place = tl.Bool(False).tag(attr=True)

    _repr_keys = ["source", "mask"]

    def algorithm(self, inputs, coordinates):
        """
        Sets the values in inputs['source'] to self.masked_val using (inputs['mask'] <self.bool_op> <self.bool_val>)

        Parameters
        ----------
        inputs : dict
            Evaluated outputs of the input nodes. The keys are the attribute names.
        coordinates : podpac.Coordinates
            Requested coordinates. Note that the inputs may have different coordinates.

        Returns
        -------
        result : UnitsDataArray
            Algorithm result.
        """
        # shorter names
        mask = inputs["mask"]
        source = inputs["source"]
        op = self.bool_op
        bv = self.bool_val

        # Make a copy if we don't want to change the source in-place
        if not self.in_place:
            source = source.copy()

        # Make the mask boolean
        if op == "==":
            mask = mask == bv
        elif op == "<":
            mask = mask < bv
        elif op == "<=":
            mask = mask <= bv
        elif op == ">":
            mask = mask > bv
        elif op == ">=":
            mask = mask >= bv

        # Mask the values and return
        if self.masked_val is None:
            source.set(np.nan, mask)
        else:
            source.set(self.masked_val, mask)

        return source
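# Hedged follow-on to the docstring examples above: building and evaluating a masked node.
# Coordinates and the input nodes are illustrative only.
import podpac
from podpac.algorithm import Arange, SinCoords

coords = podpac.Coordinates([podpac.clinspace(-1, 1, 21), podpac.clinspace(-1, 1, 21)], dims=["lat", "lon"])
masked = Mask(source=Arange(), mask=SinCoords(), bool_op=">", bool_val=0)  # a[sin > 0] = nan
out = masked.eval(coords)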
class MyClass(tl.HasTraits):
    node = NodeTrait()
class ReprojectedSource(DataSource):
    """Create a DataSource with a different resolution from another Node.
    This can be used to bilinearly interpolate a dataset after averaging over a larger area.

    Attributes
    ----------
    source : Node
        The source node
    source_interpolation : str
        Type of interpolation method to use for the source node
    reprojected_coordinates : :class:`podpac.Coordinates`
        Coordinates where the source node should be evaluated.
    """

    source = NodeTrait().tag(attr=True, required=True)
    source_interpolation = InterpolationTrait().tag(attr=True)
    reprojected_coordinates = tl.Instance(Coordinates).tag(attr=True, required=True)

    # list of attribute names, used by __repr__ and __str__ to display minimal info about the node
    _repr_keys = ["source", "interpolation"]

    def _first_init(self, **kwargs):
        warnings.warn(
            "ReprojectedSource has been replaced by the Reproject algorithm node "
            "and will be removed in a future version of podpac.",
            DeprecationWarning,
        )

        if "reprojected_coordinates" in kwargs:
            if isinstance(kwargs["reprojected_coordinates"], dict):
                kwargs["reprojected_coordinates"] = Coordinates.from_definition(kwargs["reprojected_coordinates"])
            elif isinstance(kwargs["reprojected_coordinates"], string_types):
                kwargs["reprojected_coordinates"] = Coordinates.from_json(kwargs["reprojected_coordinates"])

        return super(ReprojectedSource, self)._first_init(**kwargs)

    @cached_property
    def eval_source(self):
        if self.source_interpolation is not None and not self.source.has_trait("interpolation"):
            _logger.warning(
                "ReprojectedSource cannot set the 'source_interpolation' since 'source' does not have an "
                "'interpolation' trait.\n type(source): %s\nsource: %s" % (str(type(self.source)), str(self.source))
            )

        source = self.source
        if (
            self.source_interpolation is not None
            and self.source.has_trait("interpolation")
            and self.source_interpolation != self.source.interpolation
        ):
            source = copy.deepcopy(source)
            source.set_trait("interpolation", self.source_interpolation)

        return source

    @common_doc(COMMON_DATA_DOC)
    def get_coordinates(self):
        """{get_coordinates}"""
        # cannot guarantee that coordinates exist
        if not isinstance(self.source, DataSource):
            return self.reprojected_coordinates

        sc = self.source.coordinates
        rc = self.reprojected_coordinates
        return Coordinates(
            [rc[dim] if dim in rc.dims else self.source.coordinates[dim] for dim in self.source.coordinates.dims],
            validate_crs=False,
        )

    @common_doc(COMMON_DATA_DOC)
    def get_data(self, coordinates, coordinates_index):
        """{get_data}"""
        data = self.eval_source.eval(coordinates)

        # The following is needed in case the source is an algorithm or compositor node that doesn't have
        # all the dimensions of the reprojected coordinates
        # TODO: What if data has coordinates that reprojected_coordinates doesn't have?
        keep_dims = list(data.coords.keys())
        drop_dims = [d for d in coordinates.dims if d not in keep_dims]
        coordinates.drop(drop_dims)
        return data

    @property
    def base_ref(self):
        return "{}_reprojected".format(self.source.base_ref)
class BaseCompositor(Node):
    """A base class for compositor nodes.

    Attributes
    ----------
    sources : list
        Source nodes.
    source_coordinates : :class:`podpac.Coordinates`
        Coordinates that make each source unique. Must be the same size as ``sources`` and single-dimensional.
        Optional.
    multithreading : bool, optional
        Default is ``settings["MULTITHREADING"]``. If True, sources are evaluated in parallel using a thread pool.

    Notes
    -----
    Developers of compositor subclasses need to implement the `composite` method.

    Multithreading::

      * When MULTITHREADING is False, the compositor stops evaluating sources once the output is completely filled.
      * When MULTITHREADING is True, the compositor must evaluate every source. The result is the same, but note
        that because of this, disabling multithreading could sometimes be faster, especially if the number of
        threads is low.
      * NASA data servers seem to have a hard limit of 10 simultaneous requests, so a max of 10 threads is
        recommended for most use-cases.
    """

    sources = tl.List(trait=NodeTrait()).tag(attr=True, required=True)
    source_coordinates = tl.Instance(Coordinates, allow_none=True, default_value=None).tag(attr=True)
    multithreading = tl.Bool(False)

    @tl.default("multithreading")
    def _default_multithreading(self):
        return settings["MULTITHREADING"]

    dims = tl.List(trait=Dimension()).tag(attr=True)
    auto_outputs = tl.Bool(False)

    # debug traits
    _eval_sources = tl.Any()

    @tl.validate("sources")
    def _validate_sources(self, d):
        sources = d["value"]

        n = np.sum([source.outputs is None for source in sources])
        if not (n == 0 or n == len(sources)):
            raise ValueError(
                "Cannot composite standard sources with multi-output sources. "
                "The sources must all be standard single-output nodes or all multi-output nodes."
            )

        return sources

    @tl.validate("source_coordinates")
    def _validate_source_coordinates(self, d):
        if d["value"] is None:
            return None

        if d["value"].ndim != 1:
            raise ValueError("Invalid source_coordinates, invalid ndim (%d != 1)" % d["value"].ndim)

        if d["value"].size != len(self.sources):
            raise ValueError(
                "Invalid source_coordinates, source and source_coordinates size mismatch (%d != %d)"
                % (d["value"].size, len(self.sources))
            )

        return d["value"]

    @tl.default("outputs")
    def _default_outputs(self):
        if not self.auto_outputs:
            return None

        # autodetect outputs from sources
        if all(source.outputs is None for source in self.sources):
            outputs = None

        elif all(source.outputs is not None and source.output is None for source in self.sources):
            outputs = []
            for source in self.sources:
                for output in source.outputs:
                    if output not in outputs:
                        outputs.append(output)
            if len(outputs) == 0:
                outputs = None

        else:
            raise RuntimeError(
                "Compositor sources were not validated correctly. "
                "Cannot composite standard sources with multi-output sources."
            )

        return outputs

    def select_sources(self, coordinates, _selector=None):
        """Select and prepare sources based on requested coordinates.

        Parameters
        ----------
        coordinates : :class:`podpac.Coordinates`
            Coordinates to evaluate at compositor sources
        _selector : :class:`podpac.core.interpolation.selectors.Selector`
            Selector used to sub-select sources based on the interpolation scheme

        Returns
        -------
        sources : :class:`np.ndarray`
            Array of sources

        Notes
        -----
        * If :attr:`source_coordinates` is defined, only sources that intersect the requested coordinates
          are selected.
        """

        # select intersecting sources, if possible
        if self.source_coordinates is None:
            sources = self.sources
        else:
            try:
                if _selector is not None:
                    _, I = _selector(self.source_coordinates, coordinates, index_type="numpy")
                else:
                    _, I = self.source_coordinates.intersect(coordinates, outer=True, return_index=True)
            except:
                # Likely non-monotonic coordinates
                _, I = self.source_coordinates.intersect(coordinates, outer=False, return_index=True)
            i = I[0]
            sources = np.array(self.sources)[i].tolist()

        return sources

    def composite(self, coordinates, data_arrays, result=None):
        """Implements the rules for compositing multiple sources together. Must be implemented by child classes.

        Parameters
        ----------
        coordinates : :class:`podpac.Coordinates`
            {requested_coordinates}
        data_arrays : generator
            Evaluated data, in the same order as the sources. Yields a UnitsDataArray.
        result : UnitsDataArray, optional
            An optional pre-filled array may be supplied, otherwise the output will be allocated.

        Returns
        -------
        {eval_return}
        """

        raise NotImplementedError()

    def iteroutputs(self, coordinates, _selector=None):
        """Evaluate the sources and yield their outputs.

        Parameters
        ----------
        coordinates : :class:`podpac.Coordinates`
            Coordinates to evaluate at compositor sources

        Yields
        ------
        :class:`podpac.core.units.UnitsDataArray`
            Output from source node eval method
        """

        # get sources, potentially downselected
        sources = self.select_sources(coordinates, _selector)

        if settings["DEBUG"]:
            self._eval_sources = sources

        if len(sources) == 0:
            yield self.create_output_array(coordinates)
            return

        if self.multithreading:
            n_threads = thread_manager.request_n_threads(len(sources))
            if n_threads == 1:
                thread_manager.release_n_threads(n_threads)
        else:
            n_threads = 0

        if self.multithreading and n_threads > 1:
            # evaluate nodes in parallel using thread pool
            self._multi_threaded = True
            pool = thread_manager.get_thread_pool(processes=n_threads)
            outputs = pool.map(lambda src: src.eval(coordinates, _selector=_selector), sources)
            pool.close()
            thread_manager.release_n_threads(n_threads)
            for output in outputs:
                yield output

        else:
            # evaluate nodes serially
            self._multi_threaded = False
            for src in sources:
                yield src.eval(coordinates, _selector=_selector)

    @common_doc(COMMON_COMPOSITOR_DOC)
    def eval(self, coordinates, **kwargs):
        """
        Wraps the super Node.eval method in order to cache with the correct coordinates.

        The output is independent of any extra dimensions, so this removes extra dimensions before caching in the
        super eval method.
        """

        super_coordinates = coordinates

        # remove extra dimensions
        if self.dims:
            extra = [
                c.name
                for c in coordinates.values()
                if (isinstance(c, Coordinates1d) and c.name not in self.dims)
                or (isinstance(c, StackedCoordinates) and all(dim not in self.dims for dim in c.dims))
            ]
            super_coordinates = super_coordinates.drop(extra)

        # note: super().eval (not self._eval)
        output = super().eval(super_coordinates, **kwargs)

        if settings["DEBUG"]:
            self._requested_coordinates = coordinates

        return output

    @common_doc(COMMON_COMPOSITOR_DOC)
    def _eval(self, coordinates, output=None, _selector=None):
        """Evaluates this node using the supplied coordinates.

        Parameters
        ----------
        coordinates : :class:`podpac.Coordinates`
            {requested_coordinates}
        output : podpac.UnitsDataArray, optional
            {eval_output}
        _selector : callable(coordinates, request_coordinates)
            {eval_selector}

        Returns
        -------
        {eval_return}
        """

        self._evaluated_coordinates = coordinates
        outputs = self.iteroutputs(coordinates, _selector)
        output = self.composite(coordinates, outputs, output)
        return output

    def find_coordinates(self):
        """
        Get the available coordinates for the Node.

        Returns
        -------
        coords_list : list
            available coordinates from all of the sources.
        """

        return [coords for source in self.sources for coords in source.find_coordinates()]

    @property
    def _repr_keys(self):
        """list of attribute names, used by __repr__ and __str__ to display minimal info about the node"""
        keys = []
        if self.trait_is_defined("sources"):
            keys.append("sources")
        return keys
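# Illustrative sketch (not from the source): a toy compositor that averages its sources.
# Real compositors implement `composite` similarly; this hypothetical class ignores
# partial-coverage and NaN handling for brevity, and the example sources are arbitrary.
from podpac.algorithm import Arange, SinCoords

class MeanCompositor(BaseCompositor):
    def composite(self, coordinates, data_arrays, result=None):
        total, n = None, 0
        for data in data_arrays:  # generator of UnitsDataArray, one per source
            total = data if total is None else total + data
            n += 1
        mean = total / n
        if result is None:
            return mean
        result[:] = mean.transpose(*result.dims).data
        return result

node = MeanCompositor(sources=[Arange(), SinCoords()])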
class Parallel(Node):
    """
    This class launches the parallel node evaluations in separate threads. As such, the node does not need to
    return immediately (i.e. it does NOT have to be asynchronous). For asynchronous nodes
    (i.e. aws.Lambda with download_result=False) use ParallelAsync.

    Attributes
    ----------
    chunks : dict
        Dictionary of dimensions and sizes that will be iterated over. If a dimension is not in this dictionary,
        the size of the eval coordinates will be used for the chunk. In this case, it may not be possible to
        automatically set the coordinates of missing dimensions in the final file.
    fill_output : bool
        Default is True. When True, the final results will be assembled and returned to the user. If False, the
        final results should be written to a file by specifying the output_format in a Process or Lambda node.
        See note below.
    source : podpac.Node
        The source dataset for the computation
    number_of_workers : int
        Default is 1. Number of parallel process workers at one time.
    start_i : int, optional
        Default is 0. Starting chunk. This allows you to restart a run without having to check/submit 1000's of
        workers before getting back to where you were. Empty chunks make the submission slower.

    Notes
    -----
    In some cases where the input and output coordinates of the source node are not the same (such as reduce
    nodes) and fill_output is True, the user may need to specify 'output' as part of the eval call.
    """

    _repr_keys = ["source", "number_of_workers", "chunks"]

    source = NodeTrait().tag(attr=True)
    chunks = tl.Dict().tag(attr=True)
    fill_output = tl.Bool(True).tag(attr=True)
    number_of_workers = tl.Int(1).tag(attr=True)
    _lock = Lock()
    errors = tl.List()
    start_i = tl.Int(0)

    def eval(self, coordinates, **kwargs):
        output = kwargs.get("output")
        # Make a thread pool to manage queue
        pool = ThreadPool(processes=self.number_of_workers)

        if output is None and self.fill_output:
            output = self.create_output_array(coordinates)

        shape = []
        for d in coordinates.dims:
            if d in self.chunks:
                shape.append(self.chunks[d])
            else:
                shape.append(coordinates[d].size)

        results = []
        # inputs = []
        i = 0
        for coords, slc in coordinates.iterchunks(shape, True):
            # inputs.append(coords)
            if i < self.start_i:
                _log.debug("Skipping {} since it is less than self.start_i ({})".format(i, self.start_i))
                i += 1
                continue

            out = None
            if self.fill_output and output is not None:
                out = output[slc]
            with self._lock:
                _log.debug("Added {} to worker pool".format(i))
                _log.debug("Node eval with coords: {}, {}".format(slc, coords))
                results.append(pool.apply_async(self.eval_source, [coords, slc, out, i]))
            i += 1

        _log.info("Added all chunks to worker pool. Now waiting for results.")
        start_time = time.time()
        for i, res in enumerate(results):
            # _log.debug('Waiting for results: {} {}'.format(i, inputs[i]))
            dt = str(np.timedelta64(int(1000 * (time.time() - start_time)), "ms").astype(object))
            _log.info("({}): Waiting for results: {} / {}".format(dt, i + 1, len(results)))

            # Try to get the results / wait for the results
            try:
                o, slc = res.get()
            except Exception as e:
                o = None
                slc = None
                self.errors.append((i, res, e))
                dt = str(np.timedelta64(int(1000 * (time.time() - start_time)), "ms").astype(object))
                _log.warning("({}) {} failed with exception {}".format(dt, i, e))

            dt = str(np.timedelta64(int(1000 * (time.time() - start_time)), "ms").astype(object))
            _log.info("({}) Finished result: {} / {}".format(dt, i + 1, len(results)))

            # Fill output
            if self.fill_output:
                if output is None:
                    missing_dims = [d for d in coordinates.dims if d not in self.chunks.keys()]
                    coords = coordinates.drop(missing_dims)
                    missing_coords = Coordinates.from_xarray(o).drop(list(self.chunks.keys()))
                    coords = merge_dims([coords, missing_coords])
                    coords = coords.transpose(*coordinates.dims)
                    output = self.create_output_array(coords)
                output[slc] = o

        _log.info("Completed parallel execution.")
        pool.close()

        return output

    def eval_source(self, coordinates, coordinates_index, out, i, source=None):
        if source is None:
            source = self.source
            # Make a copy to prevent any possibility of memory corruption
            source = Node.from_definition(source.definition)

        _log.info("Submitting source {}".format(i))
        return (source.eval(coordinates, output=out), coordinates_index)
class ZarrOutputMixin(tl.HasTraits):
    """
    This class assumes that the node has an 'output_format' attribute
    (currently the "Lambda" Node and the "Process" Node).

    Attributes
    ----------
    zarr_file : str
        Path to the output zarr file that collects all of the computed results. This can reside on S3.
    dataset : ZarrGroup
        A handle to the zarr group pointing to the output file
    fill_output : bool, optional
        Default is False (unlike parent class). If True, will collect the output data and return it as an xarray.
    init_file_mode : str, optional
        Default is 'a'. Mode used for initializing the zarr file.
    zarr_chunks : dict
        Size of the chunks in the zarr file for each dimension
    zarr_shape : dict, optional
        Default is {coordinates.dims: coordinates.shape}, where coordinates are the coordinates used as part of
        the eval call. This does not need to be specified unless the Node modifies the input coordinates (as part
        of a Reduce operation, for example). The result can be incorrect and requires care/checking by the user.
    zarr_coordinates : podpac.Coordinates, optional
        Default is None. If the node modifies the shape of the input coordinates, this allows users to set the
        coordinates in the output zarr file. This can be incorrect and requires care by the user.
    skip_existing : bool
        Default is True. If True, this will check to see if the results already exist, and if so, it will not
        submit a job for that particular coordinate evaluation. This assumes self.chunks == self.zarr_chunks.
    list_dir : bool, optional
        Default is False. If skip_existing is True, by default existing files are checked by asking for an
        'exists' call. If list_dir is True, then at the first opportunity a "list_dir" is performed on the
        directory and the results are cached.
    """

    zarr_file = tl.Unicode().tag(attr=True)
    dataset = tl.Any()
    zarr_node = NodeTrait()
    zarr_data_key = tl.Union([tl.Unicode(), tl.List()])
    fill_output = tl.Bool(False)
    init_file_mode = tl.Unicode("a").tag(attr=True)
    zarr_chunks = tl.Dict(default_value=None, allow_none=True).tag(attr=True)
    zarr_shape = tl.Dict(allow_none=True, default_value=None).tag(attr=True)
    zarr_coordinates = tl.Instance(Coordinates, allow_none=True, default_value=None).tag(attr=True)
    zarr_dtype = tl.Unicode("f4")
    skip_existing = tl.Bool(True).tag(attr=True)
    list_dir = tl.Bool(False)
    _list_dir = tl.List(allow_none=True, default_value=[])
    _shape = tl.Tuple()
    _chunks = tl.List()
    aws_client_kwargs = tl.Dict()
    aws_config_kwargs = tl.Dict()

    def eval(self, coordinates, **kwargs):
        output = kwargs.get("output")
        if self.zarr_shape is None:
            self._shape = coordinates.shape
        else:
            self._shape = tuple(self.zarr_shape.values())

        # initialize zarr file
        if self.zarr_chunks is None:
            chunks = [self.chunks[d] for d in coordinates]
        else:
            chunks = [self.zarr_chunks[d] for d in coordinates]
        self._chunks = chunks
        zf, data_key, zn = self.initialize_zarr_array(self._shape, chunks)
        self.dataset = zf
        self.zarr_data_key = data_key
        self.zarr_node = zn
        zn.keys

        # eval
        _log.debug("Starting parallel eval.")
        missing_dims = [d for d in coordinates.dims if d not in self.chunks.keys()]
        if self.zarr_coordinates is not None:
            missing_dims = missing_dims + [d for d in self.zarr_coordinates.dims if d not in missing_dims]
            set_coords = merge_dims([coordinates.drop(missing_dims), self.zarr_coordinates])
        else:
            set_coords = coordinates.drop(missing_dims)
        set_coords.transpose(*coordinates.dims)

        self.set_zarr_coordinates(set_coords, data_key)
        if self.list_dir:
            dk = data_key
            if isinstance(dk, list):
                dk = dk[0]
            self._list_dir = self.zarr_node.list_dir(dk)

        output = super(ZarrOutputMixin, self).eval(coordinates, output=output)

        # fill in the coordinates, this is guaranteed to be correct even if the user messed up.
        if output is not None:
            self.set_zarr_coordinates(Coordinates.from_xarray(output), data_key)
        else:
            return zf

        return output

    def set_zarr_coordinates(self, coordinates, data_key):
        # Fill in metadata
        for dk in data_key:
            self.dataset[dk].attrs["_ARRAY_DIMENSIONS"] = coordinates.dims
        for d in coordinates.dims:
            # TODO ADD UNITS AND TIME DECODING INFORMATION
            self.dataset.create_dataset(d, shape=coordinates[d].size, overwrite=True)
            self.dataset[d][:] = coordinates[d].coordinates

    def initialize_zarr_array(self, shape, chunks):
        _log.debug("Creating Zarr file.")
        zn = Zarr(source=self.zarr_file, file_mode=self.init_file_mode, aws_client_kwargs=self.aws_client_kwargs)
        if self.source.output or getattr(self.source, "data_key", None):
            data_key = self.source.output
            if data_key is None:
                data_key = self.source.data_key
            if not isinstance(data_key, list):
                data_key = [data_key]
            elif self.source.outputs:  # If someone restricted the outputs for this node, we need to know
                data_key = [dk for dk in data_key if dk in self.source.outputs]
        elif self.source.outputs:
            data_key = self.source.outputs
        else:
            data_key = ["data"]

        zf = zarr.open(zn._get_store(), mode=self.init_file_mode)

        # Initialize the output zarr arrays
        for dk in data_key:
            try:
                arr = zf.create_dataset(
                    dk,
                    shape=shape,
                    chunks=chunks,
                    fill_value=np.nan,
                    dtype=self.zarr_dtype,
                    overwrite=not self.skip_existing,
                )
            except ValueError:
                pass  # Dataset already exists

        # Recompute any cached properties
        zn = Zarr(source=self.zarr_file, file_mode=self.init_file_mode, aws_client_kwargs=self.aws_client_kwargs)
        return zf, data_key, zn

    def eval_source(self, coordinates, coordinates_index, out, i, source=None):
        if source is None:
            source = self.source

        if self.skip_existing:  # This section allows previously computed chunks to be skipped
            dk = self.zarr_data_key
            if isinstance(dk, list):
                dk = dk[0]
            try:
                exists = self.zarr_node.chunk_exists(
                    coordinates_index, data_key=dk, list_dir=self._list_dir, chunks=self._chunks
                )
            except ValueError as e:
                # This was needed in cases where a poor internet connection caused read errors
                exists = False
            if exists:
                _log.info("Skipping {} (already exists)".format(i))
                return out, coordinates_index

        # Make a copy to prevent any possibility of memory corruption
        source = Node.from_definition(source.definition)
        _log.debug("Creating output format.")
        output = dict(
            format="zarr_part",
            format_kwargs=dict(
                part=[[s.start, min(s.stop, self._shape[i]), s.step] for i, s in enumerate(coordinates_index)],
                source=self.zarr_file,
                mode="a",
            ),
        )
        _log.debug("Finished creating output format.")

        if source.has_trait("output_format"):
            source.set_trait("output_format", output)
        _log.debug("output: {}, coordinates.shape: {}".format(output, coordinates.shape))
        _log.debug("Evaluating node.")
        o, slc = super(ZarrOutputMixin, self).eval_source(coordinates, coordinates_index, out, i, source)

        if not source.has_trait("output_format"):
            o.to_format(output["format"], **output["format_kwargs"])
        return o, slc
class ParallelAsync(Parallel):
    """
    This class launches the parallel node evaluations in threads up to n_workers, and expects the node.eval to
    return quickly for parallel execution. This Node was written with aws.Lambda(eval_timeout=1.25) nodes in mind.

    Users can implement the `check_worker_available` method or specify the `no_worker_exception` attribute, which
    is an exception thrown if workers are not available.

    Attributes
    ----------
    chunks : dict
        Dictionary of dimensions and sizes that will be iterated over. If a dimension is not in this dictionary,
        the size of the eval coordinates will be used for the chunk. In this case, it may not be possible to
        automatically set the coordinates of missing dimensions in the final file.
    fill_output : bool
        Default is True. When True, the final results will be assembled and returned to the user. If False, the
        final results should be written to a file by specifying the output_format in a Process or Lambda node.
        See note below.
    source : podpac.Node
        The source dataset for the computation
    sleep_time : float
        Default is 1 second. Number of seconds to sleep between trying to submit new workers
    no_worker_exception : Exception, optional
        Default is botocore.exceptions.ClientError. Exception class used to identify when a submission failed due
        to no available workers. The default is chosen to work with the podpac.managers.Lambda node.
    async_exception : Exception
        Default is botocore.exceptions.ReadTimeoutError. This is an exception thrown by the async function in case
        it times out waiting for a return. In our case, this is a success. The default is chosen to work with the
        podpac.managers.Lambda node.

    Notes
    -----
    In some cases where the input and output coordinates of the source node are not the same (such as reduce
    nodes) and fill_output is True, the user may need to specify 'output' as part of the eval call.
    """

    source = NodeTrait().tag(attr=True)
    chunks = tl.Dict().tag(attr=True)
    fill_output = tl.Bool(True).tag(attr=True)
    sleep_time = tl.Float(1).tag(attr=True)
    no_worker_exception = tl.Type(botocore.exceptions.ClientError).tag(attr=True)
    async_exception = tl.Type(botocore.exceptions.ReadTimeoutError).tag(attr=True)

    def check_worker_available(self):
        return True

    def eval_source(self, coordinates, coordinates_index, out, i, source=None):
        if source is None:
            source = self.source
            # Make a copy to prevent any possibility of memory corruption
            source = Node.from_definition(source.definition)

        success = False
        o = None
        while not success:
            if self.check_worker_available():
                try:
                    o = source.eval(coordinates, output=out)
                    success = True
                except self.async_exception:
                    # This exception is fine and constitutes a success
                    o = None
                    success = True
                except self.no_worker_exception as e:
                    response = e.response
                    if not (response and response.get("Error", {}).get("Code") == "TooManyRequestsException"):
                        raise e  # Raise error again, not the right error
                    _log.debug("Worker {} exception {}".format(i, e))
                    success = False
                    time.sleep(self.sleep_time)
            else:
                _log.debug("Worker unavailable for {}".format(i))
                time.sleep(self.sleep_time)

        _log.info("Submitting source {}".format(i))
        return (o, coordinates_index)
class Interpolate(Node):
    """Node used to interpolate from self.source.coordinates to the user-specified, evaluated coordinates.

    Parameters
    ----------
    source : Any
        The source node which will be interpolated
    interpolation : str, dict, optional
        Interpolation definition for the data source.
        By default, the interpolation method is set to ``'nearest'`` for all dimensions.

        If input is a string, it must match one of the interpolation shortcuts defined in
        :attr:`podpac.data.INTERPOLATION_SHORTCUTS`. The interpolation method associated with this string will be
        applied to all dimensions at the same time.

        If input is a dict or list of dicts, the dict or dict elements must adhere to the following format:

        The key ``'method'`` defines the interpolation method name. If the interpolation method is not one of
        :attr:`podpac.data.INTERPOLATION_SHORTCUTS`, a second key ``'interpolators'`` must be defined with a list
        of :class:`podpac.interpolators.Interpolator` classes to use, in order of usage. The dictionary may contain
        an optional ``'params'`` key which contains a dict of parameters to pass along to the
        :class:`podpac.interpolators.Interpolator` classes associated with the interpolation method.

        The dict may contain the key ``'dims'`` which specifies dimension names (i.e. ``'time'`` or
        ``('lat', 'lon')``). If the dictionary does not contain a key for all unstacked dimensions of the source
        coordinates, the :attr:`podpac.data.INTERPOLATION_DEFAULT` value will be used. All dimension keys must be
        unstacked even if the underlying coordinate dimensions are stacked. Any extra dimensions included but not
        found in the source coordinates will be ignored.

        If input is a :class:`podpac.data.Interpolation` class, this Interpolation class will be used without
        modification.
    cache_output : bool
        Should the node's output be cached? If not provided or None, uses default based on
        settings["CACHE_DATASOURCE_OUTPUT_DEFAULT"]. If True, outputs will be cached and retrieved from cache.
        If False, outputs will not be cached OR retrieved from cache (even if they exist in cache).

    Examples
    --------
    # To use bilinear interpolation for [lat, lon], a specific interpolator for [time], and the default for [alt]:
    >>> interp_node = Interpolate(
            source=some_node,
            interpolation=[
                {
                    'method': 'bilinear',
                    'dims': ['lat', 'lon']
                },
                {
                    'method': [podpac.interpolators.NearestNeighbor],
                    'dims': ['time']
                }
            ]
        )
    """

    source = NodeTrait(allow_none=True).tag(attr=True)
    _source_xr = tl.Instance(UnitsDataArray, allow_none=True)  # This is needed for the Interpolation Mixin

    interpolation = InterpolationTrait().tag(attr=True)
    cache_output = tl.Bool()

    # privates
    _interpolation = tl.Instance(InterpolationManager)
    _coordinates = tl.Instance(Coordinates, allow_none=True, default_value=None, read_only=True)

    _requested_source_coordinates = tl.Instance(Coordinates)
    _requested_source_coordinates_index = tl.Tuple()
    _requested_source_data = tl.Instance(UnitsDataArray)
    _evaluated_coordinates = tl.Instance(Coordinates)

    # this adds a more helpful error message if the user happens to try to inspect _interpolation before evaluating
    @tl.default("_interpolation")
    def _default_interpolation(self):
        self._set_interpolation()
        return self._interpolation

    @tl.default("cache_output")
    def _cache_output_default(self):
        return settings["CACHE_NODE_OUTPUT_DEFAULT"]

    # --------------------------------------------------------------------------------------------------------------
    # Properties
    # --------------------------------------------------------------------------------------------------------------

    @property
    def interpolation_class(self):
        """Get the interpolation class currently set for this data source.

        The DataSource ``interpolation`` property is used to define the
        :class:`podpac.data.InterpolationManager` class that will handle interpolation for requested coordinates.

        Returns
        -------
        :class:`podpac.data.InterpolationManager`
            InterpolationManager class defined by DataSource ``interpolation`` definition
        """

        return self._interpolation

    @property
    def interpolators(self):
        """Return the interpolators selected for the previous node evaluation interpolation.
        If the node has not been evaluated, or if interpolation was not necessary, this will return an empty
        OrderedDict.

        Returns
        -------
        OrderedDict
            Keys are tuples of unstacked dimensions, the values are the interpolators used to interpolate these
            dimensions
        """

        if self._interpolation._last_interpolator_queue is not None:
            return self._interpolation._last_interpolator_queue
        else:
            return OrderedDict()

    def _set_interpolation(self):
        """Update _interpolation property"""

        # define interpolator with source coordinates dimensions
        if isinstance(self.interpolation, InterpolationManager):
            self._interpolation = self.interpolation
        else:
            self._interpolation = InterpolationManager(self.interpolation)

    def _eval(self, coordinates, output=None, _selector=None):
        """Evaluates this node using the supplied coordinates.

        The requested coordinates are mapped to the source coordinates and set to `_requested_source_coordinates`
        with the associated index `_requested_source_coordinates_index`. The requested source coordinates and index
        are passed to `get_data()`, returning the source data at those coordinates, which is set to
        `_requested_source_data`. Finally, `_requested_source_data` is interpolated using the `interpolate` method
        and set to the `output` attribute of the node.

        Parameters
        ----------
        coordinates : :class:`podpac.Coordinates`
            {requested_coordinates}

            An exception is raised if the requested coordinates are missing dimensions in the DataSource.
            Extra dimensions in the requested coordinates are dropped.
        output : :class:`podpac.UnitsDataArray`, optional
            {eval_output}
        _selector :
            {eval_selector}

        Returns
        -------
        {eval_return}

        Raises
        ------
        ValueError
            Cannot evaluate these coordinates
        """

        _logger.debug("Evaluating {} data source".format(self.__class__.__name__))

        # store requested coordinates for debugging
        if settings["DEBUG"]:
            self._original_requested_coordinates = coordinates

        # store input coordinates to evaluated coordinates
        self._evaluated_coordinates = deepcopy(coordinates)

        # reset interpolation
        self._set_interpolation()

        selector = self._interpolation.select_coordinates

        source_out = self._source_eval(self._evaluated_coordinates, selector)
        source_coords = Coordinates.from_xarray(source_out.coords, crs=source_out.crs)

        # Drop extra coordinates
        extra_dims = [d for d in coordinates.udims if d not in source_coords.udims]
        coordinates = coordinates.drop(extra_dims)

        # Transform so that interpolation happens on the source data coordinate system
        if source_coords.crs.lower() != coordinates.crs.lower():
            coordinates = coordinates.transform(source_coords.crs)

        if output is None:
            if "output" in source_out.dims:
                self.set_trait("outputs", source_out.coords["output"].data.tolist())
            output = self.create_output_array(coordinates)

        if source_out.size == 0:  # short cut
            return output

        # interpolate data into output
        output = self._interpolation.interpolate(source_coords, source_out, coordinates, output)

        # if the requested crs is different than the coordinates crs,
        # fabricate a new output with the original coordinates and new values
        if self._evaluated_coordinates.crs != coordinates.crs:
            output = self.create_output_array(self._evaluated_coordinates.drop(extra_dims), data=output[:].values)

        # save output to private for debugging
        if settings["DEBUG"]:
            self._output = output
            self._source_xr = source_out

        return output

    def _source_eval(self, coordinates, selector, output=None):
        if isinstance(self._source_xr, UnitsDataArray):
            return self._source_xr
        else:
            return self.source.eval(coordinates, output=output, _selector=selector)

    def find_coordinates(self):
        """
        Get the available coordinates for the Node. For a DataSource, this is just the coordinates.

        Returns
        -------
        coords_list : list
            singleton list containing the coordinates (Coordinates object)
        """

        return self.source.find_coordinates()
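# Usage sketch (assumption, not from the source): wrapping a data source with an explicit
# interpolation definition, matching the docstring format above. The Array data source,
# its keyword names, and the coordinates are illustrative.
import numpy as np
import podpac

lat = podpac.clinspace(0, 10, 11)
lon = podpac.clinspace(0, 10, 11)
source = podpac.data.Array(
    source=np.random.rand(11, 11),
    coordinates=podpac.Coordinates([lat, lon], dims=["lat", "lon"]),
)

node = Interpolate(source=source, interpolation=[{"method": "bilinear", "dims": ["lat", "lon"]}])
fine = podpac.Coordinates([podpac.clinspace(0, 10, 101), podpac.clinspace(0, 10, 101)], dims=["lat", "lon"])
out = node.eval(fine)  # source evaluated on its own grid, then bilinearly interpolated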
class Reproject(Interpolate):
    """
    Create an Algorithm that evaluates a Node with one set of coordinates and then interpolates it.
    This can be used to bilinearly interpolate an averaged dataset, for example.

    Attributes
    ----------
    source : Node
        The source node. This node will use its own, specified interpolation scheme.
    interpolation : str
        Type of interpolation method to use for the interpolation
    coordinates : Coordinates, Node, str, dict
        Coordinates used to evaluate the source. These can be specified as a dictionary, json-formatted string,
        PODPAC Coordinates, or a PODPAC Node, where the node MUST implement the 'coordinates' attribute.
    reproject_dims : list
        Dimensions to reproject. The source will be evaluated with the reprojection coordinates in these dims and
        the requested coordinates for any other dims.
    """

    coordinates = tl.Union(
        [NodeTrait(), tl.Dict(), tl.Unicode(), tl.Instance(Coordinates)],
        help="""Coordinates used to evaluate the source. These can be specified as a dictionary, json-formatted
               string, PODPAC Coordinates, or a PODPAC Node, where the node MUST implement the
               'coordinates' attribute""",
    ).tag(attr=True)

    reproject_dims = tl.List(trait=tl.Unicode(), allow_none=True, default_value=None).tag(attr=True)

    @tl.validate("coordinates")
    def _validate_coordinates(self, d):
        val = d["value"]
        if isinstance(val, Node):
            if not hasattr(val, "coordinates"):
                raise ValueError(
                    "When specifying the coordinates as a PODPAC Node, this Node must have a 'coordinates' attribute"
                )
        elif isinstance(val, dict):
            Coordinates.from_definition(self.coordinates)
        elif isinstance(val, string_types):
            Coordinates.from_json(self.coordinates)
        return val

    @cached_property
    def reprojection_coordinates(self):
        # get coordinates
        if isinstance(self.coordinates, Coordinates):
            coordinates = self.coordinates
        elif isinstance(self.coordinates, Node):
            coordinates = self.coordinates.coordinates
        elif isinstance(self.coordinates, dict):
            coordinates = Coordinates.from_definition(self.coordinates)
        elif isinstance(self.coordinates, string_types):
            coordinates = Coordinates.from_json(self.coordinates)

        # drop non-reprojection dims
        if self.reproject_dims is not None:
            coordinates = coordinates.drop([dim for dim in coordinates if dim not in self.reproject_dims])

        return coordinates

    def _source_eval(self, coordinates, selector, output=None):
        coords = self.reprojection_coordinates.intersect(coordinates, outer=True)
        extra_eval_coords = coordinates.drop(self.reproject_dims or self.reprojection_coordinates.dims)
        if coords.crs != coordinates.crs:
            # Better to evaluate in the reprojection coordinates' crs than the eval crs
            # for the next step of interpolation
            extra_eval_coords = extra_eval_coords.transform(coords.crs)
        coords = merge_dims([coords, extra_eval_coords])
        return self.source.eval(coords, output=output, _selector=selector)

    @property
    def base_ref(self):
        return "{}_reprojected".format(self.source.base_ref)
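# Usage sketch (assumption, not from the source): evaluate a source on a coarse reprojection
# grid, then interpolate to the requested coordinates. The grids and source node are illustrative.
import podpac
from podpac.algorithm import SinCoords

coarse = podpac.Coordinates([podpac.clinspace(0, 10, 6), podpac.clinspace(0, 10, 6)], dims=["lat", "lon"])
node = Reproject(source=SinCoords(), coordinates=coarse, interpolation="bilinear")

fine = podpac.Coordinates([podpac.clinspace(0, 10, 101), podpac.clinspace(0, 10, 101)], dims=["lat", "lon"])
out = node.eval(fine)  # SinCoords evaluated on `coarse`, bilinearly interpolated to `fine`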
class ModifyCoordinates(UnaryAlgorithm):
    """
    Base class for nodes that modify the requested coordinates before evaluation.

    Attributes
    ----------
    source : podpac.Node
        Source node that will be evaluated with the modified coordinates.
    coordinates_source : podpac.Node
        Node that supplies the available coordinates when necessary, optional. The source node is used by default.
    lat, lon, time, alt : List
        Modification parameters for the given dimension. Varies by node.
    """

    coordinates_source = NodeTrait().tag(attr=True)
    lat = tl.List().tag(attr=True)
    lon = tl.List().tag(attr=True)
    time = tl.List().tag(attr=True)
    alt = tl.List().tag(attr=True)
    substitute_eval_coords = tl.Bool(False).tag(attr=True)

    _modified_coordinates = tl.Instance(Coordinates, allow_none=True)

    @tl.default("coordinates_source")
    def _default_coordinates_source(self):
        return self.source

    @common_doc(COMMON_DOC)
    def _eval(self, coordinates, output=None, _selector=None):
        """Evaluates this node using the supplied coordinates.

        Parameters
        ----------
        coordinates : podpac.Coordinates
            {requested_coordinates}
        output : podpac.UnitsDataArray, optional
            {eval_output}
        _selector : callable(coordinates, request_coordinates)
            {eval_selector}

        Returns
        -------
        {eval_return}

        Notes
        -----
        The input coordinates are modified and then passed to the base class implementation of eval.
        """

        self._requested_coordinates = coordinates
        self._modified_coordinates = Coordinates(
            [self.get_modified_coordinates1d(coordinates, dim) for dim in coordinates.dims],
            crs=coordinates.crs,
            validate_crs=False,
        )

        for dim in self._modified_coordinates.udims:
            if self._modified_coordinates[dim].size == 0:
                raise ValueError("Modified coordinates do not intersect with source data (dim '%s')" % dim)

        outputs = {}
        outputs["source"] = self.source.eval(self._modified_coordinates, output=output, _selector=_selector)

        if self.substitute_eval_coords:
            dims = outputs["source"].dims
            coords = self._requested_coordinates
            extra_dims = [d for d in coords.dims if d not in dims]
            coords = coords.drop(extra_dims)
            outputs["source"] = outputs["source"].assign_coords(**coords.xcoords)

        if output is None:
            output = outputs["source"]
        else:
            output[:] = outputs["source"]

        if settings["DEBUG"]:
            self._output = output

        return output
class ResampleReduce(UnaryAlgorithm):
    """
    Resample a time-dependent source node using a statistical operation to achieve the result.

    Attributes
    ----------
    custom_reduce_fn : function
        required if reduce_fn is 'custom'.
    resample : str
        datetime sub-accessor. Currently 'dayofyear' is the enabled option.
    reduce_fn : str
        builtin xarray groupby reduce function, or 'custom'.
    source : podpac.Node
        Source node
    """

    _repr_keys = ["source", "resample", "reduce_fn"]
    coordinates_source = NodeTrait(allow_none=True).tag(attr=True)

    # see https://github.com/pydata/xarray/blob/eeb109d9181c84dfb93356c5f14045d839ee64cb/xarray/core/accessors.py#L61
    resample = tl.Unicode().tag(attr=True)
    reduce_fn = tl.CaselessStrEnum(_REDUCE_FUNCTIONS).tag(attr=True)
    custom_reduce_fn = tl.Any(allow_none=True, default_value=None).tag(attr=True)

    _source_coordinates = tl.Instance(Coordinates)

    @tl.default("coordinates_source")
    def _default_coordinates_source(self):
        return self.source

    @common_doc(COMMON_DOC)
    def _eval(self, coordinates, output=None, _selector=None):
        """Evaluates this node using the supplied coordinates.

        Parameters
        ----------
        coordinates : podpac.Coordinates
            {requested_coordinates}
        output : podpac.UnitsDataArray, optional
            {eval_output}
        _selector : callable(coordinates, request_coordinates)
            {eval_selector}

        Returns
        -------
        {eval_return}

        Raises
        ------
        ValueError
            If the source is not time-dependent (required by this node).
        """

        source_output = self.source.eval(coordinates, _selector=_selector)

        # group
        grouped = source_output.resample(time=self.resample)

        # reduce
        if self.reduce_fn == "custom":
            out = grouped.reduce(self.custom_reduce_fn)
        else:
            # standard, e.g. grouped.median('time')
            out = getattr(grouped, self.reduce_fn)()

        if output is None:
            output = podpac.UnitsDataArray(out)
            output.attrs = source_output.attrs
        else:
            output.data[:] = out.data[:]

        ## map
        # eval_time = xr.DataArray(coordinates.coords["time"])
        # E = getattr(eval_time.dt, self.groupby)
        # out = out.sel(**{self.groupby: E}).rename({self.groupby: "time"})
        # output[:] = out.transpose(*output.dims).data

        return output

    @property
    def base_ref(self):
        """
        Default node reference/name in node definitions

        Returns
        -------
        str
            Default node reference/name in node definitions
        """

        return "%s.%s.%s" % (self.source.base_ref, self.resample, self.reduce_fn)
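# Usage sketch (assumption, not from the source): monthly means from a daily time series.
# The resample string follows xarray/pandas frequency aliases, the time step string assumes
# podpac.crange's "value,unit" format, and the source node is illustrative.
import podpac
from podpac.algorithm import SinCoords

coords = podpac.Coordinates(
    [
        podpac.clinspace(0, 10, 11),
        podpac.clinspace(0, 10, 11),
        podpac.crange("2020-01-01", "2020-12-31", "1,D"),
    ],
    dims=["lat", "lon", "time"],
)
node = ResampleReduce(source=SinCoords(), resample="1M", reduce_fn="mean")
monthly = node.eval(coords)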