def eval(self, coordinates, output=None):
    """Evaluates this node using the supplied coordinates.

    Parameters
    ----------
    coordinates : podpac.Coordinates
        {requested_coordinates}
    output : podpac.UnitsDataArray, optional
        {eval_output}

    Returns
    -------
    {eval_return}
    """
    self._requested_coordinates = coordinates

    # evaluate every input node at the requested coordinates
    inputs = {}
    for key, node in self._inputs.items():
        inputs[key] = node.eval(coordinates)

    # accumulate output coordinates
    coords_list = [Coordinates.from_xarray(a.coords) for a in inputs.values()]
    output_coordinates = union([coordinates] + coords_list)

    result = self.algorithm(inputs)

    # NOTE: UnitsDataArray subclasses xr.DataArray, so it must be tested
    # first; testing xr.DataArray first would shadow this branch and rebuild
    # the output array unnecessarily (dropping its attrs).
    if isinstance(result, UnitsDataArray):
        if output is None:
            output = result
        else:
            output[:] = result
    elif isinstance(result, xr.DataArray):
        if output is None:
            output = self.create_output_array(Coordinates.from_xarray(result.coords), data=result.data)
        else:
            output[:] = result.data
    elif isinstance(result, np.ndarray):
        # raw array: assume it is aligned with the accumulated coordinates
        if output is None:
            output = self.create_output_array(output_coordinates, data=result)
        else:
            output.data[:] = result
    else:
        # message kept consistent with the other Algorithm._eval in this file
        raise NodeException("algorithm returned unsupported type '%s'" % type(result))

    return output
def composite(self, coordinates, data_arrays, result=None):
    """Composites data_arrays in the order they appear. Once a request contains no
    nans, the result is returned.

    Parameters
    ----------
    coordinates : :class:`podpac.Coordinates`
        {requested_coordinates}
    data_arrays : generator
        Evaluated source data, in the same order as the sources.
    result : podpac.UnitsDataArray, optional
        {eval_output}

    Returns
    -------
    {eval_return} This composites the sources together until there are no nans or no
        more sources.
    """
    # TODO: Fix boundary information on the combined data arrays
    # successive combine_first calls fill nans in earlier sources with later ones
    res = next(data_arrays)
    for arr in data_arrays:
        res = res.combine_first(arr)

    res = UnitsDataArray(res)
    coords = Coordinates.from_xarray(res.coords)
    res.attrs["bounds"] = coords.bounds

    if result is not None:
        # BUGFIX: was `res.transponse(...)` (typo), which raised AttributeError
        # whenever a preallocated result array was supplied
        result.data[:] = res.transpose(*result.dims).data
        return result
    return res
def eval(self, coordinates, **kwargs):
    """Evaluates this node at the supplied coordinates, mirroring the output into a zarr array.

    Sets up the zarr array shape/chunks, writes the (non-chunked) coordinates into
    the zarr store, delegates the actual evaluation to the superclass eval, then
    records the final output coordinates in the store.

    Parameters
    ----------
    coordinates : podpac.Coordinates
        Coordinates at which to evaluate.
    **kwargs
        Only ``output`` is read: an optional preallocated output array forwarded to super().eval.

    Returns
    -------
    The superclass eval output, or the zarr file handle when that output is None.
    """
    output = kwargs.get("output")

    # shape comes from the request unless an explicit zarr shape was configured
    if self.zarr_shape is None:
        self._shape = coordinates.shape
    else:
        self._shape = tuple(self.zarr_shape.values())

    # initialize zarr file
    if self.zarr_chunks is None:
        chunks = [self.chunks[d] for d in coordinates]
    else:
        chunks = [self.zarr_chunks[d] for d in coordinates]
    self._chunks = chunks
    zf, data_key, zn = self.initialize_zarr_array(self._shape, chunks)
    self.dataset = zf
    self.zarr_data_key = data_key
    self.zarr_node = zn
    # NOTE(review): attribute access with no effect — presumably forces lazy
    # initialization of the zarr node; confirm whether this can be removed
    zn.keys

    # eval
    _log.debug("Starting parallel eval.")
    # dims that are not chunked; their coordinates can be written up front
    missing_dims = [d for d in coordinates.dims if d not in self.chunks.keys()]
    if self.zarr_coordinates is not None:
        missing_dims = missing_dims + [d for d in self.zarr_coordinates.dims if d not in missing_dims]
        set_coords = merge_dims([coordinates.drop(missing_dims), self.zarr_coordinates])
    else:
        set_coords = coordinates.drop(missing_dims)
    # NOTE(review): transpose's return value is discarded here — confirm whether
    # this was meant to be assigned back to set_coords
    set_coords.transpose(*coordinates.dims)

    self.set_zarr_coordinates(set_coords, data_key)
    if self.list_dir:
        dk = data_key
        # data_key may be a list of keys; list the directory of the first one
        if isinstance(dk, list):
            dk = dk[0]
        self._list_dir = self.zarr_node.list_dir(dk)

    output = super(ZarrOutputMixin, self).eval(coordinates, output=output)

    # fill in the coordinates, this is guaranteed to be correct even if the user messed up.
    if output is not None:
        self.set_zarr_coordinates(Coordinates.from_xarray(output), data_key)
    else:
        # no in-memory output was produced; hand back the zarr file handle instead
        return zf
    return output
def eval(self, coordinates, **kwargs):
    """
    Wraps the super Node.eval method in order to cache with the correct coordinates.

    The output is independent of the crs or any extra dimensions, so this transforms and removes extra
    dimensions before caching in the super eval method.

    Parameters
    ----------
    coordinates : podpac.Coordinates
        Requested coordinates; must include every dim present in self.coordinates.
    **kwargs
        Forwarded to super().eval (e.g. ``output``).

    Returns
    -------
    podpac.UnitsDataArray
        The evaluated output, transformed back to the requested crs when necessary.

    Raises
    ------
    ValueError
        If the requested coordinates are missing a dim that this node's coordinates require.
    """
    # check for missing dimensions
    for c in self.coordinates.values():
        if isinstance(c, Coordinates1d):
            if c.name not in coordinates.udims:
                raise ValueError("Cannot evaluate these coordinates, missing dim '%s'" % c.name)
        elif isinstance(c, StackedCoordinates):
            # stacked coords only need one of their unstacked dims to be present
            if all(dim not in coordinates.udims for dim in c.udims):
                raise ValueError("Cannot evaluate these coordinates, missing at least one dim in '%s'" % c.name)

    # store original requested coordinates
    requested_coordinates = coordinates

    # remove extra dimensions
    extra = [
        c.name
        for c in coordinates.values()
        if (isinstance(c, Coordinates1d) and c.name not in self.udims)
        or (isinstance(c, StackedCoordinates) and all(dim not in self.udims for dim in c.dims))
    ]
    coordinates = coordinates.drop(extra)

    # transform coordinates into native crs if different
    if coordinates.crs.lower() != self._crs.lower():
        coordinates = coordinates.transform(self._crs)

    # note: super().eval (not self._eval)
    # This call already sub-selects an 'output' if specified
    output = super().eval(coordinates, **kwargs)

    # transform back to requested coordinates, if necessary
    if coordinates.crs.lower() != requested_coordinates.crs.lower():
        # need to use the already-selected output, if it exists
        try:
            outputs = output["output"].data.tolist()
            if isinstance(outputs, str):
                # this will pass outputs=None to the create function, which is what we want in this case
                # which is when it is a single output (not a dim)
                outputs = []
        except KeyError:
            # 'output' does not exist in the data, so outputs should be empty
            outputs = []
        except Exception as e:
            # NOTE(review): broad fallback (e is unused) — presumably covers
            # non-KeyError indexing failures; confirm which exceptions occur here
            outputs = self.outputs

        coords = Coordinates.from_xarray(output, crs=output.attrs.get("crs", None))
        # rebuild the output on the originally-requested crs with the computed values
        output = self.create_output_array(
            coords.transform(requested_coordinates.crs), data=output.data, outputs=outputs
        )

    if settings["DEBUG"]:
        self._requested_coordinates = requested_coordinates

    return output
def _eval(self, coordinates, output=None, _selector=None):
    """Evaluates this node using the supplied coordinates.

    Parameters
    ----------
    coordinates : podpac.Coordinates
        {requested_coordinates}
    output : podpac.UnitsDataArray, optional
        {eval_output}
    _selector: callable(coordinates, request_coordinates)
        {eval_selector}

    Returns
    -------
    {eval_return}
    """
    self._requested_coordinates = coordinates
    inputs = {}

    if settings["MULTITHREADING"]:
        n_threads = thread_manager.request_n_threads(len(self.inputs))
        if n_threads == 1:
            # a single extra thread buys nothing; give it back and run serially
            thread_manager.release_n_threads(n_threads)
    else:
        n_threads = 0

    if settings["MULTITHREADING"] and n_threads > 1:
        # Create a function for each thread to execute asynchronously
        def f(node):
            return node.eval(coordinates, _selector=_selector)

        # Create pool of size n_threads, note, this may be created from a sub-thread (i.e. not the main thread)
        pool = thread_manager.get_thread_pool(processes=n_threads)

        # Evaluate nodes in parallel/asynchronously
        results = [pool.apply_async(f, [node]) for node in self.inputs.values()]

        # Collect the results in dictionary
        for key, res in zip(self.inputs.keys(), results):
            inputs[key] = res.get()

        # This prevents any more tasks from being submitted to the pool, and will close the workers once done
        pool.close()

        # Release these number of threads back to the thread pool
        thread_manager.release_n_threads(n_threads)
        self._multi_threaded = True
    else:
        # Evaluate nodes in serial
        for key, node in self.inputs.items():
            inputs[key] = node.eval(coordinates, output=output, _selector=_selector)
        self._multi_threaded = False

    result = self.algorithm(inputs, coordinates)

    if not isinstance(result, xr.DataArray):
        raise NodeException("algorithm returned unsupported type '%s'" % type(result))

    # sub-select a single output variable when configured
    if "output" in result.dims and self.output is not None:
        result = result.sel(output=self.output)

    if output is not None:
        missing = [dim for dim in result.dims if dim not in output.dims]
        if any(missing):
            raise NodeException("provided output is missing dims %s" % missing)

        # align the provided output's dims with the result's, copy the data,
        # then restore the caller's original dim order
        output_dims = output.dims
        output = output.transpose(..., *result.dims)
        output[:] = result.data
        output = output.transpose(*output_dims)
    elif isinstance(result, UnitsDataArray):
        output = result
    else:
        # plain xr.DataArray: wrap it in a fresh output array
        output_coordinates = Coordinates.from_xarray(result)
        output = self.create_output_array(output_coordinates, data=result.data)

    return output
def eval(self, coordinates, **kwargs):
    """Evaluates this node in parallel by splitting the coordinates into chunks.

    Each chunk is submitted to a thread pool via ``self.eval_source``; results are
    collected in submission order and, when ``self.fill_output`` is set, written
    into the output array at the chunk's slice. Failed chunks are recorded in
    ``self.errors`` and skipped.

    Parameters
    ----------
    coordinates : podpac.Coordinates
        Coordinates at which to evaluate.
    **kwargs
        Only ``output`` is read: an optional preallocated output array.

    Returns
    -------
    podpac.UnitsDataArray or None
        The filled output array, or None when fill_output is False and no output was supplied.
    """
    output = kwargs.get("output")
    # Make a thread pool to manage queue
    pool = ThreadPool(processes=self.number_of_workers)

    if output is None and self.fill_output:
        output = self.create_output_array(coordinates)

    # chunk shape: configured chunk size per chunked dim, full size otherwise
    shape = []
    for d in coordinates.dims:
        if d in self.chunks:
            shape.append(self.chunks[d])
        else:
            shape.append(coordinates[d].size)

    results = []
    # inputs = []
    i = 0
    for coords, slc in coordinates.iterchunks(shape, True):
        # inputs.append(coords)
        if i < self.start_i:
            _log.debug("Skipping {} since it is less than self.start_i ({})".format(i, self.start_i))
            i += 1
            continue

        out = None
        if self.fill_output and output is not None:
            out = output[slc]
        with self._lock:
            _log.debug("Added {} to worker pool".format(i))
            _log.debug("Node eval with coords: {}, {}".format(slc, coords))
            results.append(pool.apply_async(self.eval_source, [coords, slc, out, i]))
        i += 1

    _log.info("Added all chunks to worker pool. Now waiting for results.")
    start_time = time.time()
    for i, res in enumerate(results):
        # _log.debug('Waiting for results: {} {}'.format(i, inputs[i]))
        dt = str(np.timedelta64(int(1000 * (time.time() - start_time)), "ms").astype(object))
        _log.info("({}): Waiting for results: {} / {}".format(dt, i + 1, len(results)))

        # Try to get the results / wait for the results
        try:
            o, slc = res.get()
        except Exception as e:
            o = None
            slc = None
            self.errors.append((i, res, e))
            dt = str(np.timedelta64(int(1000 * (time.time() - start_time)), "ms").astype(object))
            _log.warning("({}) {} failed with exception {}".format(dt, i, e))

        dt = str(np.timedelta64(int(1000 * (time.time() - start_time)), "ms").astype(object))
        # BUGFIX: log the formatted elapsed time (dt) instead of raw seconds,
        # which was computed immediately above but never used
        _log.info("({}) Finished result: {} / {}".format(dt, i + 1, len(results)))

        # Fill output
        # BUGFIX: skip failed chunks (o/slc are None after an exception);
        # previously this fell through and crashed on output[None] = None or
        # Coordinates.from_xarray(None)
        if self.fill_output and o is not None:
            if output is None:
                # build the full output coordinates from the request plus the
                # chunked dims of the first successful result
                missing_dims = [d for d in coordinates.dims if d not in self.chunks.keys()]
                coords = coordinates.drop(missing_dims)
                missing_coords = Coordinates.from_xarray(o).drop(list(self.chunks.keys()))
                coords = merge_dims([coords, missing_coords])
                coords = coords.transpose(*coordinates.dims)
                output = self.create_output_array(coords)
            output[slc] = o

    _log.info("Completed parallel execution.")
    pool.close()

    return output
def _eval(self, coordinates, output=None, _selector=None):
    """Evaluates this node using the supplied coordinates.

    Parameters
    ----------
    coordinates : podpac.Coordinates
        {requested_coordinates}
    output : podpac.UnitsDataArray, optional
        {eval_output}
    _selector: callable(coordinates, request_coordinates)
        {eval_selector}

    Returns
    -------
    {eval_return}
    """
    self._requested_coordinates = coordinates
    inputs = {}

    if settings["MULTITHREADING"]:
        n_threads = thread_manager.request_n_threads(len(self.inputs))
        if n_threads == 1:
            # a single extra thread buys nothing; give it back and run serially
            thread_manager.release_n_threads(n_threads)
    else:
        n_threads = 0

    if settings["MULTITHREADING"] and n_threads > 1:
        # Create a function for each thread to execute asynchronously
        def f(node):
            return node.eval(coordinates, _selector=_selector)

        # Create pool of size n_threads, note, this may be created from a sub-thread (i.e. not the main thread)
        pool = thread_manager.get_thread_pool(processes=n_threads)

        # Evaluate nodes in parallel/asynchronously
        results = [pool.apply_async(f, [node]) for node in self.inputs.values()]

        # Collect the results in dictionary
        for key, res in zip(self.inputs.keys(), results):
            inputs[key] = res.get()

        # This prevents any more tasks from being submitted to the pool, and will close the workers once done
        pool.close()

        # Release these number of threads back to the thread pool
        thread_manager.release_n_threads(n_threads)
        self._multi_threaded = True
    else:
        # Evaluate nodes in serial
        for key, node in self.inputs.items():
            inputs[key] = node.eval(coordinates, output=output, _selector=_selector)
        self._multi_threaded = False

    # accumulate output coordinates
    coords_list = [Coordinates.from_xarray(a.coords, crs=a.attrs.get("crs")) for a in inputs.values()]
    output_coordinates = union([coordinates] + coords_list)

    result = self.algorithm(inputs)

    # UnitsDataArray subclasses xr.DataArray, so it is tested first
    if isinstance(result, UnitsDataArray):
        if output is None:
            output = result
        else:
            output[:] = result.data[:]
    elif isinstance(result, xr.DataArray):
        if output is None:
            output = self.create_output_array(
                Coordinates.from_xarray(result.coords, crs=result.attrs.get("crs")), data=result.data
            )
        else:
            output[:] = result.data
    elif isinstance(result, np.ndarray):
        # raw array: assume it is aligned with the accumulated coordinates
        if output is None:
            output = self.create_output_array(output_coordinates, data=result)
        else:
            output.data[:] = result
    else:
        # BUGFIX: was a bare `raise NodeException`; message made consistent
        # with the other Algorithm._eval implementation in this file
        raise NodeException("algorithm returned unsupported type '%s'" % type(result))

    # sub-select a single output variable when configured
    if "output" in output.dims and self.output is not None:
        output = output.sel(output=self.output)

    return output
def _eval(self, coordinates, output=None, _selector=None):
    """Evaluates this node using the supplied coordinates.

    The coordinates are mapped to the requested coordinates, interpolated if necessary, and
    set to `_requested_source_coordinates` with associated index `_requested_source_coordinates_index`.
    The requested source coordinates and index are passed to `get_data()` returning the source data
    at the coordinates, set to `_requested_source_data`. Finally `_requested_source_data` is interpolated
    using the `interpolate` method and set to the `output` attribute of the node.

    Parameters
    ----------
    coordinates : :class:`podpac.Coordinates`
        {requested_coordinates}

        An exception is raised if the requested coordinates are missing dimensions in the
        DataSource. Extra dimensions in the requested coordinates are dropped.
    output : :class:`podpac.UnitsDataArray`, optional
        {eval_output}
    _selector :
        {eval_selector}

    Returns
    -------
    {eval_return}

    Raises
    ------
    ValueError
        Cannot evaluate these coordinates
    """
    _logger.debug("Evaluating {} data source".format(self.__class__.__name__))

    # store requested coordinates for debugging
    if settings["DEBUG"]:
        self._original_requested_coordinates = coordinates

    # store input coordinates to evaluated coordinates
    # (deepcopy so later reassignments of `coordinates` do not affect it)
    self._evaluated_coordinates = deepcopy(coordinates)

    # reset interpolation
    self._set_interpolation()

    selector = self._interpolation.select_coordinates

    source_out = self._source_eval(self._evaluated_coordinates, selector)
    source_coords = Coordinates.from_xarray(source_out.coords, crs=source_out.crs)

    # Drop extra coordinates
    extra_dims = [d for d in coordinates.udims if d not in source_coords.udims]
    coordinates = coordinates.drop(extra_dims)

    # Transform so that interpolation happens on the source data coordinate system
    if source_coords.crs.lower() != coordinates.crs.lower():
        coordinates = coordinates.transform(source_coords.crs)

    if output is None:
        if "output" in source_out.dims:
            # propagate the source's output-variable names onto this node
            self.set_trait("outputs", source_out.coords["output"].data.tolist())
        output = self.create_output_array(coordinates)

    if source_out.size == 0:  # short cut
        return output

    # interpolate data into output
    output = self._interpolation.interpolate(source_coords, source_out, coordinates, output)

    # if requested crs is different than coordinates,
    # fabricate a new output with the original coordinates and new values
    # NOTE(review): this comparison is case-sensitive, unlike the .lower()
    # comparison above — confirm whether that is intentional
    if self._evaluated_coordinates.crs != coordinates.crs:
        output = self.create_output_array(self._evaluated_coordinates.drop(extra_dims), data=output[:].values)

    # save output to private for debugging
    if settings["DEBUG"]:
        self._output = output
        self._source_xr = source_out

    return output