def json_attributes(self, vfuncs=None):
    """
    Return all global and per-variable netCDF attributes as a JSON-safe dict.

    The result maps ``'global'`` to the file-level attributes and each
    variable name to its attribute dict. Attribute values that are entirely
    NaN are exported as ``None``.

    Parameters
    ----------
    vfuncs : iterable of callables, optional
        Each callable accepts a single argument, the Variable object, and
        returns a dictionary of new attributes to set. These will overwrite
        existing attributes.

    Returns
    -------
    dict
        Round-tripped through ``BasicNumpyEncoder`` so numpy scalars/arrays
        become plain JSON types.
    """
    vfuncs = vfuncs or []

    js = {'global': {}}

    # File-level (global) attributes
    for k in self.ncattrs():
        js['global'][k] = self.getncattr(k)

    for varname, var in self.variables.items():
        js[varname] = {}
        for k in var.ncattrs():
            z = var.getncattr(k)
            try:
                # All-NaN attributes carry no information; export as None.
                # Non-numeric attributes make np.isnan raise TypeError and
                # are passed through unchanged. (Was an `assert` before,
                # which would be stripped under `python -O`.)
                if np.isnan(z).all():
                    js[varname][k] = None
                else:
                    js[varname][k] = z
            except TypeError:
                js[varname][k] = z

        for vf in vfuncs:
            try:
                # BUG FIX: call the current callable `vf`, not the
                # containing list `vfuncs` (which is not callable).
                js[varname].update(vf(var))
            except BaseException:
                logger.exception(
                    "Could not apply custom variable attribute function")

    return json.loads(json.dumps(js, cls=BasicNumpyEncoder))
def from_directory(cls, directory, suffix=".nc", subdirs=True, dimName='time', apply_to_members=None):
    """
    Build a Collection by scanning a directory of netCDF files.

    A ``joinExisting`` NcML aggregation over ``dimName`` is generated that
    points at ``directory`` and is handed to ``pyncml.scan``.

    Parameters
    ----------
    directory : str
        Directory to scan for files ending in ``suffix``.
    suffix : str
        File suffix to match (default ``".nc"``).
    subdirs : bool
        Whether the scan descends into subdirectories.
    dimName : str
        Name of the aggregation dimension (default ``'time'``).
    apply_to_members : optional
        Passed through to ``pyncml.scan``.

    Returns
    -------
    Collection or None
        ``None`` if the directory is missing/unreadable or the scan fails
        (the failure is logged).
    """
    if not os.path.isdir(directory):
        logger.error("Directory {0} does not exist or I do not have the correct permissions to access it".format(directory))
        # BUG FIX: bail out instead of pointlessly scanning a directory we
        # just determined is unusable. The caller still receives None,
        # exactly as it did via the exception path before.
        return None

    # Create NcML pointing to the directory
    ncml = """<?xml version="1.0" encoding="UTF-8"?>
    <netcdf xmlns="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2">
      <aggregation dimName="{0}" type="joinExisting">
        <scan location="{1}" suffix="{2}" subdirs="{3}" />
      </aggregation>
    </netcdf>
    """.format(dimName, directory, suffix, subdirs)

    try:
        return cls(pyncml.scan(ncml, apply_to_members=apply_to_members))
    except BaseException:
        logger.exception("Could not load Collection from Directory.")
def to_dataframe(self, clean_cols=True, clean_rows=True):
    """
    Flatten this trajectory dataset into a pandas DataFrame.

    Columns: ``t``, ``x``, ``y``, ``z``, ``trajectory``, ``distance``
    (cumulative great-circle distance in the units returned by
    ``great_distance``), plus one column per data/ancillary variable.

    Parameters
    ----------
    clean_cols : bool
        Drop columns that contain no data at all.
    clean_rows : bool
        Drop rows where every data variable is masked.

    Returns
    -------
    pandas.DataFrame
    """
    # Z
    zvar = self.z_axes()[0]
    z = np.ma.fix_invalid(np.ma.MaskedArray(zvar[:]))
    z = z.flatten().round(5)
    logger.debug(['z data size: ', z.size])

    # T
    tvar = self.t_axes()[0]
    t = np.ma.MaskedArray(nc4.num2date(tvar[:], tvar.units, getattr(tvar, 'calendar', 'standard'))).flatten()
    # Patch the time variable back to its original mask, since num2date
    # breaks any missing/fill values
    if hasattr(tvar[0], 'mask'):
        t.mask = tvar[:].mask
    logger.debug(['time data size: ', t.size])

    # X
    xvar = self.x_axes()[0]
    x = np.ma.fix_invalid(np.ma.MaskedArray(xvar[:])).flatten().round(5)
    logger.debug(['x data size: ', x.size])

    # Y
    yvar = self.y_axes()[0]
    y = np.ma.fix_invalid(np.ma.MaskedArray(yvar[:])).flatten().round(5)
    logger.debug(['y data size: ', y.size])

    # Trajectories
    pvar = self.get_variables_by_attributes(cf_role='trajectory_id')[0]
    try:
        p = normalize_array(pvar)
    except BaseException:
        logger.exception('Could not pull trajectory values from the variable, using indexes.')
        # BUG FIX: `dtype=np.integer` is an abstract type and is rejected by
        # modern NumPy; `int` resolves to the default integer dtype.
        p = np.asarray(list(range(len(pvar))), dtype=int)

    # The Dimension that the trajectory id variable doesn't have is what
    # the trajectory data needs to be repeated by
    dim_diff = self.dimensions[list(set(tvar.dimensions).difference(set(pvar.dimensions)))[0]]
    if dim_diff:
        p = p.repeat(dim_diff.size)
    logger.debug(['trajectory data size: ', p.size])

    # Distance (cumulative, along-track)
    d = np.append([0], great_distance(start_latitude=y[0:-1], end_latitude=y[1:], start_longitude=x[0:-1], end_longitude=x[1:])['distance'])
    d = np.ma.fix_invalid(np.ma.MaskedArray(np.cumsum(d)).astype(np.float64).round(2))
    logger.debug(['distance data size: ', d.size])

    df_data = {
        't': t,
        'x': x,
        'y': y,
        'z': z,
        'trajectory': p,
        'distance': d
    }

    # A row is droppable only if EVERY data variable is masked there.
    building_index_to_drop = np.ones(t.size, dtype=bool)
    extract_vars = list(set(self.data_vars() + self.ancillary_vars()))
    for dvar in extract_vars:
        vdata = np.ma.fix_invalid(np.ma.MaskedArray(dvar[:].round(3).flatten()))
        building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa
        df_data[dvar.name] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def from_glob(cls, glob_string, timevar_name='time', ncml=None):
    """
    Build a Collection by aggregating every file matched by ``glob_string``.

    Each member file is opened (optionally with ``ncml`` applied), its time
    extent and standard_names are recorded, and the overall dataset
    start/end/variable set is accumulated. Files without the time variable,
    or that fail to open, are logged and skipped.

    Parameters
    ----------
    glob_string : str
        Shell glob selecting the member files.
    timevar_name : str
        Name of the time variable in each file (default ``'time'``).
    ncml : optional
        NcML to apply to each member before reading.

    Returns
    -------
    Collection
        Members are sorted by their starting time.
    """
    dataset_name = None
    dataset_starting = None
    dataset_ending = None
    dataset_variables = []
    dataset_members = []

    files = glob(glob_string)
    logger.info("Processing aggregation containing {!s} files".format(len(files)))
    for i, filepath in enumerate(files):
        logger.info("Processing member ({0}/{1}) - {2} ".format(i+1, len(files), filepath))
        nc = None
        try:
            if ncml is not None:
                # Apply NcML through a temporary output file
                tmp_f, tmp_fp = tempfile.mkstemp(prefix="nc")
                os.close(tmp_f)
                nc = pyncml.apply(filepath, ncml, output_file=tmp_fp)
            else:
                nc = netCDF4.Dataset(filepath)

            # First member that carries a name/title names the dataset
            if dataset_name is None:
                if hasattr(nc, 'name'):
                    dataset_name = nc.name
                elif hasattr(nc, 'title'):
                    dataset_name = nc.title
                else:
                    dataset_name = "Pyaxiom Glob Dataset"

            timevar = nc.variables.get(timevar_name)
            if timevar is None:
                logger.error("Time variable '{0}' was not found in file '{1}'. Skipping.".format(timevar_name, filepath))
                continue

            # Start/Stop of NetCDF file
            starting = netCDF4.num2date(np.min(timevar[:]), units=timevar.units)
            ending = netCDF4.num2date(np.max(timevar[:]), units=timevar.units)

            # BUG FIX: the old `filter(None, [...])` is a lazy iterator in
            # Python 3 — concatenating it to a list raised TypeError and it
            # would have been exhausted before being stored on the member.
            # Materialize the truthy standard_names as a list instead.
            variables = [
                nc.variables[v].standard_name
                for v in nc.variables.keys()
                if getattr(nc.variables[v], 'standard_name', None)
            ]
            dataset_variables = list(set(dataset_variables + variables))

            # Normalize naive datetimes to UTC before comparing
            if starting.tzinfo is None:
                starting = starting.replace(tzinfo=pytz.utc)
            if ending.tzinfo is None:
                ending = ending.replace(tzinfo=pytz.utc)
            if dataset_starting is None or starting < dataset_starting:
                dataset_starting = starting
            if dataset_ending is None or ending > dataset_ending:
                dataset_ending = ending

            member = DotDict(path=filepath, standard_names=variables, starting=starting, ending=ending)
            dataset_members.append(member)
        except BaseException:
            logger.exception("Something went wrong with {0}".format(filepath))
            continue
        finally:
            # BUG FIX: `nc` is None when opening the file itself failed;
            # calling close() unconditionally raised AttributeError and
            # masked the real error.
            if nc is not None:
                nc.close()
            try:
                os.remove(tmp_fp)
            except (OSError, UnboundLocalError):
                pass

    dataset_members = sorted(dataset_members, key=operator.attrgetter('starting'))
    return cls(DotDict(name=dataset_name,
                       timevar_name=timevar_name,
                       starting=dataset_starting,
                       ending=dataset_ending,
                       standard_names=dataset_variables,
                       members=dataset_members))
def from_ncml_file(cls, ncml_path, apply_to_members=None):
    """
    Build a Collection from an NcML document on disk.

    Parameters
    ----------
    ncml_path : str
        Path to the NcML file to read and scan.
    apply_to_members : optional
        Passed through to ``pyncml.scan``.

    Returns
    -------
    Collection or None
        ``None`` if reading or scanning fails (the failure is logged).
    """
    try:
        with open(ncml_path) as ncml_file:
            ncml_contents = ncml_file.read()
        scanned = pyncml.scan(ncml_contents, apply_to_members=apply_to_members)
        return cls(scanned)
    except BaseException:
        logger.exception("Could not load Collection from NcML. Please check the NcML.")
def from_glob(cls, glob_string, timevar_name='time', ncml=None):
    """
    Build a Collection by aggregating every file matched by ``glob_string``.

    Each member file is opened (optionally with ``ncml`` applied), its time
    extent and standard_names are recorded, and the overall dataset
    start/end/variable set is accumulated. Files without the time variable,
    or that fail to open, are logged and skipped.

    Parameters
    ----------
    glob_string : str
        Shell glob selecting the member files.
    timevar_name : str
        Name of the time variable in each file (default ``'time'``).
    ncml : optional
        NcML to apply to each member before reading.

    Returns
    -------
    Collection
        Members are sorted by their starting time.
    """
    dataset_name = None
    dataset_starting = None
    dataset_ending = None
    dataset_variables = []
    dataset_members = []

    files = glob(glob_string)
    logger.info("Processing aggregation containing {!s} files".format(len(files)))
    for i, filepath in enumerate(files):
        logger.info("Processing member ({0}/{1}) - {2} ".format(i+1, len(files), filepath))
        nc = None
        try:
            if ncml is not None:
                # Apply NcML through a temporary output file
                tmp_f, tmp_fp = tempfile.mkstemp(prefix="nc")
                os.close(tmp_f)
                nc = pyncml.apply(filepath, ncml, output_file=tmp_fp)
            else:
                nc = netCDF4.Dataset(filepath)

            # First member that carries a name/title names the dataset
            if dataset_name is None:
                if 'name' in nc.ncattrs():
                    dataset_name = nc.name
                elif 'title' in nc.ncattrs():
                    dataset_name = nc.title
                else:
                    dataset_name = "Pyaxiom Glob Dataset"

            timevar = nc.variables.get(timevar_name)
            if timevar is None:
                logger.error("Time variable '{0}' was not found in file '{1}'. Skipping.".format(timevar_name, filepath))
                continue

            # Start/Stop of NetCDF file
            starting = netCDF4.num2date(np.min(timevar[:]), units=timevar.units)
            ending = netCDF4.num2date(np.max(timevar[:]), units=timevar.units)

            # Collect the truthy standard_names in one pass (replaces the
            # old build-then-filter double comprehension).
            variables = [
                nc.variables[v].standard_name
                for v in nc.variables.keys()
                if getattr(nc.variables[v], 'standard_name', None)
            ]
            dataset_variables = list(set(dataset_variables + variables))

            # Normalize naive datetimes to UTC before comparing
            if starting.tzinfo is None:
                starting = starting.replace(tzinfo=pytz.utc)
            if ending.tzinfo is None:
                ending = ending.replace(tzinfo=pytz.utc)
            if dataset_starting is None or starting < dataset_starting:
                dataset_starting = starting
            if dataset_ending is None or ending > dataset_ending:
                dataset_ending = ending

            member = DotDict(path=filepath, standard_names=variables, starting=starting, ending=ending)
            dataset_members.append(member)
        except BaseException:
            logger.exception("Something went wrong with {0}".format(filepath))
            continue
        finally:
            # BUG FIX: `nc` is None when opening the file itself failed;
            # calling close() unconditionally raised AttributeError and
            # masked the real error.
            if nc is not None:
                nc.close()
            try:
                os.remove(tmp_fp)
            except (OSError, UnboundLocalError):
                pass

    dataset_members = sorted(dataset_members, key=operator.attrgetter('starting'))
    return cls(DotDict(name=dataset_name,
                       timevar_name=timevar_name,
                       starting=dataset_starting,
                       ending=dataset_ending,
                       standard_names=dataset_variables,
                       members=dataset_members))
def add_variable(self, variable_name, values, times=None, verticals=None, sensor_vertical_datum=None, attributes=None, unlink_from_profile=None, fillvalue=None, raise_on_error=False):
    """
    Create a netCDF data variable on this station file and fill it.

    ``values`` is reshaped to (time,) or (time, z) where possible; when a
    simple reshape fails, values are re-indexed manually against ``times``
    and ``verticals`` using bisection (slow path).

    Parameters
    ----------
    variable_name : str
        Name of the variable to create in ``self.nc``.
    values : array-like
        Data values; list/tuple input is converted to an ndarray.
    times, verticals : array-like, optional
        Coordinates used only on the manual re-indexing slow path.
    sensor_vertical_datum : str, optional
        If given, also stamped onto the CRS variable.
    attributes : dict, optional
        Extra attributes to set on the new variable (``_FillValue`` skipped).
    unlink_from_profile : bool, optional
        Treat the sensor as time-only (bottom sensor etc.); also creates the
        special ``sensor_depth`` metadata variable.
    fillvalue : float, optional
        Fill value for the new variable (default -9999.9).
    raise_on_error : bool
        Re-raise reshape failures instead of attempting the manual match.

    Returns
    -------
    netCDF4.Variable
        The created and filled variable.

    Raises
    ------
    ValueError
        When the data cannot be matched to the time/height axes.
    """
    if isinstance(values, (list, tuple,)) and values:
        values = np.asarray(values)
    if isinstance(times, (list, tuple,)) and times:
        times = np.asarray(times)
    if isinstance(verticals, (list, tuple,)) and verticals:
        verticals = np.asarray(verticals)

    # Set vertical datum on the CRS variable
    if sensor_vertical_datum is not None:
        try:
            self.crs.geoid_name = sensor_vertical_datum
            self.crs.vertical_datum = sensor_vertical_datum
            self.crs.water_surface_reference_datum = sensor_vertical_datum
        except AttributeError:
            pass

    # Set default fillvalue for new variables
    if fillvalue is None:
        fillvalue = -9999.9

    used_values = None
    try:
        if unlink_from_profile is True:
            used_values = np.ma.reshape(values, (self.time.size, ))
            used_values = used_values[self.time_indexes]
        # These next two cases should work for all but a few cases, which are caught below
        elif self.z.size == 1:
            used_values = np.ma.reshape(values, (self.time.size, ))
            used_values = used_values[self.time_indexes]
        else:
            used_values = np.ma.reshape(values, (self.time.size, self.z.size, ))
            used_values = used_values[self.time_indexes]
            try:
                used_values = used_values[:, self.vertical_indexes]
            except IndexError:
                # The vertical values most likely had duplicates. Ignore the
                # faulty index here and try to save the values as is.
                pass
    except ValueError:
        if raise_on_error is True:
            self.close()
            raise
        else:
            logger.exception("Could not do a simple reshape of data, trying to match manually! Time:{!s}, Heights:{!s}, Values:{!s}".format(self.time.size, self.z.size, values.size))
            if self.z.size > 1:
                if times is not None and verticals is not None:
                    # Hmmm, we have two actual height values for this station.
                    # Not cool man, not cool.
                    # Reindex the entire values array. This is slow.
                    # BUG FIX: `xrange` does not exist in Python 3; use `range`.
                    indexed = ((bisect.bisect_left(self.time[:], times[i]), bisect.bisect_left(self.z[:], verticals[i]), values[i]) for i in range(values.size))
                    used_values = np.ndarray((self.time.size, self.z.size, ), dtype=np.float64)
                    used_values.fill(float(fillvalue))
                    for (tzi, zzi, vz) in indexed:
                        if zzi < self.z.size and tzi < self.time.size:
                            used_values[tzi, zzi] = vz
                else:
                    self.close()
                    raise ValueError("You need to pass in both 'times' and 'verticals' parameters that matches the size of the 'values' parameter.")
            else:
                if times is not None:
                    # Ugh, find the time indexes manually
                    # BUG FIX: `xrange` does not exist in Python 3; use `range`.
                    indexed = ((bisect.bisect_left(self.time[:], times[i]), values[i]) for i in range(values.size))
                    used_values = np.ndarray((self.time.size, ), dtype=np.float64)
                    used_values.fill(float(fillvalue))
                    for (tzi, vz) in indexed:
                        if tzi < self.time.size:
                            used_values[tzi] = vz
                else:
                    self.close()
                    raise ValueError("You need to pass in a 'times' parameter that matches the size of the 'values' parameter.")

    logger.info("Setting values for {}...".format(variable_name))
    if len(used_values.shape) == 1:
        var = self.nc.createVariable(variable_name, "f8", ("time",), fill_value=fillvalue, chunksizes=(1000,), zlib=True)
        if self.z.size == 1:
            var.coordinates = "{} {} latitude longitude".format(self.time_axis_name, self.vertical_axis_name)
        else:
            # This is probably a bottom sensor on an ADCP or something, don't add the height coordinate
            var.coordinates = "time latitude longitude"
            if unlink_from_profile is True:
                # Create metadata variable for the sensor_depth
                if self.nc.variables.get('sensor_depth') is None:
                    logger.info("Setting the special case 'sensor_depth' metadata variable")
                    inst_depth = self.nc.createVariable('sensor_depth', 'f4')
                    inst_depth.units = 'm'
                    inst_depth.standard_name = 'surface_altitude'
                    inst_depth.long_name = 'sensor depth below datum'
                    inst_depth.positive = self.vertical_positive
                    inst_depth.datum = sensor_vertical_datum or 'Unknown'
                    inst_depth[:] = verticals[0] * -1
    elif len(used_values.shape) == 2:
        var = self.nc.createVariable(variable_name, "f8", ("time", "z",), fill_value=fillvalue, chunksizes=(1000, self.z.size,), zlib=True)
        var.coordinates = "{} {} latitude longitude".format(self.time_axis_name, self.vertical_axis_name)
    else:
        raise ValueError("Could not create variable. Shape of data is {!s}. Expected a dimension of 1 or 2, not {!s}.".format(used_values.shape, len(used_values.shape)))

    # Set the variable attributes as passed in
    if attributes:
        # BUG FIX: dict.iteritems() was removed in Python 3; use items().
        for k, v in attributes.items():
            if k == 'vertical_datum' and sensor_vertical_datum is None and v is not None:
                # Use this as the vertical datum if it is specified and we didn't already have one
                try:
                    self.crs.geoid_name = v
                    self.crs.vertical_datum = v
                    self.crs.water_surface_reference_datum = v
                except AttributeError:
                    pass
            if k != '_FillValue' and v is not None:
                try:
                    setattr(var, k, v)
                except BaseException:
                    logger.info('Could not add attribute {}: {}, skipping.'.format(k, v))

    var.grid_mapping = 'crs'
    var[:] = used_values
    return var