def test_normalization_of_string_arrays_netcdf3(self):
    """Round-trip character-array string variables through a NETCDF3 file
    and check that ``normalize_array`` reconstructs the original strings.

    NETCDF3_CLASSIC has no variable-length string type, so strings must be
    stored as 'S1' character arrays: 1D for a single string, 2D for an
    array of strings.
    """
    sample = 'boodsfasfasdfm'
    width = len(sample)

    with nc4.Dataset(self.fp, 'w', format="NETCDF3_CLASSIC") as ncd:
        ncd.createDimension('n', width)

        # A single string stored as a 1D character array
        ncd.createVariable('single_S', 'S1', ('n',))
        for name, var in ncd.variables.items():
            if name.startswith('single_'):
                var[:] = nc4.stringtoarr(sample, width)

        # An array of strings stored as a 2D character array
        ncd.createVariable('many_S', 'S1', ('n', 'n',))
        for name, var in ncd.variables.items():
            if name.startswith('many_'):
                var[:, :] = np.tile(
                    nc4.stringtoarr(sample, width), width
                ).reshape(var.shape)

    with nc4.Dataset(self.fp) as ncd:
        assert normalize_array(ncd.variables['single_S']) == sample
        assert np.all(normalize_array(ncd.variables['many_S']) == [sample] * width)
def is_mine(cls, dsg, strict=False):
    """Return True when *dsg* matches the ragged timeseries-profile layout.

    Checks the featureType, the presence of all four coordinate axes, a
    sample-count ("row size") variable, a profile id variable, and — when
    the station id variable is an array — a station index variable.

    When ``strict`` is True the first failing check is re-raised instead
    of returning False.
    """
    try:
        assert dsg.featureType.lower() == 'timeseriesprofile'

        # Every coordinate axis must have at least one candidate variable
        for axes in (dsg.t_axes(), dsg.x_axes(), dsg.y_axes(), dsg.z_axes()):
            assert len(axes) >= 1

        # Exactly one count variable pointing at an existing sample dimension
        count_vars = dsg.filter_by_attrs(
            sample_dimension=lambda x: x is not None
        )
        assert len(count_vars) == 1
        assert count_vars[0].sample_dimension in dsg.dimensions  # Sample dimension

        # A profile id variable must exist (only its presence matters here)
        _ = dsg.filter_by_attrs(
            cf_role='profile_id'
        )[0]

        station_var = dsg.filter_by_attrs(
            cf_role='timeseries_id'
        )[0]
        station_data = normalize_array(station_var)
        if not isinstance(station_data, str) and len(station_data.shape) > 0:
            # Multiple stations require exactly one instance index variable
            index_vars = dsg.filter_by_attrs(
                instance_dimension=lambda x: x is not None
            )
            assert len(index_vars) == 1
            assert index_vars[0].instance_dimension in dsg.dimensions  # Station dimension
    except BaseException:
        if strict is True:
            raise
        return False
    return True
def is_mine(cls, dsg, strict=False):
    """Return True when *dsg* matches the ragged timeseries-profile layout.

    Parameters
    ----------
    dsg : dataset-like object exposing ``featureType``, axis lookups and
        ``filter_by_attrs`` (see sibling ``is_mine`` implementations).
    strict : bool, optional
        When True, re-raise the first failing check instead of returning
        False. Defaults to False so existing callers are unaffected.
    """
    try:
        assert dsg.featureType.lower() == 'timeseriesprofile'
        assert len(dsg.t_axes()) >= 1
        assert len(dsg.x_axes()) >= 1
        assert len(dsg.y_axes()) >= 1
        assert len(dsg.z_axes()) >= 1
        o_index_vars = dsg.filter_by_attrs(
            sample_dimension=lambda x: x is not None)
        assert len(o_index_vars) == 1
        assert o_index_vars[0].sample_dimension in dsg.dimensions  # Sample dimension

        svar = dsg.filter_by_attrs(cf_role='timeseries_id')[0]
        sdata = normalize_array(svar)
        # BUG FIX: normalize_array may return a plain str (single station),
        # which has no .shape — guard it before checking dimensionality.
        if not isinstance(sdata, str) and len(sdata.shape) > 0:
            r_index_vars = dsg.filter_by_attrs(
                instance_dimension=lambda x: x is not None)
            assert len(r_index_vars) == 1
            assert r_index_vars[0].instance_dimension in dsg.dimensions  # Station dimension
    except BaseException:
        # BUG FIX: previously only AssertionError was caught, so a dataset
        # with no featureType attribute (AttributeError) or no id variable
        # (IndexError) crashed instead of simply not matching.
        if strict is True:
            raise
        return False
    return True
def test_normalization_of_string_arrays_netcdf4(self):
    """Round-trip every NETCDF4 string storage flavor through a file and
    check that ``normalize_array`` reconstructs the original strings.

    Covers vlen str, numpy unicode scalars, '<U1' and 'S1' character
    arrays, both as single values and as arrays of strings.
    """
    thestr = 'bosadfsdfkljskfusdiofu987987987om'
    with nc4.Dataset(self.fp, 'w', format="NETCDF4") as ncd:
        dimsize = len(thestr)
        ncd.createDimension('n', dimsize)

        # Single str (no dimension)
        ncd.createVariable('single_str', str)
        # BUG FIX: np.unicode_ was removed in NumPy 2.0; np.str_ is the
        # same type under its supported name.
        ncd.createVariable('single_unicode_', np.str_)
        ncd.createVariable('single_U', '<U1')
        ncd.createVariable('single_S', 'S1', ('n',))
        for k, v in ncd.variables.items():
            if k.startswith('single_'):
                if v.dimensions:
                    v[:] = nc4.stringtoarr(thestr, dimsize)
                else:
                    v[0] = thestr

        # Array of str
        ncd.createVariable('many_str', str, ('n',))
        ncd.createVariable('many_unicode_', np.str_, ('n',))
        ncd.createVariable('many_U', '<U1', ('n',))
        ncd.createVariable('many_S', 'S1', ('n', 'n',))
        for k, v in ncd.variables.items():
            if k.startswith('many_'):
                if len(v.dimensions) > 1:
                    v[:, :] = np.tile(nc4.stringtoarr(thestr, dimsize), dimsize)
                else:
                    v[:] = np.tile(thestr, dimsize)

    with nc4.Dataset(self.fp) as ncd:
        assert normalize_array(ncd.variables['single_str']) == thestr
        assert normalize_array(ncd.variables['single_unicode_']) == thestr
        assert normalize_array(ncd.variables['single_U']) == thestr
        assert normalize_array(ncd.variables['single_S']) == thestr
        assert np.all(normalize_array(ncd.variables['many_str']) == [thestr] * len(thestr))
        assert np.all(normalize_array(ncd.variables['many_unicode_']) == [thestr] * len(thestr))
        assert np.all(normalize_array(ncd.variables['many_U']) == [thestr] * len(thestr))
        assert np.all(normalize_array(ncd.variables['many_S']) == [thestr] * len(thestr))
def is_mine(cls, dsg, strict=False):
    """Return True when *dsg* matches the incomplete multidimensional
    trajectory layout (single trajectory, or trajectory x obs arrays).

    Parameters
    ----------
    dsg : dataset-like object exposing ``featureType``, axis lookups and
        ``filter_by_attrs``.
    strict : bool, optional
        When True, re-raise the first failing check instead of returning
        False (added for consistency with the other ``is_mine``
        implementations; defaults to False so callers are unaffected).
    """
    try:
        tvars = dsg.filter_by_attrs(cf_role='trajectory_id')
        assert len(tvars) == 1
        assert dsg.featureType.lower() == 'trajectory'
        assert len(dsg.t_axes()) == 1
        assert len(dsg.x_axes()) == 1
        assert len(dsg.y_axes()) == 1
        assert len(dsg.z_axes()) == 1

        # Allow for string variables
        tvar = tvars[0]
        # 0 = single
        # 1 = array of strings/ints/bytes/etc
        # 2 = array of character arrays
        assert 0 <= len(tvar.dimensions) <= 2

        ts = normalize_array(tvar)
        is_single = ts.size == 1

        t = dsg.t_axes()[0]
        x = dsg.x_axes()[0]
        y = dsg.y_axes()[0]
        z = dsg.z_axes()[0]
        assert t.dimensions == x.dimensions == y.dimensions == z.dimensions
        assert t.size == x.size == y.size == z.size

        if is_single:
            assert len(t.dimensions) == 1
            time_dim = dsg.dimensions[t.dimensions[0]]
            for dv in dsg.data_vars():
                assert len(dv.dimensions) == 1
                assert time_dim.name in dv.dimensions
                assert dv.size == time_dim.size
        else:
            # This `time` being two dimensional is unique to
            # IncompleteMultidimensionalTrajectory
            assert len(t.dimensions) == 2
            t_dim = dsg.dimensions[t.dimensions[0]]
            o_dim = dsg.dimensions[t.dimensions[1]]
            for dv in dsg.data_vars():
                assert dv.size == t.size
                assert len(dv.dimensions) == 2
                assert t_dim.name in dv.dimensions
                assert o_dim.name in dv.dimensions
                assert dv.size == t_dim.size * o_dim.size
    except BaseException:
        if strict is True:
            raise
        return False
    return True
def is_mine(cls, dsg, strict=False):
    """Return True when *dsg* matches the incomplete multidimensional
    profile layout (single profile, or profile x z arrays).

    Parameters
    ----------
    dsg : dataset-like object exposing ``featureType``, axis lookups and
        ``filter_by_attrs``.
    strict : bool, optional
        When True, re-raise the first failing check instead of returning
        False (added for consistency with the other ``is_mine``
        implementations; defaults to False so callers are unaffected).
    """
    try:
        pvars = dsg.filter_by_attrs(cf_role='profile_id')
        assert len(pvars) == 1
        assert dsg.featureType.lower() == 'profile'
        assert len(dsg.t_axes()) == 1
        assert len(dsg.x_axes()) == 1
        assert len(dsg.y_axes()) == 1
        assert len(dsg.z_axes()) == 1

        # Allow for string variables
        pvar = pvars[0]
        # 0 = single
        # 1 = array of strings/ints/bytes/etc
        # 2 = array of character arrays
        assert 0 <= len(pvar.dimensions) <= 2

        ps = normalize_array(pvar)
        is_single = ps.size == 1

        t = dsg.t_axes()[0]
        x = dsg.x_axes()[0]
        y = dsg.y_axes()[0]
        z = dsg.z_axes()[0]
        assert len(z.dimensions) == 1
        z_dim = dsg.dimensions[z.dimensions[0]]

        if is_single:
            assert t.size == 1
            assert x.size == 1
            assert y.size == 1
            for dv in dsg.data_vars():
                assert len(dv.dimensions) == 1
                assert z_dim.name in dv.dimensions
                assert dv.size == z_dim.size
        else:
            assert t.size == pvar.size
            assert x.size == pvar.size
            assert y.size == pvar.size
            p_dim = dsg.dimensions[pvar.dimensions[0]]
            for dv in dsg.data_vars():
                assert len(dv.dimensions) == 2
                assert z_dim.name in dv.dimensions
                assert p_dim.name in dv.dimensions
                assert dv.size == z_dim.size * p_dim.size
    except BaseException:
        if strict is True:
            raise
        return False
    return True
def to_dataframe(self, clean_cols=True, clean_rows=True):
    """Flatten a contiguous ragged trajectory-profile file into a pandas
    DataFrame with one row per sample (obs).

    Parameters
    ----------
    clean_cols : bool
        Drop columns that contain no data at all.
    clean_rows : bool
        Drop rows where every data variable is masked.

    Returns
    -------
    pandas.DataFrame with columns t, x, y, z, trajectory, profile,
    distance plus one column per data/ancillary variable.
    """
    # The index variable (trajectory_index) is identified by having an
    # attribute with name of instance_dimension whose value is the instance
    # dimension name (trajectory in this example). The index variable must
    # have the profile dimension as its sole dimension, and must be type
    # integer. Each value in the index variable is the zero-based trajectory
    # index that the profile belongs to i.e. profile p belongs to trajectory
    # i=trajectory_index(p), as in section H.2.5.
    r_index_var = self.filter_by_attrs(instance_dimension=lambda x: x is not None)[0]
    p_dim = self.dimensions[r_index_var.dimensions[0]]       # Profile dimension
    r_dim = self.dimensions[r_index_var.instance_dimension]  # Trajectory dimension

    # The count variable (row_size) contains the number of elements for
    # each profile, which must be written contiguously. The count variable
    # is identified by having an attribute with name sample_dimension whose
    # value is the sample dimension (obs in this example) being counted. It
    # must have the profile dimension as its sole dimension, and must be
    # type integer
    o_index_var = self.filter_by_attrs(sample_dimension=lambda x: x is not None)[0]
    o_dim = self.dimensions[o_index_var.sample_dimension]  # Sample dimension

    try:
        rvar = self.filter_by_attrs(cf_role='trajectory_id')[0]
        traj_indexes = normalize_array(rvar)
        assert traj_indexes.size == r_dim.size
    except BaseException:
        logger.warning('Could not pull trajectory values from a variable with "cf_role=trajectory_id", using a computed range.')
        traj_indexes = np.arange(r_dim.size)
    try:
        pvar = self.filter_by_attrs(cf_role='profile_id')[0]
        profile_indexes = normalize_array(pvar)
        assert profile_indexes.size == p_dim.size
    except BaseException:
        logger.warning('Could not pull profile values from a variable with "cf_role=profile_id", using a computed range.')
        profile_indexes = np.arange(p_dim.size)  # Profile dimension

    # When more than one candidate axis variable exists, pick the one
    # dimensioned by the profile dimension (sample dimension for Z) whose
    # 'axis' attribute matches.
    tvars = self.t_axes()
    if len(tvars) > 1:
        tvar = [
            v for v in self.t_axes()
            if v.dimensions == (p_dim.name,) and getattr(v, 'axis', '').lower() == 't'
        ][0]
    else:
        tvar = tvars[0]

    xvars = self.x_axes()
    if len(xvars) > 1:
        xvar = [
            v for v in self.x_axes()
            if v.dimensions == (p_dim.name,) and getattr(v, 'axis', '').lower() == 'x'
        ][0]
    else:
        xvar = xvars[0]

    yvars = self.y_axes()
    if len(yvars) > 1:
        yvar = [
            v for v in self.y_axes()
            if v.dimensions == (p_dim.name,) and getattr(v, 'axis', '').lower() == 'y'
        ][0]
    else:
        yvar = yvars[0]

    zvars = self.z_axes()
    if len(zvars) > 1:
        zvar = [
            v for v in self.z_axes()
            if v.dimensions == (o_dim.name,) and getattr(v, 'axis', '').lower() == 'z'
        ][0]
    else:
        zvar = zvars[0]

    # Expand each per-profile value out to the sample (obs) dimension
    p = np.ma.masked_all(o_dim.size, dtype=profile_indexes.dtype)
    r = np.ma.masked_all(o_dim.size, dtype=traj_indexes.dtype)
    t = np.ma.masked_all(o_dim.size, dtype=tvar.dtype)
    x = np.ma.masked_all(o_dim.size, dtype=xvar.dtype)
    y = np.ma.masked_all(o_dim.size, dtype=yvar.dtype)
    si = 0
    for i in np.arange(profile_indexes.size):
        ei = si + o_index_var[i]
        p[si:ei] = profile_indexes[i]
        r[si:ei] = traj_indexes[r_index_var[i]]
        t[si:ei] = tvar[i]
        x[si:ei] = xvar[i]
        y[si:ei] = yvar[i]
        si = ei

    t_mask = False
    tfill = get_fill_value(tvar)
    if tfill is not None:
        t_mask = np.copy(np.ma.getmaskarray(t))
        t[t_mask] = 1
    t = np.ma.MaskedArray(
        nc4.num2date(t, tvar.units, getattr(tvar, 'calendar', 'standard'))
    )
    # Patch the time variable back to its original mask, since num2date
    # breaks any missing/fill values
    t[t_mask] = np.ma.masked

    # X and Y
    x = generic_masked(x, minv=-180, maxv=180).round(5)
    y = generic_masked(y, minv=-90, maxv=90).round(5)

    # Distance
    d = np.ma.zeros(o_dim.size, dtype=np.float64)
    d[1:] = great_distance(start_latitude=y[0:-1], end_latitude=y[1:], start_longitude=x[0:-1], end_longitude=x[1:])['distance']
    d = generic_masked(np.cumsum(d), minv=0).round(2)

    # Sample dimension
    z = generic_masked(zvar[:].flatten(), attrs=self.vatts(zvar.name)).round(5)

    df_data = {
        't': t,
        'x': x,
        'y': y,
        'z': z,
        'trajectory': r,
        'profile': p,
        'distance': d
    }

    building_index_to_drop = np.ones(o_dim.size, dtype=bool)
    extract_vars = list(set(self.data_vars() + self.ancillary_vars()))
    for dvar in extract_vars:
        # Profile dimensions
        if dvar.dimensions == (p_dim.name,):
            vdata = np.ma.masked_all(o_dim.size, dtype=dvar.dtype)
            si = 0
            for j in np.arange(profile_indexes.size):
                ei = si + o_index_var[j]
                vdata[si:ei] = dvar[j]
                si = ei
        # Sample dimensions
        elif dvar.dimensions == (o_dim.name,):
            vdata = generic_masked(dvar[:].flatten(), attrs=self.vatts(dvar.name)).round(3)
        else:
            logger.warning("Skipping variable {}... it didn't seem like a data variable".format(dvar))
            # BUG FIX: without this `continue` the loop fell through and
            # reused the previous iteration's vdata (or raised
            # UnboundLocalError on the first iteration).
            continue
        building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa
        df_data[dvar.name] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def is_mine(cls, dsg, strict=False):
    """Return True when *dsg* matches the incomplete multidimensional
    profile layout; with ``strict`` the first failing check is re-raised.
    """
    try:
        id_vars = dsg.filter_by_attrs(cf_role='profile_id')
        assert len(id_vars) == 1
        assert dsg.featureType.lower() == 'profile'
        for axes in (dsg.t_axes(), dsg.x_axes(), dsg.y_axes(), dsg.z_axes()):
            assert len(axes) >= 1

        # Allow for string variables:
        # 0 dims = single value, 1 dim = array of strings/ints/bytes,
        # 2 dims = array of character arrays
        id_var = id_vars[0]
        assert 0 <= len(id_var.dimensions) <= 2

        t = dsg.t_axes()[0]
        x = dsg.x_axes()[0]
        y = dsg.y_axes()[0]
        z = dsg.z_axes()[0]
        assert len(z.dimensions) == 1
        z_dim = dsg.dimensions[z.dimensions[0]]

        normalized = normalize_array(id_var)
        if id_var.ndim == 0:
            single = True
        elif id_var.ndim == 2:
            single = False
        else:
            # Non-dimensioned string variable, or a 1D string-typed array,
            # both identify a single profile
            single = (
                isinstance(normalized, six.string_types) or
                (id_var.ndim == 1 and hasattr(normalized, 'dtype') and normalized.dtype.kind in ['U', 'S'])
            )

        if single:
            assert t.size == 1
            assert x.size == 1
            assert y.size == 1
            for dv in dsg.data_vars():
                assert len(dv.dimensions) == 1
                assert z_dim.name in dv.dimensions
                assert dv.size == z_dim.size
        else:
            assert t.size == id_var.size
            assert x.size == id_var.size
            assert y.size == id_var.size
            p_dim = dsg.dimensions[id_var.dimensions[0]]
            for dv in dsg.data_vars():
                # dimensioned by profile or profile, z
                assert len(dv.dimensions) in [1, 2]
                assert z_dim.name in dv.dimensions or p_dim.name in dv.dimensions
                assert dv.size in [z_dim.size, p_dim.size, z_dim.size * p_dim.size]
    except BaseException:
        if strict is True:
            raise
        return False
    return True
def to_dataframe(self, clean_cols=True, clean_rows=True):
    """Flatten an incomplete multidimensional profile file into a pandas
    DataFrame with one row per (profile, z) pair.

    Parameters
    ----------
    clean_cols : bool
        Drop columns that contain no data at all.
    clean_rows : bool
        Drop rows where every data variable is masked.
    """
    zvar = self.z_axes()[0]
    zs = len(self.dimensions[zvar.dimensions[0]])

    # Profiles
    pvar = self.filter_by_attrs(cf_role='profile_id')[0]
    try:
        p = normalize_array(pvar)
    except ValueError:
        # BUG FIX: np.integer is an abstract scalar type and is not a
        # valid dtype argument on modern NumPy; the builtin int maps to
        # the default integer dtype.
        p = np.asarray(list(range(len(pvar))), dtype=int)
    ps = p.size
    p = p.repeat(zs)
    logger.debug(['profile data size: ', p.size])

    # Z
    z = generic_masked(zvar[:], attrs=self.vatts(zvar.name)).round(5)
    try:
        z = np.tile(z, ps)
    except ValueError:
        # Already two dimensional (per-profile z values)
        z = z.flatten()
    logger.debug(['z data size: ', z.size])

    # T
    tvar = self.t_axes()[0]
    t = nc4.num2date(tvar[:], tvar.units, getattr(tvar, 'calendar', 'standard'))
    if isinstance(t, datetime):
        # Size one
        t = np.array([t.isoformat()], dtype='datetime64')
    t = t.repeat(zs)
    logger.debug(['time data size: ', t.size])

    # X
    xvar = self.x_axes()[0]
    x = generic_masked(xvar[:].repeat(zs), attrs=self.vatts(xvar.name)).round(5)
    logger.debug(['x data size: ', x.size])

    # Y
    yvar = self.y_axes()[0]
    y = generic_masked(yvar[:].repeat(zs), attrs=self.vatts(yvar.name)).round(5)
    logger.debug(['y data size: ', y.size])

    # Distance (cumulative great-circle distance between successive points)
    d = np.ma.zeros(y.size, dtype=np.float64)
    d[1:] = great_distance(start_latitude=y[0:-1], end_latitude=y[1:], start_longitude=x[0:-1], end_longitude=x[1:])['distance']
    d = generic_masked(np.cumsum(d), minv=0).round(2)
    logger.debug(['distance data size: ', d.size])

    df_data = {'t': t, 'x': x, 'y': y, 'z': z, 'profile': p, 'distance': d}

    building_index_to_drop = np.ones(t.size, dtype=bool)
    extract_vars = list(set(self.data_vars() + self.ancillary_vars()))
    for dvar in extract_vars:
        vdata = np.ma.fix_invalid(
            np.ma.MaskedArray(dvar[:].round(3).flatten()))
        # A row is droppable only if every data variable is masked there
        building_index_to_drop = (building_index_to_drop == True) & (
            vdata.mask == True)  # noqa
        df_data[dvar.name] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def is_mine(cls, dsg, strict=False):
    """Return True when *dsg* matches the incomplete multidimensional
    trajectory layout; with ``strict`` the first failing check is
    re-raised.
    """
    try:
        id_vars = dsg.filter_by_attrs(cf_role='trajectory_id')
        assert len(id_vars) == 1
        assert dsg.featureType.lower() == 'trajectory'
        for axes in (dsg.t_axes(), dsg.x_axes(), dsg.y_axes(), dsg.z_axes()):
            assert len(axes) >= 1

        # Allow for string variables:
        # 0 dims = single value, 1 dim = array of strings/ints/bytes,
        # 2 dims = array of character arrays
        id_var = id_vars[0]
        assert 0 <= len(id_var.dimensions) <= 2

        normalized = normalize_array(id_var)
        if id_var.ndim == 0:
            single = True
        elif id_var.ndim == 2:
            single = False
        else:
            # Non-dimensioned string variable, or a 1D string-typed array,
            # both identify a single trajectory
            single = (
                isinstance(normalized, six.string_types) or
                (id_var.ndim == 1 and hasattr(normalized, 'dtype') and normalized.dtype.kind in ['U', 'S'])
            )

        t = dsg.t_axes()[0]
        x = dsg.x_axes()[0]
        y = dsg.y_axes()[0]
        z = dsg.z_axes()[0]
        assert t.dimensions == x.dimensions == y.dimensions == z.dimensions
        assert t.size == x.size == y.size == z.size

        if single:
            assert len(t.dimensions) == 1
            t_dim = dsg.dimensions[t.dimensions[0]]
            for dv in dsg.data_vars():
                assert len(dv.dimensions) == 1
                assert t_dim.name in dv.dimensions
                assert dv.size == t_dim.size
        else:
            # A two dimensional `time` is unique to
            # IncompleteMultidimensionalTrajectory
            assert len(t.dimensions) == 2
            t_dim = dsg.dimensions[t.dimensions[0]]
            o_dim = dsg.dimensions[t.dimensions[1]]
            for dv in dsg.data_vars():
                assert dv.size == t.size
                assert len(dv.dimensions) == 2
                assert t_dim.name in dv.dimensions
                assert o_dim.name in dv.dimensions
                assert dv.size == t_dim.size * o_dim.size
    except BaseException:
        if strict is True:
            raise
        return False
    return True
def to_dataframe(self, clean_cols=True, clean_rows=True):
    """Flatten an incomplete multidimensional trajectory file into a
    pandas DataFrame with one row per observation.

    Parameters
    ----------
    clean_cols : bool
        Drop columns that contain no data at all.
    clean_rows : bool
        Drop rows where every data variable is masked.
    """
    # Z
    zvar = self.z_axes()[0]
    z = np.ma.fix_invalid(np.ma.MaskedArray(zvar[:]))
    z = z.flatten().round(5)
    logger.debug(['z data size: ', z.size])

    # T
    tvar = self.t_axes()[0]
    t = np.ma.MaskedArray(nc4.num2date(tvar[:], tvar.units, getattr(tvar, 'calendar', 'standard'))).flatten()
    # Patch the time variable back to its original mask, since num2date
    # breaks any missing/fill values
    if hasattr(tvar[0], 'mask'):
        t.mask = tvar[:].mask
    logger.debug(['time data size: ', t.size])

    # X
    xvar = self.x_axes()[0]
    x = np.ma.fix_invalid(np.ma.MaskedArray(xvar[:])).flatten().round(5)
    logger.debug(['x data size: ', x.size])

    # Y
    yvar = self.y_axes()[0]
    y = np.ma.fix_invalid(np.ma.MaskedArray(yvar[:])).flatten().round(5)
    logger.debug(['y data size: ', y.size])

    # Trajectories
    pvar = self.filter_by_attrs(cf_role='trajectory_id')[0]
    try:
        p = normalize_array(pvar)
    except BaseException:
        logger.exception('Could not pull trajectory values from the variable, using indexes.')
        # BUG FIX: np.integer is an abstract scalar type and is not a
        # valid dtype argument on modern NumPy; the builtin int maps to
        # the default integer dtype.
        p = np.asarray(list(range(len(pvar))), dtype=int)

    # The Dimension that the trajectory id variable doesn't have is what
    # the trajectory data needs to be repeated by
    dim_diff = self.dimensions[list(set(tvar.dimensions).difference(set(pvar.dimensions)))[0]]
    if dim_diff:
        p = p.repeat(dim_diff.size)
    logger.debug(['trajectory data size: ', p.size])

    # Distance (cumulative great-circle distance between successive points)
    d = np.append([0], great_distance(start_latitude=y[0:-1], end_latitude=y[1:], start_longitude=x[0:-1], end_longitude=x[1:])['distance'])
    d = np.ma.fix_invalid(np.ma.MaskedArray(np.cumsum(d)).astype(np.float64).round(2))
    logger.debug(['distance data size: ', d.size])

    df_data = {
        't': t,
        'x': x,
        'y': y,
        'z': z,
        'trajectory': p,
        'distance': d
    }

    building_index_to_drop = np.ones(t.size, dtype=bool)
    extract_vars = list(set(self.data_vars() + self.ancillary_vars()))
    for dvar in extract_vars:
        vdata = np.ma.fix_invalid(np.ma.MaskedArray(dvar[:].round(3).flatten()))
        # A row is droppable only if every data variable is masked there
        building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa
        df_data[dvar.name] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def to_dataframe(self, clean_cols=False, clean_rows=False):
    """Flatten an orthogonal multidimensional timeseries file into a
    pandas DataFrame with one row per (station, time) pair.

    clean_cols: drop columns that contain no data at all.
    clean_rows: currently a no-op — the row-dropping logic below is
    commented out (see NOTE near the bottom).
    """
    # Don't pass around the attributes store them in the class
    # T
    tvar = self.t_axes()[0]
    t = nc4.num2date(tvar[:], tvar.units, getattr(tvar, 'calendar', 'standard'))
    if isinstance(t, datetime):
        # Size one: num2date returned a scalar datetime, wrap it back
        # into a one-element array
        t = np.array([t.isoformat()], dtype='datetime64')
    logger.debug(['time data size: ', t.size])

    svar = self.filter_by_attrs(cf_role='timeseries_id')[0]
    # Stations
    # TODO: Make sure there is a test for a file with multiple time variables
    try:
        s = normalize_array(svar)
    except ValueError:
        # Fall back to positional indexes when the id variable can't be
        # normalized.
        # NOTE(review): np.integer is an abstract type — as a dtype this is
        # deprecated/removed on newer NumPy; confirm against the NumPy
        # version this project pins.
        s = np.asarray(list(range(len(svar))), dtype=np.integer)
    # Repeat each station id across every timestep
    s = np.repeat(s, t.size)
    logger.debug(['station data size: ', s.size])

    # X
    xvar = self.x_axes()[0]
    x = generic_masked(xvar[:].repeat(t.size), attrs=self.vatts(xvar.name)).round(5)
    logger.debug(['x data size: ', x.size])

    # Y
    yvar = self.y_axes()[0]
    y = generic_masked(yvar[:].repeat(t.size), attrs=self.vatts(yvar.name)).round(5)
    logger.debug(['y data size: ', y.size])

    # Z
    zvar = self.z_axes()[0]
    z = generic_masked(zvar[:].repeat(t.size), attrs=self.vatts(zvar.name))
    logger.debug(['z data size: ', z.size])

    # now repeat t per station
    # figure out if this is a single-station file
    # do this by checking the dimensions of the Z var
    if zvar.ndim == 1:
        t = np.repeat(t, len(svar))

    df_data = {
        't': t,
        'x': x,
        'y': y,
        'z': z,
        'station': s,
    }

    # NOTE(review): row-drop bookkeeping is disabled; kept for reference.
    #building_index_to_drop = np.ones(t.size, dtype=bool)

    # Everything that is not a coordinate/id variable is a data variable
    extract_vars = copy(self.variables)
    del extract_vars[svar.name]
    del extract_vars[xvar.name]
    del extract_vars[yvar.name]
    del extract_vars[zvar.name]
    del extract_vars[tvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):
        # Skip variables that can't map 1:1 onto the (station, time) rows
        if dvar[:].flatten().size > t.size:
            logger.warning("Variable {} is not the correct size, skipping.".format(dnam))
            continue

        vdata = generic_masked(dvar[:].flatten(), attrs=self.vatts(dnam))
        if vdata.size == 1:
            # Collapse a size-one array to a scalar so pandas broadcasts it
            vdata = vdata[0]
        #building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa
        try:
            # Variables with CF time units ("<units> since <epoch>") are
            # decoded to datetimes
            if re.match(r'.* since .*', dvar.units):
                vdata = nc4.num2date(vdata[:], dvar.units, getattr(dvar, 'calendar', 'standard'))
        except AttributeError:
            # Variable has no units attribute — leave values as-is
            pass
        df_data[dnam] = vdata
        #logger.info('{} - {}'.format(dnam, vdata.shape))

    # Build column-by-column so scalar values broadcast to the index length
    df = pd.DataFrame()
    for k, v in df_data.items():
        df[k] = v

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    # NOTE(review): intentionally disabled along with the bookkeeping above;
    # clean_rows currently has no effect.
    #if clean_rows:
    #    df = df.iloc[~building_index_to_drop]

    return df